AND Private Git Repository - predictops.git/blobdiff - predictops/learn/preprocessing.py
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
Standardization and one hot encoding
[predictops.git] / predictops / learn / preprocessing.py
index 5400d1d39f1135ce5e2abcfec2541201cf5d8ed6..4197b8fed13cb4137e33655753976532e42987a2 100644 (file)
@@ -1,9 +1,12 @@
 from configparser import ConfigParser
 from configparser import ConfigParser
+from csv import DictReader
 from datetime import datetime, timedelta
 from itertools import chain
 from logging import getLogger
 from logging.config import fileConfig
 from datetime import datetime, timedelta
 from itertools import chain
 from logging import getLogger
 from logging.config import fileConfig
+from os import listdir
 from pathlib import Path
 from pathlib import Path
+from sklearn import preprocessing
 
 import numpy as np
 import pandas as pd
 
 import numpy as np
 import pandas as pd
@@ -46,6 +49,23 @@ class Preprocessing:
         else:
             self._features = set(chain.from_iterable([tuple(u.keys())
                                                       for u in [*dict_features.values()]]))
         else:
             self._features = set(chain.from_iterable([tuple(u.keys())
                                                       for u in [*dict_features.values()]]))
+        feature_files = Path.cwd() / 'config' / 'features'
+        self._features = {feat : {'numerical': False} for feat in self._features}
+        for feature_file in listdir(feature_files):
+            if feature_file.endswith('csv'):
+                with open(feature_files / feature_file , "r") as f:
+                    reader = DictReader(f, delimiter=',')
+                    typed_names = {row['name']: row['type'] for row in reader}
+                for feature in self._features:
+                    if feature.split('_')[0] in typed_names:
+                        self._features[feature]['type'] = int(typed_names[feature.split('_')[0]])
+            elif feature_file.endswith('cfg'):
+                config = ConfigParser()
+                config.read(feature_files / feature_file)
+                for section in config:
+                    if config.has_option(section, 'numerical'):
+                        self._features[section]['numerical'] = config[section].getboolean('numerical')
+
 
 
     @property
 
 
     @property
@@ -115,6 +135,75 @@ class Preprocessing:
         return self._full_dict
 
 
         return self._full_dict
 
 
+    def _fill_nan(self):
+        '''
+        Fill NaN values, either by propagation or by interpolation (linear or
+        splines), then keep only the rows belonging to self._datetimes.
+
+        Numerical columns (type 1, or type 3 flagged as numerical) are filled
+        according to the 'fill_method' option of the PREPROCESSING section:
+        'propagate' (forward fill), 'linear' or 'spline' (the spline order is
+        read from the 'order' option). Any other value leaves the numerical
+        NaN values as they are and logs a warning.
+
+        Categorical columns (type 2, or type 3 not flagged as numerical) are
+        always forward filled. Features that have no 'type' entry are left
+        untouched.
+        '''
+        logger.info("Filling NaN numerical values in the feature dataframe")
+        # We interpolate (linearly or with splines) only numerical columns.
+        # .get('type') is used so that a feature without a 'type' entry is
+        # skipped instead of raising a KeyError
+        numerical_columns = [k for k, v in self._features.items()
+                   if v.get('type') == 1
+                   or (v.get('type') == 3 and v['numerical'])]
+        # The interpolation
+        fill_method = self._config['PREPROCESSING']['fill_method']
+        if numerical_columns:
+            numerical = self._dataframe[numerical_columns]
+            if fill_method == 'propagate':
+                self._dataframe[numerical_columns] = numerical.ffill()
+            elif fill_method == 'linear':
+                self._dataframe[numerical_columns] = numerical.interpolate()
+            elif fill_method == 'spline':
+                self._dataframe[numerical_columns] = numerical.interpolate(
+                    method='spline',
+                    order=self._config['PREPROCESSING'].getint('order'))
+            else:
+                logger.warning("Unknown fill method '%s': numerical NaN "
+                               "values are left as is", fill_method)
+
+        # For the categorical columns, NaN values are filled by duplicating
+        # the last known value (forward fill method)
+        logger.info("Filling NaN categorical values in the feature dataframe")
+        categorical_columns = [k for k, v in self._features.items()
+                   if v.get('type') == 2
+                   or (v.get('type') == 3 and not v['numerical'])]
+        if categorical_columns:
+            self._dataframe[categorical_columns] =\
+                self._dataframe[categorical_columns].ffill()
+
+        # Uncomment this line to fill NaN values at the beginning of the
+        # dataframe. This may not be a good idea, especially for features
+        # that are available only for recent years, e.g., air quality
+        #self._dataframe = self._dataframe.bfill()
+
+        # Dropping rows that are not related to our datetime window (start/
+        # step / end). The index is iterated directly: there is no need to
+        # transpose the whole dataframe to enumerate its row labels
+        self._dataframe = self._dataframe.drop([k.to_pydatetime()
+                                               for k in self._dataframe.index
+                                               if k not in self._datetimes])
+
+
+    def _standardize(self):
+        '''
+        Standardize the numerical features of the feature dataframe, in place.
+
+        The numerical columns (type 1, or type 3 flagged as numerical) are
+        replaced by the output of sklearn's preprocessing.scale. Features that
+        have no 'type' entry are ignored, and nothing is done when no
+        numerical column is found.
+        '''
+        logger.info("Standardizing numerical values in the feature dataframe")
+        # We operate only on numerical columns. .get('type') is used so that
+        # a feature without a 'type' entry is skipped instead of raising a
+        # KeyError
+        numerical_columns = [k for k, v in self._features.items()
+                   if v.get('type') == 1
+                   or (v.get('type') == 3 and v['numerical'])]
+        if not numerical_columns:
+            # Nothing to scale: avoid handing an empty frame to the scaler
+            return
+        self._dataframe[numerical_columns] = preprocessing.scale(self._dataframe[numerical_columns])
+
+
+    def _one_hot_encoding(self):
+        '''
+        Apply a one hot encoding for category features
+
+        Each categorical column (type 2, or type 3 not flagged as numerical)
+        is replaced by its indicator columns, named '<column>_<value>'. All
+        the other columns of the feature dataframe are kept, so that the
+        numerical features are not lost by this step. Features that have no
+        'type' entry are ignored, and nothing is done when no categorical
+        column is found.
+        '''
+        logger.info("One hot encoding for categorical feature")
+        # .get('type') is used so that a feature without a 'type' entry is
+        # skipped instead of raising a KeyError
+        categorical_columns = [k for k, v in self._features.items()
+                   if v.get('type') == 2
+                   or (v.get('type') == 3 and not v['numerical'])]
+        if not categorical_columns:
+            return
+
+        # Complete disjunctive coding of the qualitative variables. With the
+        # 'columns' argument, get_dummies encodes only these columns (using
+        # the column name as prefix) and keeps the remaining ones
+        self._dataframe = pd.get_dummies(self._dataframe,
+                                         columns=categorical_columns)
+        logger.debug("Feature dataframe after one hot encoding:\n%s",
+                     self._dataframe.head())
+
+
 
     @property
     def dataframe(self):
 
     @property
     def dataframe(self):
@@ -125,20 +214,12 @@ class Preprocessing:
             logger.info("Creating feature dataframe from feature dictionary")
             self._dataframe = pd.DataFrame.from_dict(self.full_dict,
                                                      orient='index')
             logger.info("Creating feature dataframe from feature dictionary")
             self._dataframe = pd.DataFrame.from_dict(self.full_dict,
                                                      orient='index')
-            logger.info("Filling NaN values in the feature dataframe")
-
-            if self._config['PREPROCESSING']['fill_method'] == 'propagate':
-                self._dataframe = self._dataframe.fillna(method='ffill')
-            elif self._config['PREPROCESSING']['fill_method'] == 'linear':
-                self._dataframe = self._dataframe.interpolate()
-            elif self._config['PREPROCESSING']['fill_method'] == 'spline':
-                self._dataframe = self._dataframe.interpolate(method='spline',
-                                                              order=self._config['PREPROCESSING'].getint('order'))
-            self._dataframe = self._dataframe.fillna(method='bfill')
-
-            self._dataframe = self._dataframe.drop([k.to_pydatetime()
-                                                   for k in self._dataframe.T
-                                                   if k not in self._datetimes])
+            # Dealing with NaN values
+            self._fill_nan()
+            # Normalizing numerical values
+            self._standardize()
+            # Dealing with categorical features
+            self._one_hot_encoding()
         return self._dataframe
 
 
         return self._dataframe