Standardization and one hot encoding

[predictops.git] / predictops / learn / preprocessing.py
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py

index 5400d1d39f1135ce5e2abcfec2541201cf5d8ed6..4197b8fed13cb4137e33655753976532e42987a2 100644 (file)
--- a/predictops/learn/preprocessing.py
+++ b/predictops/learn/preprocessing.py
@@ -1,9 +1,12 @@
  from configparser import ConfigParser
  from configparser import ConfigParser
+from csv import DictReader
  from datetime import datetime, timedelta
  from itertools import chain
  from logging import getLogger
  from logging.config import fileConfig
  from datetime import datetime, timedelta
  from itertools import chain
  from logging import getLogger
  from logging.config import fileConfig
+from os import listdir
  from pathlib import Path
  from pathlib import Path
+from sklearn import preprocessing
  
  import numpy as np
  import pandas as pd
  
  import numpy as np
  import pandas as pd
@@ -46,6 +49,23 @@ class Preprocessing:
          else:
              self._features = set(chain.from_iterable([tuple(u.keys())
                                                        for u in [*dict_features.values()]]))
          else:
              self._features = set(chain.from_iterable([tuple(u.keys())
                                                        for u in [*dict_features.values()]]))
+        feature_files = Path.cwd() / 'config' / 'features'
+        self._features = {feat : {'numerical': False} for feat in self._features}
+        for feature_file in listdir(feature_files):
+            if feature_file.endswith('csv'):
+                with open(feature_files / feature_file , "r") as f:
+                    reader = DictReader(f, delimiter=',')
+                    typed_names = {row['name']: row['type'] for row in reader}
+                for feature in self._features:
+                    if feature.split('_')[0] in typed_names:
+                        self._features[feature]['type'] = int(typed_names[feature.split('_')[0]])
+            elif feature_file.endswith('cfg'):
+                config = ConfigParser()
+                config.read(feature_files / feature_file)
+                for section in config:
+                    if config.has_option(section, 'numerical'):
+                        self._features[section]['numerical'] = config[section].getboolean('numerical')
+
  
  
      @property
  
  
      @property
@@ -115,6 +135,75 @@ class Preprocessing:
          return self._full_dict
  
  
          return self._full_dict
  
  
+    def _fill_nan(self):
+        '''
+        Fill NaN values, either by propagation or by interpolation (linear or splines)
+        '''
+        logger.info("Filling NaN numerical values in the feature dataframe")
+        # We interpolate (linearly or with splines) only numerical columns
+        numerical_columns = [k for k in self._features if self._features[k]['type'] == 1
+                   or (self._features[k]['type'] == 3 and self._features[k]['numerical'])]
+        # The interpolation
+        if self._config['PREPROCESSING']['fill_method'] == 'propagate':
+            self._dataframe[numerical_columns] =\
+                self._dataframe[numerical_columns].fillna(method='ffill')
+        elif self._config['PREPROCESSING']['fill_method'] == 'linear':
+            self._dataframe[numerical_columns] =\
+                self._dataframe[numerical_columns].interpolate()
+        elif self._config['PREPROCESSING']['fill_method'] == 'spline':
+            self._dataframe[numerical_columns] =\
+                self._dataframe[numerical_columns].interpolate(method='spline',
+                     order=self._config['PREPROCESSING'].getint('order'))
+
+        # For the categorical columns, NaN values are filled by duplicating
+        # the last known value (forward fill method)
+        logger.info("Filling NaN categorical values in the feature dataframe")
+        categorical_columns = [k for k in self._features if self._features[k]['type'] == 2
+                   or (self._features[k]['type'] == 3 and not self._features[k]['numerical'])]
+        self._dataframe[categorical_columns] =\
+            self._dataframe[categorical_columns].fillna(method='ffill')
+
+        # Uncomment this line to fill NaN values at the beginning of the
+        # dataframe. This may not be a good idea, especially for features
+        # that are available only for recent years, e.g., air quality
+        #self._dataframe = self._dataframe.fillna(method='bfill')
+
+        # Dropping rows that are not related to our datetime window (start/
+        # step / end)
+        self._dataframe = self._dataframe.drop([k.to_pydatetime()
+                                               for k in self._dataframe.T
+                                               if k not in self._datetimes])
+
+
+    def _standardize(self):
+        '''
+        Normalizing numerical features
+        '''
+        logger.info("Standardizing numerical values in the feature dataframe")
+        # We operate only on numerical columns
+        numerical_columns = [k for k in self._features if self._features[k]['type'] == 1
+                   or (self._features[k]['type'] == 3 and self._features[k]['numerical'])]
+        self._dataframe[numerical_columns] = preprocessing.scale(self._dataframe[numerical_columns])
+
+
+    def _one_hot_encoding(self):
+        '''
+        Apply a one hot encoding for category features
+        '''
+        logger.info("One hot encoding for categorical feature")
+        categorical_columns = [k for k in self._features if self._features[k]['type'] == 2
+                   or (self._features[k]['type'] == 3 and not self._features[k]['numerical'])]
+
+        # On fait un codage disjonctif complet des variables qualitatives
+        df_out = pd.DataFrame()
+        for col in categorical_columns:
+            pd1 = pd.get_dummies(self._dataframe[col],prefix=col)
+            for col1 in pd1.columns:
+                df_out[col1] = pd1[col1]
+        self._dataframe = df_out
+        print(self._dataframe.head())
+
+
  
      @property
      def dataframe(self):
  
      @property
      def dataframe(self):
@@ -125,20 +214,12 @@ class Preprocessing:
              logger.info("Creating feature dataframe from feature dictionary")
              self._dataframe = pd.DataFrame.from_dict(self.full_dict,
                                                       orient='index')
              logger.info("Creating feature dataframe from feature dictionary")
              self._dataframe = pd.DataFrame.from_dict(self.full_dict,
                                                       orient='index')
-            logger.info("Filling NaN values in the feature dataframe")
-
-            if self._config['PREPROCESSING']['fill_method'] == 'propagate':
-                self._dataframe = self._dataframe.fillna(method='ffill')
-            elif self._config['PREPROCESSING']['fill_method'] == 'linear':
-                self._dataframe = self._dataframe.interpolate()
-            elif self._config['PREPROCESSING']['fill_method'] == 'spline':
-                self._dataframe = self._dataframe.interpolate(method='spline',
-                                                              order=self._config['PREPROCESSING'].getint('order'))
-            self._dataframe = self._dataframe.fillna(method='bfill')
-
-            self._dataframe = self._dataframe.drop([k.to_pydatetime()
-                                                   for k in self._dataframe.T
-                                                   if k not in self._datetimes])
+            # Dealing with NaN values
+            self._fill_nan()
+            # Normalizing numerical values
+            self._standardize()
+            # Dealing with categorical features
+            self._one_hot_encoding()
          return self._dataframe
  
  
          return self._dataframe