Refactoring, fin du lever/coucher de soleil, et début de sentinelles

[predictops.git] / predictops / learn / preprocessing.py
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py

index 833e48316bffa1c51affc6885210b71f61b2c1d1..55cffbd2a0e094a610580e1e4dbcdf8adc80de5d 100644 (file)
--- a/predictops/learn/preprocessing.py
+++ b/predictops/learn/preprocessing.py
@@ -1,7 +1,9 @@
+from configparser import ConfigParser
  from itertools import chain
  from logging import getLogger
  from logging.config import fileConfig
  from pathlib import Path
  from itertools import chain
  from logging import getLogger
  from logging.config import fileConfig
  from pathlib import Path
+from sklearn import preprocessing
  
  import numpy as np
  import pandas as pd
  
  import numpy as np
  import pandas as pd
@@ -9,6 +11,7 @@ import pandas as pd
  fileConfig((Path.cwd() / 'config') / 'logging.cfg')
  logger = getLogger()
  
  fileConfig((Path.cwd() / 'config') / 'logging.cfg')
  logger = getLogger()
  
+
  class Preprocessing:
      '''
      Generate a pandas dataframe from a dictionary of features per datetime, which
  class Preprocessing:
      '''
      Generate a pandas dataframe from a dictionary of features per datetime, which
@@ -18,30 +21,73 @@ class Preprocessing:
       - Missing datetimes are added first with np.NaN feature values,
       - The dataframe is then constructed based on the filled feature dictionary,
       - NaN values are then filled with last known values.
       - Missing datetimes are added first with np.NaN feature values,
       - The dataframe is then constructed based on the filled feature dictionary,
       - NaN values are then filled with last known values.
-
      '''
      '''
-    def __init__(self, dict_features,
-                 start, end, timestep,
-                 features = None):
+
+    def __init__(self, config_file=None,
+                 start=None, end=None, timestep=None,
+                 dict_features=None, dict_target=None):
          '''
          Constructor that defines all needed attributes and collects features.
          '''
          '''
          Constructor that defines all needed attributes and collects features.
          '''
-        logger.info("Entering  NaN values in the feature dataframe")
-        self._dict_features = dict_features
+        # config_file: despite its name, it is indexed by section below
+        #              (FEATURES, FEATURE_CONFIG), i.e. used like a loaded
+        #              ConfigParser -- TODO confirm with callers
+        # start, end, timestep: datetime window and its increment
+        # dict_features: mapping datetime -> {feature name: value}
+        # dict_target: mapping whose values are later used as history features
+        self._config = config_file
+
          self._start = start
          self._end = end
          self._timestep = timestep
          self._start = start
          self._end = end
          self._timestep = timestep
+        self._dict_features = dict_features
+        self._dict_target = dict_target
+
          self._full_dict = None
          self._dataframe = None
          self._datetimes = []
          self._full_dict = None
          self._dataframe = None
          self._datetimes = []
-        # If features are not provided to the constructor, then we collect
-        # any existing feature in the dictionary
-        if features != None:
-            self._features = features
-        else:
-            self._features = set(chain.from_iterable([tuple(u.keys())
-                                                      for u in [*dict_features.values()]]))
  
  
+        # Every feature name that appears in at least one datetime entry; a
+        # missing dictionary is treated as "no feature" instead of crashing
+        feature_names = set(chain.from_iterable(
+            entry.keys() for entry in (dict_features or {}).values()))
+
+        # Each feature starts with its three type flags unset, so that the
+        # column lists built at the end never hit a missing key, even for
+        # features that no configuration section describes
+        self._features = {name: {'binary': False,
+                                 'categorical': False,
+                                 'numerical': False}
+                          for name in feature_names}
+
+        if self._config is not None:
+            for source in self._config['FEATURES']:
+                # NOTE(review): if the values are configparser strings, the
+                # text 'False' is truthy here; getboolean() may be what is
+                # intended -- confirm against the configuration files
+                if not self._config['FEATURES'][source]:
+                    continue
+                feature_file = self._config['FEATURE_CONFIG'][source]
+                config = ConfigParser()
+                # NOTE(review): eval() executes arbitrary code taken from the
+                # configuration; acceptable only if that file is trusted
+                config.read(eval(feature_file))
+                for section in config:
+                    if not config.has_option(section, 'numerical'):
+                        continue
+                    # A section describes every feature whose name starts
+                    # with '<section>_' (or is exactly the section name)
+                    for name, flags in self._features.items():
+                        if name.split('_')[0] == section:
+                            flags['binary'] = config[section].getboolean('binary')
+                            flags['categorical'] = config[section].getboolean('categorical')
+                            flags['numerical'] = config[section].getboolean('numerical')
+
+        self._binary_columns = [k for k in self._features if self._features[k]['binary']]
+        self._categorical_columns = [k for k in self._features if self._features[k]['categorical']]
+        self._numerical_columns = [k for k in self._features if self._features[k]['numerical']]
+
+    @property
+    def start(self):
+        '''
+        Start of the datetime window, as given to the constructor or
+        assigned through the setter.
+        '''
+        return self._start
+
+    @start.setter
+    def start(self, x):
+        '''
+        Replace the stored start value. No validation is performed and
+        nothing else on the instance is updated by this assignment.
+        '''
+        self._start = x
+
+    @property
+    def end(self):
+        '''
+        End of the datetime window; _fill_dict iterates while the current
+        datetime is lower than or equal to this value.
+        '''
+        return self._end
+
+    @end.setter
+    def end(self, x):
+        '''
+        Replace the stored end value. No validation is performed and
+        nothing else on the instance is updated by this assignment.
+        '''
+        self._end = x
+
+    @property
+    def timestep(self):
+        '''
+        Increment added to the current datetime at each iteration of
+        _fill_dict.
+        '''
+        return self._timestep
+
+    @timestep.setter
+    def timestep(self, x):
+        '''
+        Replace the stored timestep. No validation is performed and
+        nothing else on the instance is updated by this assignment.
+        '''
+        self._timestep = x
  
      def _fill_dict(self):
          '''
  
      def _fill_dict(self):
          '''
@@ -54,16 +100,16 @@ class Preprocessing:
          while current <= self._end:
              self._datetimes.append(current)
              if current not in self._dict_features:
          while current <= self._end:
              self._datetimes.append(current)
              if current not in self._dict_features:
-                self._dict_features[current] = {feature:np.NaN
+                self._dict_features[current] = {feature: np.NaN
                                                  for feature in self._features}
              else:
                                                  for feature in self._features}
              else:
-                null_dict = {feature:np.NaN
+                null_dict = {feature: np.NaN
                               for feature in self._features}
                  null_dict.update(self._dict_features[current])
                  self._dict_features[current] = null_dict
              current += self._timestep
          for k in self._dict_features:
                               for feature in self._features}
                  null_dict.update(self._dict_features[current])
                  self._dict_features[current] = null_dict
              current += self._timestep
          for k in self._dict_features:
-            null_dict = {feature:np.NaN
+            null_dict = {feature: np.NaN
                           for feature in self._features}
              null_dict.update(self._dict_features[k])
              self._dict_features[k] = null_dict
                           for feature in self._features}
              null_dict.update(self._dict_features[k])
              self._dict_features[k] = null_dict
@@ -71,8 +117,6 @@ class Preprocessing:
          self._full_dict = {k: self._dict_features[k]
                             for k in sorted(self._dict_features.keys())}
  
          self._full_dict = {k: self._dict_features[k]
                             for k in sorted(self._dict_features.keys())}
  
-
-
      @property
      def full_dict(self):
          '''
      @property
      def full_dict(self):
          '''
@@ -82,7 +126,85 @@ class Preprocessing:
              self._fill_dict()
          return self._full_dict
  
              self._fill_dict()
          return self._full_dict
  
+    def _fill_nan(self):
+        '''
+        Fill NaN values of the feature dataframe, then restrict it to the
+        requested datetime window.
+
+        Numerical columns are filled according to the 'fill_method' option of
+        the PREPROCESSING configuration section: 'propagate' (forward fill),
+        'linear' (linear interpolation) or 'spline' (spline interpolation whose
+        order is read from the 'order' option). Any other value leaves the
+        numerical columns untouched and logs a warning. Categorical columns
+        are always forward filled; binary columns are not modified here.
+
+        Rows whose (year, month, dayInMonth, hour) values do not correspond
+        to one of the datetimes of the window are finally dropped.
+        '''
+        logger.info("Filling NaN numerical values in the feature dataframe")
+        fill_method = self._config['PREPROCESSING']['fill_method']
+        numerical = self._dataframe[self._numerical_columns]
+        if fill_method == 'propagate':
+            # DataFrame.ffill() replaces the deprecated fillna(method='ffill')
+            self._dataframe[self._numerical_columns] = numerical.ffill()
+        elif fill_method == 'linear':
+            self._dataframe[self._numerical_columns] = numerical.interpolate()
+        elif fill_method == 'spline':
+            self._dataframe[self._numerical_columns] =\
+                numerical.interpolate(method='spline',
+                                      order=self._config['PREPROCESSING'].getint('order'))
+        else:
+            logger.warning("Unknown fill_method %r: numerical NaN values are left as is",
+                           fill_method)
+
+        # For the categorical columns, NaN values are filled by duplicating
+        # the last known value (forward fill method)
+        logger.info("Filling NaN categorical values in the feature dataframe")
+        self._dataframe[self._categorical_columns] =\
+            self._dataframe[self._categorical_columns].ffill()
+
+        # Leading NaN values (before the first known value) are deliberately
+        # not back-filled: this may not be a good idea for features that are
+        # available only for recent years, e.g., air quality
+
+        # Dropping rows that are not related to our datetime window (start/
+        # step / end)
+        logger.info("Dropping rows that are not related to our datetime window")
+        # A set gives constant-time membership tests for the row filter below
+        dates = {(x.year, x.month, x.day, x.hour) for x in self._datetimes}
+        self._dataframe['row_ok'] =\
+            self._dataframe.apply(lambda x: (int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)) in dates, axis=1)
+        self._dataframe = self._dataframe[self._dataframe['row_ok']]
+        self._dataframe = self._dataframe.drop(['row_ok'], axis=1)
+        logger.info("Rows dropped")
+
+    def _add_history(self):
+        '''
+        Add previous target values (nb of interventions) as numerical features.
+
+        With nb_lines read from the HISTORY_KNOWLEDGE configuration section,
+        one column is added per lag k in 1..nb_lines: it holds the values of
+        self._dict_target shifted down by k rows and is named
+        'history_<nb_lines - k + 1>' (so 'history_1' is the largest lag).
+        Each new column is registered as numerical, and the first nb_lines
+        rows, whose history is incomplete, are removed from the dataframe.
+        '''
+        logger.info("Integrating previous nb of interventions as features")
+        # NOTE(review): eval() executes arbitrary code taken from the
+        # configuration; acceptable only if that file is trusted
+        nb_lines = eval(self._config['HISTORY_KNOWLEDGE']['nb_lines'])
+        # Materialized once: the target values do not change inside the loop
+        # (assumes one target value per dataframe row -- TODO confirm)
+        targets = list(self._dict_target.values())
+        for k in range(1, nb_lines + 1):
+            name = 'history_' + str(nb_lines - k + 1)
+            # np.nan: the np.NaN alias no longer exists in NumPy 2.0
+            self._dataframe[name] = [np.nan] * k + targets[:-k]
+            self._numerical_columns.append(name)
+        self._dataframe = self._dataframe[nb_lines:]
+
+    def _standardize(self):
+        '''
+        Standardize the numerical columns of the feature dataframe.
+
+        The columns listed in self._numerical_columns are replaced in place
+        by the output of sklearn's preprocessing.scale, called with its
+        default arguments. Other columns are left untouched.
+        '''
+        logger.info("Standardizing numerical values in the feature dataframe")
+        # We operate only on numerical columns
+        self._dataframe[self._numerical_columns] =\
+            preprocessing.scale(self._dataframe[self._numerical_columns])
  
  
+    def _one_hot_encoding(self):
+        '''
+        Replace every categorical column by its one hot encoded columns.
+
+        A new dataframe is built with, in this order, the numerical columns,
+        the binary columns, then the dummy columns produced by
+        pandas.get_dummies for each categorical column (prefixed with the
+        name of that column). It then replaces self._dataframe; columns that
+        belong to none of the three lists are not carried over.
+        '''
+        logger.info("One hot encoding for categorical feature")
+        encoded = pd.DataFrame()
+        # Numerical, then binary features are copied unchanged
+        for name in chain(self._numerical_columns, self._binary_columns):
+            encoded[name] = self._dataframe[name]
+        # Each categorical feature is expanded into its dummy columns
+        for name in self._categorical_columns:
+            dummies = pd.get_dummies(self._dataframe[name], prefix=name)
+            for dummy_name in dummies.columns:
+                encoded[dummy_name] = dummies[dummy_name]
+        self._dataframe = encoded
  
      @property
      def dataframe(self):
  
      @property
      def dataframe(self):
@@ -93,13 +215,16 @@ class Preprocessing:
              logger.info("Creating feature dataframe from feature dictionary")
              self._dataframe = pd.DataFrame.from_dict(self.full_dict,
                                                       orient='index')
              logger.info("Creating feature dataframe from feature dictionary")
              self._dataframe = pd.DataFrame.from_dict(self.full_dict,
                                                       orient='index')
-            logger.info("Filling NaN values in the feature dataframe")
-            #TODO: add other filling methods like linear interpolation
-            self._dataframe = self._dataframe.fillna(method='ffill')
-            self._dataframe = self._dataframe.fillna(method='bfill')
-            self._dataframe = self._dataframe.drop([k.to_pydatetime()
-                                                   for k in self._dataframe.T
-                                                   if k not in self._datetimes])
+            # Dealing with NaN values
+            self._fill_nan()
+            # Adding previous (historical) nb_interventions as features
+            self._add_history()
+            # self._dataframe.to_csv('toto.csv')
+            # exit()
+            # Normalizing numerical values
+            self._standardize()
+            # Dealing with categorical features
+            self._one_hot_encoding()
          return self._dataframe
  
      @dataframe.setter
          return self._dataframe
  
      @dataframe.setter