Refactoring, and categorical / numerical / mixed NaN values are now filled separately

[predictops.git] / predictops / learn / preprocessing.py
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py

index 5400d1d39f1135ce5e2abcfec2541201cf5d8ed6..939a7fa30e79d45314adec6f8d526362137e3e9b 100644 (file)
--- a/predictops/learn/preprocessing.py
+++ b/predictops/learn/preprocessing.py
@@ -1,8 +1,10 @@
  from configparser import ConfigParser
  from configparser import ConfigParser
+from csv import DictReader
  from datetime import datetime, timedelta
  from itertools import chain
  from logging import getLogger
  from logging.config import fileConfig
  from datetime import datetime, timedelta
  from itertools import chain
  from logging import getLogger
  from logging.config import fileConfig
+from os import listdir
  from pathlib import Path
  
  import numpy as np
  from pathlib import Path
  
  import numpy as np
@@ -46,6 +48,23 @@ class Preprocessing:
          else:
              self._features = set(chain.from_iterable([tuple(u.keys())
                                                        for u in [*dict_features.values()]]))
          else:
              self._features = set(chain.from_iterable([tuple(u.keys())
                                                        for u in [*dict_features.values()]]))
+        feature_files = Path.cwd() / 'config' / 'features'
+        self._features = {feat : {'numerical': False} for feat in self._features}
+        for feature_file in listdir(feature_files):
+            if feature_file.endswith('csv'):
+                with open(feature_files / feature_file , "r") as f:
+                    reader = DictReader(f, delimiter=',')
+                    typed_names = {row['name']: row['type'] for row in reader}
+                for feature in self._features:
+                    if feature.split('_')[0] in typed_names:
+                        self._features[feature]['type'] = int(typed_names[feature.split('_')[0]])
+            elif feature_file.endswith('cfg'):
+                config = ConfigParser()
+                config.read(feature_files / feature_file)
+                for section in config:
+                    if config.has_option(section, 'numerical'):
+                        self._features[section]['numerical'] = config[section].getboolean('numerical')
+
  
  
      @property
  
  
      @property
@@ -125,17 +144,37 @@ class Preprocessing:
              logger.info("Creating feature dataframe from feature dictionary")
              self._dataframe = pd.DataFrame.from_dict(self.full_dict,
                                                       orient='index')
              logger.info("Creating feature dataframe from feature dictionary")
              self._dataframe = pd.DataFrame.from_dict(self.full_dict,
                                                       orient='index')
-            logger.info("Filling NaN values in the feature dataframe")
-
+            logger.info("Filling NaN numerical values in the feature dataframe")
+            # We interpolate (linearly or with splines) only numerical columns
+            numerical_columns = [k for k in self._features if self._features[k]['type'] == 1
+                       or (self._features[k]['type'] == 3 and self._features[k]['numerical'])]
+            # The interpolation
              if self._config['PREPROCESSING']['fill_method'] == 'propagate':
              if self._config['PREPROCESSING']['fill_method'] == 'propagate':
-                self._dataframe = self._dataframe.fillna(method='ffill')
+                self._dataframe[numerical_columns] =\
+                    self._dataframe[numerical_columns].fillna(method='ffill')
              elif self._config['PREPROCESSING']['fill_method'] == 'linear':
              elif self._config['PREPROCESSING']['fill_method'] == 'linear':
-                self._dataframe = self._dataframe.interpolate()
+                self._dataframe[numerical_columns] =\
+                    self._dataframe[numerical_columns].interpolate()
              elif self._config['PREPROCESSING']['fill_method'] == 'spline':
              elif self._config['PREPROCESSING']['fill_method'] == 'spline':
-                self._dataframe = self._dataframe.interpolate(method='spline',
-                                                              order=self._config['PREPROCESSING'].getint('order'))
-            self._dataframe = self._dataframe.fillna(method='bfill')
-
+                self._dataframe[numerical_columns] =\
+                    self._dataframe[numerical_columns].interpolate(method='spline',
+                         order=self._config['PREPROCESSING'].getint('order'))
+
+            # For the categorical columns, NaN values are filled by duplicating
+            # the last known value (forward fill method)
+            logger.info("Filling NaN categorical values in the feature dataframe")
+            categorical_columns = [k for k in self._features if self._features[k]['type'] == 2
+                       or (self._features[k]['type'] == 3 and not self._features[k]['numerical'])]
+            self._dataframe[categorical_columns] =\
+                self._dataframe[categorical_columns].fillna(method='ffill')
+
+            # Uncomment this line to fill NaN values at the beginning of the
+            # dataframe. This may not be a good idea, especially for features
+            # that are available only for recent years, e.g., air quality
+            #self._dataframe = self._dataframe.fillna(method='bfill')
+
+            # Dropping rows that are not related to our datetime window (start/
+            # step / end)
              self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                     for k in self._dataframe.T
                                                     if k not in self._datetimes])
              self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                     for k in self._dataframe.T
                                                     if k not in self._datetimes])