predictops/learn/preprocessing.py

   1 from configparser import ConfigParser
   2 from itertools import chain
   3 from logging import getLogger
   4 from logging.config import fileConfig
   5 from pathlib import Path
   6 from sklearn import preprocessing
   7
   8 import numpy as np
   9 import pandas as pd
  10
  11 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
  12 logger = getLogger()
  13
  14
  15 class Preprocessing:
  16     '''
  17     Generate a pandas dataframe from a dictionary of features per datetime, which
  18     respects the starting and ending dates of the study, and its precision (the
  19     time step) as passed to the constructor. Missing feature values are completed.
  20
  21      - Missing datetimes are added first with np.NaN feature values,
  22      - The dataframe is then constructed based on the filled feature dictionary,
  23      - NaN values are then filled with last known values.
  24     '''
  25
  26     def __init__(self, config_file=None,
  27                  start=None, end=None, timestep=None,
  28                  dict_features=None, dict_target=None):
  29         '''
  30         Constructor that defines all needed attributes and collects features.
  31         '''
  32         self._config = config_file
  33
  34         self._start = start
  35         self._end = end
  36         self._timestep = timestep
  37         self._dict_features = dict_features
  38         self._dict_target = dict_target
  39
  40         self._full_dict = None
  41         self._dataframe = None
  42         self._datetimes = []
  43
  44         self._features = set(chain.from_iterable([tuple(u.keys())
  45                                                   for u in [*dict_features.values()]]))
  46
  47         #feature_files = Path.cwd() / 'config' / 'features'
  48         self._features = {feat: {'numerical': False, 'categorical': False}
  49                           for feat in self._features}
  50
  51         for feature in self._config['FEATURES']:
  52             if self._config['FEATURES'][feature]:
  53                 feature_file = self._config['FEATURE_CONFIG'][feature]
  54                 config = ConfigParser()
  55                 config.read(eval(feature_file))
  56                 for section in config:
  57                     if config.has_option(section, 'numerical'):
  58                         for feature in self._features:
  59                             if feature.split('_')[0] == section:
  60                                 self._features[feature]['binary'] = config[section].getboolean('binary')
  61                                 self._features[feature]['categorical'] = config[section].getboolean('categorical')
  62                                 self._features[feature]['numerical'] = config[section].getboolean('numerical')
  63
  64         self._binary_columns = [k for k in self._features if self._features[k]['binary']]
  65         self._categorical_columns = [k for k in self._features if self._features[k]['categorical']]
  66         self._numerical_columns = [k for k in self._features if self._features[k]['numerical']]
  67
  68     @property
  69     def start(self):
  70         return self._start
  71
  72     @start.setter
  73     def start(self, x):
  74         self._start = x
  75
  76     @property
  77     def end(self):
  78         return self._end
  79
  80     @end.setter
  81     def end(self, x):
  82         self._end = x
  83
  84     @property
  85     def timestep(self):
  86         return self._timestep
  87
  88     @timestep.setter
  89     def timestep(self, x):
  90         self._timestep = x
  91
  92     def _fill_dict(self):
  93         '''
  94         Add datetime keys in the dated feature dictionary that are missing. The
  95         features are then set to np.NaN. Add missing features in existing datetimes
  96         too.
  97         '''
  98         logger.info("Adding missing dates and filling missing features with NaN values")
  99         current = self._start
 100         while current <= self._end:
 101             self._datetimes.append(current)
 102             if current not in self._dict_features:
 103                 self._dict_features[current] = {feature: np.NaN
 104                                                 for feature in self._features}
 105             else:
 106                 null_dict = {feature: np.NaN
 107                              for feature in self._features}
 108                 null_dict.update(self._dict_features[current])
 109                 self._dict_features[current] = null_dict
 110             current += self._timestep
 111         for k in self._dict_features:
 112             null_dict = {feature: np.NaN
 113                          for feature in self._features}
 114             null_dict.update(self._dict_features[k])
 115             self._dict_features[k] = null_dict
 116
 117         self._full_dict = {k: self._dict_features[k]
 118                            for k in sorted(self._dict_features.keys())}
 119
 120     @property
 121     def full_dict(self):
 122         '''
 123         Returns the fully filled dated feature dictionary, ordered by datetimes
 124         '''
 125         if self._full_dict is None:
 126             self._fill_dict()
 127         return self._full_dict
 128
 129     def _fill_nan(self):
 130         '''
 131         Fill NaN values, either by propagation or by interpolation (linear or splines)
 132         '''
 133         logger.info("Filling NaN numerical values in the feature dataframe")
 134         # We interpolate (linearly or with splines) only numerical columns
 135         # The interpolation
 136         if self._config['PREPROCESSING']['fill_method'] == 'propagate':
 137             self._dataframe[self._numerical_columns] =\
 138                 self._dataframe[self._numerical_columns].fillna(method='ffill')
 139         elif self._config['PREPROCESSING']['fill_method'] == 'linear':
 140             self._dataframe[self._numerical_columns] =\
 141                 self._dataframe[self._numerical_columns].interpolate()
 142         elif self._config['PREPROCESSING']['fill_method'] == 'spline':
 143             self._dataframe[self._numerical_columns] =\
 144                 self._dataframe[self._numerical_columns].interpolate(method='spline',
 145                                                                      order=self._config['PREPROCESSING'].getint('order'))
 146
 147         # For the categorical columns, NaN values are filled by duplicating
 148         # the last known value (forward fill method)
 149         logger.info("Filling NaN categorical values in the feature dataframe")
 150         self._dataframe[self._categorical_columns] =\
 151             self._dataframe[self._categorical_columns].fillna(method='ffill')
 152
 153         # Uncomment this line to fill NaN values at the beginning of the
 154         # dataframe. This may not be a good idea, especially for features
 155         # that are available only for recent years, e.g., air quality
 156         #self._dataframe = self._dataframe.fillna(method='bfill')
 157
 158         # Dropping rows that are not related to our datetime window (start/
 159         # step / end)
 160         logger.info("Dropping rows that are not related to our datetime window")
 161         dates = tuple((x.year, x.month, x.day, x.hour) for x in self._datetimes)
 162         self._dataframe['row_ok'] =\
 163             self._dataframe.apply(lambda x: (int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)) in dates, axis=1)
 164         self._dataframe = self._dataframe[self._dataframe['row_ok']]
 165         self._dataframe = self._dataframe.drop(['row_ok'], axis=1)
 166         logger.info("Rows dropped")
 167
 168     def _add_history(self):
 169         '''
 170         Integrating previous nb of interventions as features
 171         '''
 172         logger.info("Integrating previous nb of interventions as features")
 173         nb_lines = eval(self._config['HISTORY_KNOWLEDGE']['nb_lines'])
 174         for k in range(1, nb_lines + 1):
 175             name = 'history_' + str(nb_lines - k + 1)
 176             self._dataframe[name] = [np.NaN] * k + list(self._dict_target.values())[:-k]
 177             self._numerical_columns.append(name)
 178         self._dataframe = self._dataframe[nb_lines:]
 179
 180     def _standardize(self):
 181         '''
 182         Normalizing numerical features
 183         '''
 184         logger.info("Standardizing numerical values in the feature dataframe")
 185         # We operate only on numerical columns
 186         self._dataframe[self._numerical_columns] =\
 187             preprocessing.scale(self._dataframe[self._numerical_columns])
 188
 189     def _one_hot_encoding(self):
 190         '''
 191         Apply a one hot encoding for category features
 192         '''
 193         logger.info("One hot encoding for categorical feature")
 194         # We store numerical columns
 195
 196         df_out = pd.DataFrame()
 197         for col in self._numerical_columns:
 198             df_out[col] = self._dataframe[col]
 199         # Idem for binary features
 200         for col in self._binary_columns:
 201             df_out[col] = self._dataframe[col]
 202         # The one hot encoding
 203         for col in self._categorical_columns:
 204             pd1 = pd.get_dummies(self._dataframe[col], prefix=col)
 205             for col1 in pd1.columns:
 206                 df_out[col1] = pd1[col1]
 207         self._dataframe = df_out
 208
 209     @property
 210     def dataframe(self):
 211         '''
 212         Returns the feature dataframe, after creating it if needed.
 213         '''
 214         if self._dataframe is None:
 215             logger.info("Creating feature dataframe from feature dictionary")
 216             self._dataframe = pd.DataFrame.from_dict(self.full_dict,
 217                                                      orient='index')
 218             # Dealing with NaN values
 219             self._fill_nan()
 220             # Adding previous (historical) nb_interventions as features
 221             self._add_history()
 222             # self._dataframe.to_csv('toto.csv')
 223             # exit()
 224             # Normalizing numerical values
 225             self._standardize()
 226             # Dealing with categorical features
 227             self._one_hot_encoding()
 228         return self._dataframe
 229
 230     @dataframe.setter
 231     def dataframe(self, df):
 232         self._dataframe = df
 233
 234