Adding Ramadan features, and binary category of features.
[predictops.git] / predictops / learn / preprocessing.py
from configparser import ConfigParser
from datetime import datetime, timedelta
from itertools import chain
from logging import getLogger
from logging.config import fileConfig
from pathlib import Path
from sklearn import preprocessing

import numpy as np
import pandas as pd

fileConfig(Path.cwd() / 'config' / 'logging.cfg')
logger = getLogger()


class Preprocessing:
    '''
    Generate a pandas dataframe from a dictionary of features per datetime. The
    dataframe respects the starting and ending dates of the study and its
    precision (the time step), as passed to the constructor. Missing feature
    values are completed as follows:

     - missing datetimes are added first, with np.nan feature values,
     - the dataframe is then built from the filled feature dictionary,
     - remaining NaN values are filled, either by propagating the last known
       values or by interpolation, depending on the configuration.
    '''
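    # A minimal sketch of the expected inputs (illustrative, not part of the
    # original file): both dictionaries are keyed by datetime, and each feature
    # dictionary maps feature names to values. The feature names and values
    # below are hypothetical.
    #
    #   dict_features = {
    #       datetime(2018, 1, 1, 0): {'hour': 0, 'temperature': 3.2},
    #       datetime(2018, 1, 1, 6): {'hour': 6},      # missing features allowed
    #   }
    #   dict_target = {
    #       datetime(2018, 1, 1, 0): 4,                # e.g., nb of interventions
    #       datetime(2018, 1, 1, 6): 2,
    #   }
    #
    # Missing datetimes and features are completed by _fill_dict() and
    # _fill_nan() below.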

    def __init__(self, config_file=None,
                 dict_features=None, dict_target=None):
        '''
        Constructor that defines all needed attributes and collects features.
        '''
        self._config = config_file

        self._start = datetime.strptime(self._config['DATETIME']['start'],
                                        '%m/%d/%Y %H:%M:%S')
        self._end = datetime.strptime(self._config['DATETIME']['end'],
                                      '%m/%d/%Y %H:%M:%S')
        self._timestep = timedelta(hours=self._config['DATETIME'].getfloat('hourStep'))
        self._dict_features = dict_features
        self._dict_target = dict_target

        self._full_dict = None
        self._dataframe = None
        self._datetimes = []

        # Union of the feature names observed over all datetimes
        self._features = set(chain.from_iterable(dict_features.values()))

        #feature_files = Path.cwd() / 'config' / 'features'
        self._features = {feat: {'numerical': False, 'categorical': False,
                                 'binary': False}
                          for feat in self._features}

        for feature in self._config['FEATURES']:
            if self._config['FEATURES'].getboolean(feature):
                # The config value is a Python expression giving the path of
                # the feature configuration file, hence the eval()
                feature_file = self._config['FEATURE_CONFIG'][feature]
                config = ConfigParser()
                config.read(eval(feature_file))
                for section in config:
                    if config.has_option(section, 'numerical'):
                        for feat in self._features:
                            if feat.split('_')[0] == section:
                                self._features[feat]['binary'] = config[section].getboolean('binary')
                                self._features[feat]['categorical'] = config[section].getboolean('categorical')
                                self._features[feat]['numerical'] = config[section].getboolean('numerical')

        self._binary_columns = [k for k in self._features if self._features[k]['binary']]
        self._categorical_columns = [k for k in self._features if self._features[k]['categorical']]
        self._numerical_columns = [k for k in self._features if self._features[k]['numerical']]

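    # A minimal sketch of the configuration the constructor reads (the section
    # and option names are the ones accessed above; the concrete values and
    # the 'meteofrance' feature name are purely illustrative):
    #
    #   [DATETIME]
    #   start = 01/01/2016 00:00:00
    #   end = 12/31/2018 23:00:00
    #   hourStep = 3
    #
    #   [FEATURES]
    #   meteofrance = True
    #
    #   [FEATURE_CONFIG]
    #   meteofrance = Path.cwd() / 'config' / 'features' / 'meteofrance.cfg'
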
    @property
    def start(self):
        return self._start

    @start.setter
    def start(self, x):
        self._start = x

    @property
    def end(self):
        return self._end

    @end.setter
    def end(self, x):
        self._end = x

    @property
    def timestep(self):
        return self._timestep

    @timestep.setter
    def timestep(self, x):
        self._timestep = x

    def _fill_dict(self):
        '''
        Add the datetime keys that are missing from the dated feature
        dictionary, with their features set to np.nan, and add missing
        features to the already existing datetimes.
        '''
        logger.info("Adding missing dates and filling missing features with NaN values")
        current = self._start
        while current <= self._end:
            self._datetimes.append(current)
            if current not in self._dict_features:
                self._dict_features[current] = {feature: np.nan
                                                for feature in self._features}
            else:
                null_dict = {feature: np.nan
                             for feature in self._features}
                null_dict.update(self._dict_features[current])
                self._dict_features[current] = null_dict
            current += self._timestep
        for k in self._dict_features:
            null_dict = {feature: np.nan
                         for feature in self._features}
            null_dict.update(self._dict_features[k])
            self._dict_features[k] = null_dict

        self._full_dict = {k: self._dict_features[k]
                           for k in sorted(self._dict_features.keys())}

    @property
    def full_dict(self):
        '''
        Returns the fully filled dated feature dictionary, ordered by datetime.
        '''
        if self._full_dict is None:
            self._fill_dict()
        return self._full_dict

    def _fill_nan(self):
        '''
        Fill NaN values, either by propagation or by interpolation (linear or splines)
        '''
        logger.info("Filling NaN numerical values in the feature dataframe")
        # Only numerical columns are interpolated (linearly or with splines);
        # the method and, for splines, the order come from the configuration
        if self._config['PREPROCESSING']['fill_method'] == 'propagate':
            self._dataframe[self._numerical_columns] =\
                self._dataframe[self._numerical_columns].ffill()
        elif self._config['PREPROCESSING']['fill_method'] == 'linear':
            self._dataframe[self._numerical_columns] =\
                self._dataframe[self._numerical_columns].interpolate()
        elif self._config['PREPROCESSING']['fill_method'] == 'spline':
            self._dataframe[self._numerical_columns] =\
                self._dataframe[self._numerical_columns].interpolate(method='spline',
                                                                     order=self._config['PREPROCESSING'].getint('order'))

        # For the categorical columns, NaN values are filled by duplicating
        # the last known value (forward fill method)
        logger.info("Filling NaN categorical values in the feature dataframe")
        self._dataframe[self._categorical_columns] =\
            self._dataframe[self._categorical_columns].ffill()

        # Uncomment this line to fill NaN values at the beginning of the
        # dataframe. This may not be a good idea, especially for features
        # that are available only for recent years, e.g., air quality
        #self._dataframe = self._dataframe.bfill()

        # Dropping rows that are not related to our datetime window (start /
        # step / end)
        logger.info("Dropping rows that are not related to our datetime window")
        dates = {(x.year, x.month, x.day, x.hour) for x in self._datetimes}
        self._dataframe['row_ok'] =\
            self._dataframe.apply(lambda x: (int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)) in dates,
                                  axis=1)
        self._dataframe = self._dataframe[self._dataframe['row_ok']]
        self._dataframe = self._dataframe.drop(['row_ok'], axis=1)
        logger.info("Rows dropped")

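    # For reference, the [PREPROCESSING] options read by _fill_nan above (the
    # option names and tested values come from the code; the concrete example
    # values are illustrative):
    #
    #   [PREPROCESSING]
    #   fill_method = spline    ; one of: propagate, linear, spline
    #   order = 3               ; spline order, only read when fill_method = spline
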
    def _add_history(self):
        '''
        Integrate the numbers of interventions at previous time steps as features.
        '''
        logger.info("Integrating previous nb of interventions as features")
        # The config value is evaluated as a Python expression yielding an integer
        nb_lines = eval(self._config['HISTORY_KNOWLEDGE']['nb_lines'])
        for k in range(1, nb_lines + 1):
            name = 'history_' + str(nb_lines - k + 1)
            # Target values shifted k time steps back, padded with NaN at the start
            self._dataframe[name] = [np.nan] * k + list(self._dict_target.values())[:-k]
            self._numerical_columns.append(name)
        # The first nb_lines rows have an incomplete history and are dropped
        self._dataframe = self._dataframe[nb_lines:]

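    # Worked example of the naming scheme in _add_history (derived from the
    # loop above): with nb_lines = 3, the added columns are
    #   history_3 = target value 1 time step back,
    #   history_2 = target value 2 time steps back,
    #   history_1 = target value 3 time steps back,
    # and the first 3 rows, whose history is incomplete, are dropped.
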
    def _standardize(self):
        '''
        Standardize numerical features (zero mean, unit variance).
        '''
        logger.info("Standardizing numerical values in the feature dataframe")
        # We operate only on numerical columns
        self._dataframe[self._numerical_columns] =\
            preprocessing.scale(self._dataframe[self._numerical_columns])

    def _one_hot_encoding(self):
        '''
        Apply one-hot encoding to the categorical features.
        '''
        logger.info("One-hot encoding of the categorical features")
        # We keep the numerical columns as they are
        df_out = pd.DataFrame()
        for col in self._numerical_columns:
            df_out[col] = self._dataframe[col]
        # Same for the binary features
        for col in self._binary_columns:
            df_out[col] = self._dataframe[col]
        # The one-hot encoding itself: each categorical column is replaced by
        # one indicator column per category
        for col in self._categorical_columns:
            pd1 = pd.get_dummies(self._dataframe[col], prefix=col)
            for col1 in pd1.columns:
                df_out[col1] = pd1[col1]
        self._dataframe = df_out

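    # Illustrative example of the encoding above (hypothetical feature name):
    # a categorical column 'weather' with values {'rain', 'snow', 'sun'} is
    # replaced by the indicator columns 'weather_rain', 'weather_snow' and
    # 'weather_sun' produced by pd.get_dummies(..., prefix=col).
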
    @property
    def dataframe(self):
        '''
        Returns the feature dataframe, after creating it if needed.
        '''
        if self._dataframe is None:
            logger.info("Creating feature dataframe from feature dictionary")
            self._dataframe = pd.DataFrame.from_dict(self.full_dict,
                                                     orient='index')
            # Dealing with NaN values
            self._fill_nan()
            # Adding previous (historical) nb of interventions as features
            self._add_history()
            # Standardizing numerical values
            self._standardize()
            # Dealing with categorical features
            self._one_hot_encoding()
        return self._dataframe

    @dataframe.setter
    def dataframe(self, df):
        self._dataframe = df
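

# Usage sketch (illustrative, not part of the original module). It assumes a
# configuration file named 'learn.cfg' with the sections read above
# ([DATETIME], [FEATURES], [FEATURE_CONFIG], [PREPROCESSING],
# [HISTORY_KNOWLEDGE]) and feature/target dictionaries keyed by datetime;
# the file name and the dictionaries are hypothetical.
#
#   from configparser import ConfigParser
#   config = ConfigParser()
#   config.read(Path.cwd() / 'config' / 'learn.cfg')
#
#   process = Preprocessing(config_file=config,
#                           dict_features=dict_features,
#                           dict_target=dict_target)
#   X = process.dataframe   # filled, standardized, one-hot encoded features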