predictops/learn/preprocessing.py

   1 from configparser import ConfigParser
   2 from csv import DictReader
   3 from datetime import datetime, timedelta
   4 from itertools import chain
   5 from logging import getLogger
   6 from logging.config import fileConfig
   7 from os import listdir
   8 from pathlib import Path
   9 from sklearn import preprocessing
  10
  11 import numpy as np
  12 import pandas as pd
  13
  14 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
  15 logger = getLogger()
  16
  17 class Preprocessing:
    '''
    Generate a pandas dataframe from a dictionary of features per datetime, which
    respects the starting and ending dates of the study, and its precision (the
    time step), as read from the configuration file passed to the constructor.
    Missing feature values are completed.

     - Missing datetimes are added first with NaN feature values,
     - The dataframe is then constructed based on the filled feature dictionary,
     - NaN values are then filled: according to the configured fill method
       (propagation of the last known value, linear or spline interpolation)
       for numerical features, and with the last known value for categorical
       features.
    '''
  27
  28     def __init__(self, config_file = None, dict_features = None, features = None):
  29         '''
  30         Constructor that defines all needed attributes and collects features.
  31         '''
  32         self._config = ConfigParser()
  33         self._config.read(config_file)
  34
  35         self._start = datetime.strptime(self._config['DATETIME']['start'],
  36                                         '%m/%d/%Y %H:%M:%S')
  37         self._end = datetime.strptime(self._config['DATETIME']['end'],
  38                                         '%m/%d/%Y %H:%M:%S')
  39         self._timestep = timedelta(hours =
  40                                    self._config['DATETIME'].getfloat('hourStep'))
  41         self._dict_features = dict_features
  42         self._full_dict = None
  43         self._dataframe = None
  44         self._datetimes = []
  45         # If features are not provided to the constructor, then we collect
  46         # any existing feature in the dictionary
  47         if features != None:
  48             self._features = features
  49         else:
  50             self._features = set(chain.from_iterable([tuple(u.keys())
  51                                                       for u in [*dict_features.values()]]))
  52         feature_files = Path.cwd() / 'config' / 'features'
  53         self._features = {feat : {'numerical': False} for feat in self._features}
  54         for feature_file in listdir(feature_files):
  55             if feature_file.endswith('csv'):
  56                 with open(feature_files / feature_file , "r") as f:
  57                     reader = DictReader(f, delimiter=',')
  58                     typed_names = {row['name']: row['type'] for row in reader}
  59                 for feature in self._features:
  60                     if feature.split('_')[0] in typed_names:
  61                         self._features[feature]['type'] = int(typed_names[feature.split('_')[0]])
  62             elif feature_file.endswith('cfg'):
  63                 config = ConfigParser()
  64                 config.read(feature_files / feature_file)
  65                 for section in config:
  66                     if config.has_option(section, 'numerical'):
  67                         self._features[section]['numerical'] = config[section].getboolean('numerical')
  68
  69
  70
  71     @property
  72     def start(self):
  73         return self._start
  74
  75     @start.setter
  76     def start(self, x):
  77         self._start = x
  78
  79
  80     @property
  81     def end(self):
  82         return self._end
  83
  84     @end.setter
  85     def end(self, x):
  86         self._end = x
  87
  88
  89     @property
  90     def timestep(self):
  91         return self._timestep
  92
  93     @timestep.setter
  94     def timestep(self, x):
  95         self._timestep = x
  96
  97
  98     def _fill_dict(self):
  99         '''
 100         Add datetime keys in the dated feature dictionary that are missing. The
 101         features are then set to np.NaN. Add missing features in existing datetimes
 102         too.
 103         '''
 104         logger.info("Adding missing dates and filling missing features with NaN values")
 105         current = self._start
 106         while current <= self._end:
 107             self._datetimes.append(current)
 108             if current not in self._dict_features:
 109                 self._dict_features[current] = {feature:np.NaN
 110                                                 for feature in self._features}
 111             else:
 112                 null_dict = {feature:np.NaN
 113                              for feature in self._features}
 114                 null_dict.update(self._dict_features[current])
 115                 self._dict_features[current] = null_dict
 116             current += self._timestep
 117         for k in self._dict_features:
 118             null_dict = {feature:np.NaN
 119                          for feature in self._features}
 120             null_dict.update(self._dict_features[k])
 121             self._dict_features[k] = null_dict
 122
 123         self._full_dict = {k: self._dict_features[k]
 124                            for k in sorted(self._dict_features.keys())}
 125
 126
 127
 128     @property
 129     def full_dict(self):
 130         '''
 131         Returns the fully filled dated feature dictionary, ordered by datetimes
 132         '''
 133         if self._full_dict is None:
 134             self._fill_dict()
 135         return self._full_dict
 136
 137
 138     def _fill_nan(self):
 139         '''
 140         Fill NaN values, either by propagation or by interpolation (linear or splines)
 141         '''
 142         logger.info("Filling NaN numerical values in the feature dataframe")
 143         # We interpolate (linearly or with splines) only numerical columns
 144         numerical_columns = [k for k in self._features if self._features[k]['type'] == 1
 145                    or (self._features[k]['type'] == 3 and self._features[k]['numerical'])]
 146         # The interpolation
 147         if self._config['PREPROCESSING']['fill_method'] == 'propagate':
 148             self._dataframe[numerical_columns] =\
 149                 self._dataframe[numerical_columns].fillna(method='ffill')
 150         elif self._config['PREPROCESSING']['fill_method'] == 'linear':
 151             self._dataframe[numerical_columns] =\
 152                 self._dataframe[numerical_columns].interpolate()
 153         elif self._config['PREPROCESSING']['fill_method'] == 'spline':
 154             self._dataframe[numerical_columns] =\
 155                 self._dataframe[numerical_columns].interpolate(method='spline',
 156                      order=self._config['PREPROCESSING'].getint('order'))
 157
 158         # For the categorical columns, NaN values are filled by duplicating
 159         # the last known value (forward fill method)
 160         logger.info("Filling NaN categorical values in the feature dataframe")
 161         categorical_columns = [k for k in self._features if self._features[k]['type'] == 2
 162                    or (self._features[k]['type'] == 3 and not self._features[k]['numerical'])]
 163         self._dataframe[categorical_columns] =\
 164             self._dataframe[categorical_columns].fillna(method='ffill')
 165
 166         # Uncomment this line to fill NaN values at the beginning of the
 167         # dataframe. This may not be a good idea, especially for features
 168         # that are available only for recent years, e.g., air quality
 169         #self._dataframe = self._dataframe.fillna(method='bfill')
 170
 171         # Dropping rows that are not related to our datetime window (start/
 172         # step / end)
 173         self._dataframe = self._dataframe.drop([k.to_pydatetime()
 174                                                for k in self._dataframe.T
 175                                                if k not in self._datetimes])
 176
 177
 178     def _standardize(self):
 179         '''
 180         Normalizing numerical features
 181         '''
 182         logger.info("Standardizing numerical values in the feature dataframe")
 183         # We operate only on numerical columns
 184         numerical_columns = [k for k in self._features if self._features[k]['type'] == 1
 185                    or (self._features[k]['type'] == 3 and self._features[k]['numerical'])]
 186         self._dataframe[numerical_columns] = preprocessing.scale(self._dataframe[numerical_columns])
 187
 188
 189     def _one_hot_encoding(self):
 190         '''
 191         Apply a one hot encoding for category features
 192         '''
 193         logger.info("One hot encoding for categorical feature")
 194         categorical_columns = [k for k in self._features if self._features[k]['type'] == 2
 195                    or (self._features[k]['type'] == 3 and not self._features[k]['numerical'])]
 196
 197         # On fait un codage disjonctif complet des variables qualitatives
 198         df_out = pd.DataFrame()
 199         for col in categorical_columns:
 200             pd1 = pd.get_dummies(self._dataframe[col],prefix=col)
 201             for col1 in pd1.columns:
 202                 df_out[col1] = pd1[col1]
 203         self._dataframe = df_out
 204         print(self._dataframe.head())
 205
 206
 207
 208     @property
 209     def dataframe(self):
 210         '''
 211         Returns the feature dataframe, after creating it if needed.
 212         '''
 213         if self._dataframe is None:
 214             logger.info("Creating feature dataframe from feature dictionary")
 215             self._dataframe = pd.DataFrame.from_dict(self.full_dict,
 216                                                      orient='index')
 217             # Dealing with NaN values
 218             self._fill_nan()
 219             # Normalizing numerical values
 220             self._standardize()
 221             # Dealing with categorical features
 222             self._one_hot_encoding()
 223         return self._dataframe
 224
 225
 226     @dataframe.setter
 227     def dataframe(self, df):
 228         self._dataframe = df
 229
 230