predictops/learn/preprocessing.py

   1 from configparser import ConfigParser
   2 from datetime import datetime, timedelta
   3 from itertools import chain
   4 from logging import getLogger
   5 from logging.config import fileConfig
   6 from pathlib import Path
   7
   8 import numpy as np
   9 import pandas as pd
  10
  11 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
  12 logger = getLogger()
  13
  14 class Preprocessing:
  15     '''
  16     Generate a pandas dataframe from a dictionary of features per datetime, which
  17     respects the starting and ending dates of the study, and its precision (the
  18     time step) as passed to the constructor. Missing feature values are completed.
  19
  20      - Missing datetimes are added first with np.NaN feature values,
  21      - The dataframe is then constructed based on the filled feature dictionary,
  22      - NaN values are then filled with last known values.
  23     '''
  24
  25     def __init__(self, config_file = None, dict_features = None, features = None):
  26         '''
  27         Constructor that defines all needed attributes and collects features.
  28         '''
  29         self._config = ConfigParser()
  30         self._config.read(config_file)
  31
  32         self._start = datetime.strptime(self._config['DATETIME']['start'],
  33                                         '%m/%d/%Y %H:%M:%S')
  34         self._end = datetime.strptime(self._config['DATETIME']['end'],
  35                                         '%m/%d/%Y %H:%M:%S')
  36         self._timestep = timedelta(hours =
  37                                    self._config['DATETIME'].getfloat('hourStep'))
  38         self._dict_features = dict_features
  39         self._full_dict = None
  40         self._dataframe = None
  41         self._datetimes = []
  42         # If features are not provided to the constructor, then we collect
  43         # any existing feature in the dictionary
  44         if features != None:
  45             self._features = features
  46         else:
  47             self._features = set(chain.from_iterable([tuple(u.keys())
  48                                                       for u in [*dict_features.values()]]))
  49
  50
  51     @property
  52     def start(self):
  53         return self._start
  54
  55     @start.setter
  56     def start(self, x):
  57         self._start = x
  58
  59
  60     @property
  61     def end(self):
  62         return self._end
  63
  64     @end.setter
  65     def end(self, x):
  66         self._end = x
  67
  68
  69     @property
  70     def timestep(self):
  71         return self._timestep
  72
  73     @timestep.setter
  74     def timestep(self, x):
  75         self._timestep = x
  76
  77
  78     def _fill_dict(self):
  79         '''
  80         Add datetime keys in the dated feature dictionary that are missing. The
  81         features are then set to np.NaN. Add missing features in existing datetimes
  82         too.
  83         '''
  84         logger.info("Adding missing dates and filling missing features with NaN values")
  85         current = self._start
  86         while current <= self._end:
  87             self._datetimes.append(current)
  88             if current not in self._dict_features:
  89                 self._dict_features[current] = {feature:np.NaN
  90                                                 for feature in self._features}
  91             else:
  92                 null_dict = {feature:np.NaN
  93                              for feature in self._features}
  94                 null_dict.update(self._dict_features[current])
  95                 self._dict_features[current] = null_dict
  96             current += self._timestep
  97         for k in self._dict_features:
  98             null_dict = {feature:np.NaN
  99                          for feature in self._features}
 100             null_dict.update(self._dict_features[k])
 101             self._dict_features[k] = null_dict
 102
 103         self._full_dict = {k: self._dict_features[k]
 104                            for k in sorted(self._dict_features.keys())}
 105
 106
 107
 108     @property
 109     def full_dict(self):
 110         '''
 111         Returns the fully filled dated feature dictionary, ordered by datetimes
 112         '''
 113         if self._full_dict is None:
 114             self._fill_dict()
 115         return self._full_dict
 116
 117
 118
 119     @property
 120     def dataframe(self):
 121         '''
 122         Returns the feature dataframe, after creating it if needed.
 123         '''
 124         if self._dataframe is None:
 125             logger.info("Creating feature dataframe from feature dictionary")
 126             self._dataframe = pd.DataFrame.from_dict(self.full_dict,
 127                                                      orient='index')
 128             logger.info("Filling NaN values in the feature dataframe")
 129
 130             if self._config['PREPROCESSING']['fill_method'] == 'propagate':
 131                 self._dataframe = self._dataframe.fillna(method='ffill')
 132             elif self._config['PREPROCESSING']['fill_method'] == 'linear':
 133                 self._dataframe = self._dataframe.interpolate()
 134             elif self._config['PREPROCESSING']['fill_method'] == 'spline':
 135                 self._dataframe = self._dataframe.interpolate(method='spline',
 136                                                               order=self._config['PREPROCESSING'].getint('order'))
 137             self._dataframe = self._dataframe.fillna(method='bfill')
 138
 139             self._dataframe = self._dataframe.drop([k.to_pydatetime()
 140                                                    for k in self._dataframe.T
 141                                                    if k not in self._datetimes])
 142         return self._dataframe
 143
 144
 145     @dataframe.setter
 146     def dataframe(self, df):
 147         self._dataframe = df
 148
 149