predictops/learn/preprocessing.py

   1 from configparser import ConfigParser
   2 from csv import DictReader
   3 from datetime import datetime, timedelta
   4 from itertools import chain
   5 from logging import getLogger
   6 from logging.config import fileConfig
   7 from os import listdir
   8 from pathlib import Path
   9
  10 import numpy as np
  11 import pandas as pd
  12
  13 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
  14 logger = getLogger()
  15
  16 class Preprocessing:
  17     '''
  18     Generate a pandas dataframe from a dictionary of features per datetime, which
  19     respects the starting and ending dates of the study, and its precision (the
  20     time step) as passed to the constructor. Missing feature values are completed.
  21
  22      - Missing datetimes are added first with np.NaN feature values,
  23      - The dataframe is then constructed based on the filled feature dictionary,
  24      - NaN values are then filled with last known values.
  25     '''
  26
  27     def __init__(self, config_file = None, dict_features = None, features = None):
  28         '''
  29         Constructor that defines all needed attributes and collects features.
  30         '''
  31         self._config = ConfigParser()
  32         self._config.read(config_file)
  33
  34         self._start = datetime.strptime(self._config['DATETIME']['start'],
  35                                         '%m/%d/%Y %H:%M:%S')
  36         self._end = datetime.strptime(self._config['DATETIME']['end'],
  37                                         '%m/%d/%Y %H:%M:%S')
  38         self._timestep = timedelta(hours =
  39                                    self._config['DATETIME'].getfloat('hourStep'))
  40         self._dict_features = dict_features
  41         self._full_dict = None
  42         self._dataframe = None
  43         self._datetimes = []
  44         # If features are not provided to the constructor, then we collect
  45         # any existing feature in the dictionary
  46         if features != None:
  47             self._features = features
  48         else:
  49             self._features = set(chain.from_iterable([tuple(u.keys())
  50                                                       for u in [*dict_features.values()]]))
  51         csv_files = Path.cwd() / 'config' / 'features'
  52         self._features = {feat : None for feat in self._features}
  53         for csv_file in listdir(csv_files):
  54             with open(csv_files / csv_file, "r") as f:
  55                 reader = DictReader(f, delimiter=',')
  56                 for row in reader:
  57                     if row['name'] in self._features:
  58                         self._features[row['name']] = row['type']
  59         print(self._features)
  60         exit()
  61
  62
  63     @property
  64     def start(self):
  65         return self._start
  66
  67     @start.setter
  68     def start(self, x):
  69         self._start = x
  70
  71
  72     @property
  73     def end(self):
  74         return self._end
  75
  76     @end.setter
  77     def end(self, x):
  78         self._end = x
  79
  80
  81     @property
  82     def timestep(self):
  83         return self._timestep
  84
  85     @timestep.setter
  86     def timestep(self, x):
  87         self._timestep = x
  88
  89
  90     def _fill_dict(self):
  91         '''
  92         Add datetime keys in the dated feature dictionary that are missing. The
  93         features are then set to np.NaN. Add missing features in existing datetimes
  94         too.
  95         '''
  96         logger.info("Adding missing dates and filling missing features with NaN values")
  97         current = self._start
  98         while current <= self._end:
  99             self._datetimes.append(current)
 100             if current not in self._dict_features:
 101                 self._dict_features[current] = {feature:np.NaN
 102                                                 for feature in self._features}
 103             else:
 104                 null_dict = {feature:np.NaN
 105                              for feature in self._features}
 106                 null_dict.update(self._dict_features[current])
 107                 self._dict_features[current] = null_dict
 108             current += self._timestep
 109         for k in self._dict_features:
 110             null_dict = {feature:np.NaN
 111                          for feature in self._features}
 112             null_dict.update(self._dict_features[k])
 113             self._dict_features[k] = null_dict
 114
 115         self._full_dict = {k: self._dict_features[k]
 116                            for k in sorted(self._dict_features.keys())}
 117
 118
 119
 120     @property
 121     def full_dict(self):
 122         '''
 123         Returns the fully filled dated feature dictionary, ordered by datetimes
 124         '''
 125         if self._full_dict is None:
 126             self._fill_dict()
 127         return self._full_dict
 128
 129
 130
 131     @property
 132     def dataframe(self):
 133         '''
 134         Returns the feature dataframe, after creating it if needed.
 135         '''
 136         if self._dataframe is None:
 137             logger.info("Creating feature dataframe from feature dictionary")
 138             self._dataframe = pd.DataFrame.from_dict(self.full_dict,
 139                                                      orient='index')
 140             logger.info("Filling NaN values in the feature dataframe")
 141
 142             if self._config['PREPROCESSING']['fill_method'] == 'propagate':
 143                 self._dataframe = self._dataframe.fillna(method='ffill')
 144             elif self._config['PREPROCESSING']['fill_method'] == 'linear':
 145                 self._dataframe = self._dataframe.interpolate()
 146             elif self._config['PREPROCESSING']['fill_method'] == 'spline':
 147                 self._dataframe = self._dataframe.interpolate(method='spline',
 148                                                               order=self._config['PREPROCESSING'].getint('order'))
 149
 150             # Uncomment this line to fill NaN values at the beginning of the
 151             # dataframe. This may not be a good idea, especially for features
 152             # that are available only for recent years, e.g., air quality
 153             #self._dataframe = self._dataframe.fillna(method='bfill')
 154
 155             self._dataframe = self._dataframe.drop([k.to_pydatetime()
 156                                                    for k in self._dataframe.T
 157                                                    if k not in self._datetimes])
 158         return self._dataframe
 159
 160
 161     @dataframe.setter
 162     def dataframe(self, df):
 163         self._dataframe = df
 164
 165