]> AND Private Git Repository - predictops.git/blob - predictops/learn/preprocessing.py
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
5400d1d39f1135ce5e2abcfec2541201cf5d8ed6
[predictops.git] / predictops / learn / preprocessing.py
1 from configparser import ConfigParser
2 from datetime import datetime, timedelta
3 from itertools import chain
4 from logging import getLogger
5 from logging.config import fileConfig
6 from pathlib import Path
7
8 import numpy as np
9 import pandas as pd
10
# Configure logging from the 'config/logging.cfg' file located under the
# current working directory, then use the root logger for this module.
fileConfig(Path.cwd() / 'config' / 'logging.cfg')
logger = getLogger()
13
class Preprocessing:
    '''
    Generate a pandas dataframe from a dictionary of features per datetime, which
    respects the starting and ending dates of the study, and its precision (the
    time step) as read from the configuration file. Missing feature values are
    completed.

     - Missing datetimes are added first with np.nan feature values,
     - The dataframe is then constructed based on the filled feature dictionary,
     - NaN values are then filled according to the configured 'fill_method'
       ('propagate', 'linear' or 'spline'), followed by a backward fill for the
       leading values that have no predecessor,
     - Rows whose datetime is not on the [start, end] grid are finally removed.
    '''

    # Format of the 'start' and 'end' entries of the DATETIME config section.
    _DATE_FORMAT = '%m/%d/%Y %H:%M:%S'

    def __init__(self, config_file=None, dict_features=None, features=None):
        '''
        Constructor that defines all needed attributes and collects features.

        Parameters:
          config_file: path of the configuration file. Although it defaults to
              None, the DATETIME section ('start', 'end', 'hourStep') is read
              from it immediately, so a valid file is required.
          dict_features: dictionary {datetime: {feature name: value}}. It is
              completed in place when the full dictionary is built. None is
              treated as an empty dictionary.
          features: iterable of the feature names to consider. When None, every
              feature name found in dict_features is used.
        '''
        self._config = ConfigParser()
        self._config.read(config_file)

        datetime_cfg = self._config['DATETIME']
        self._start = datetime.strptime(datetime_cfg['start'], self._DATE_FORMAT)
        self._end = datetime.strptime(datetime_cfg['end'], self._DATE_FORMAT)
        self._timestep = timedelta(hours=datetime_cfg.getfloat('hourStep'))

        # A missing dictionary behaves like an empty one: every datetime of the
        # grid will simply be created with NaN features.
        self._dict_features = {} if dict_features is None else dict_features
        self._full_dict = None
        self._dataframe = None
        self._datetimes = []
        # If features are not provided to the constructor, then we collect
        # any existing feature in the dictionary
        if features is not None:
            self._features = features
        else:
            # Iterating over each per-datetime dictionary yields its keys.
            self._features = set(chain.from_iterable(self._dict_features.values()))


    # NOTE: the three setters below only store the new value; a full dictionary
    # or dataframe that has already been built is not recomputed.
    @property
    def start(self):
        '''First datetime of the study.'''
        return self._start

    @start.setter
    def start(self, x):
        self._start = x


    @property
    def end(self):
        '''Last datetime (inclusive) of the study.'''
        return self._end

    @end.setter
    def end(self, x):
        self._end = x


    @property
    def timestep(self):
        '''Time step (a timedelta) between two consecutive datetimes.'''
        return self._timestep

    @timestep.setter
    def timestep(self, x):
        self._timestep = x


    def _fill_dict(self):
        '''
        Add datetime keys in the dated feature dictionary that are missing. The
        features are then set to np.nan. Add missing features in existing datetimes
        too.

        The feature dictionary received by the constructor is updated in place;
        the result, ordered by datetime, is stored in self._full_dict, and the
        datetimes of the grid in self._datetimes.

        Raises ValueError if the time step is not strictly positive.
        '''
        logger.info("Adding missing dates and filling missing features with NaN values")
        if self._timestep <= timedelta(0):
            # A null or negative step would never reach the end date.
            raise ValueError("The time step must be strictly positive")

        nan_row = {feature: np.nan for feature in self._features}

        # Rebuild the grid from scratch so that calling this method again does
        # not duplicate its datetimes.
        datetimes = []
        current = self._start
        while current <= self._end:
            datetimes.append(current)
            self._dict_features.setdefault(current, {})
            current += self._timestep
        self._datetimes = datetimes

        # Complete every entry (on the grid or not) with the missing features;
        # known values take precedence over the NaN placeholders.
        for key in list(self._dict_features):
            self._dict_features[key] = {**nan_row, **self._dict_features[key]}

        self._full_dict = {key: self._dict_features[key]
                           for key in sorted(self._dict_features)}


    @property
    def full_dict(self):
        '''
        Returns the fully filled dated feature dictionary, ordered by datetimes
        '''
        if self._full_dict is None:
            self._fill_dict()
        return self._full_dict


    @property
    def dataframe(self):
        '''
        Returns the feature dataframe, after creating it if needed.

        The dataframe is indexed by datetime, with one column per feature. NaN
        values are filled as requested by the 'fill_method' entry of the
        PREPROCESSING config section, then backward filled. Any other value of
        'fill_method' results in the backward fill only.
        '''
        if self._dataframe is None:
            logger.info("Creating feature dataframe from feature dictionary")
            frame = pd.DataFrame.from_dict(self.full_dict, orient='index')
            logger.info("Filling NaN values in the feature dataframe")

            preprocessing_cfg = self._config['PREPROCESSING']
            fill_method = preprocessing_cfg['fill_method']
            if fill_method == 'propagate':
                frame = frame.ffill()
            elif fill_method == 'linear':
                frame = frame.interpolate()
            elif fill_method == 'spline':
                frame = frame.interpolate(method='spline',
                                          order=preprocessing_cfg.getint('order'))
            # Leading NaN values have no previous value: use the next known one.
            frame = frame.bfill()

            # Datetimes outside the grid were only useful to fill the gaps.
            frame = frame.loc[frame.index.isin(self._datetimes)]
            # Cache the result only once it is complete.
            self._dataframe = frame
        return self._dataframe


    @dataframe.setter
    def dataframe(self, df):
        self._dataframe = df
148
149