Adding Ramadan features, and binary category of features.
[predictops.git] / predictops / learn / preprocessing.py
from configparser import ConfigParser
from datetime import datetime, timedelta
from itertools import chain
from logging import getLogger
from logging.config import fileConfig
from pathlib import Path
from sklearn import preprocessing

import numpy as np
import pandas as pd

fileConfig(Path.cwd() / 'config' / 'logging.cfg')
logger = getLogger()


class Preprocessing:
    '''
    Generate a pandas dataframe from a dictionary of features per datetime. The
    dataframe respects the starting and ending dates of the study and its
    precision (the time step), as passed to the constructor. Missing feature
    values are completed as follows:

     - missing datetimes are added first, with np.nan feature values,
     - the dataframe is then built from the filled feature dictionary,
     - remaining NaN values are filled, either by propagating the last known
       values or by interpolation, depending on the configuration.
    '''
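    # A minimal sketch of the expected inputs (illustrative, not part of the
    # original file): both dictionaries are keyed by datetime, and each feature
    # dictionary maps feature names to values. The feature names and values
    # below are hypothetical.
    #
    #   dict_features = {
    #       datetime(2018, 1, 1, 0): {'hour': 0, 'temperature': 3.2},
    #       datetime(2018, 1, 1, 6): {'hour': 6},      # missing features allowed
    #   }
    #   dict_target = {
    #       datetime(2018, 1, 1, 0): 4,                # e.g., nb of interventions
    #       datetime(2018, 1, 1, 6): 2,
    #   }
    #
    # Missing datetimes and features are completed by _fill_dict() and
    # _fill_nan() below.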

    def __init__(self, config_file=None,
                 dict_features=None, dict_target=None):
        '''
        Constructor that defines all needed attributes and collects features.
        '''
        self._config = config_file

        self._start = datetime.strptime(self._config['DATETIME']['start'],
                                        '%m/%d/%Y %H:%M:%S')
        self._end = datetime.strptime(self._config['DATETIME']['end'],
                                      '%m/%d/%Y %H:%M:%S')
        self._timestep = timedelta(hours=self._config['DATETIME'].getfloat('hourStep'))
        self._dict_features = dict_features
        self._dict_target = dict_target

        self._full_dict = None
        self._dataframe = None
        self._datetimes = []

        # Union of the feature names observed over all datetimes
        self._features = set(chain.from_iterable(dict_features.values()))

        #feature_files = Path.cwd() / 'config' / 'features'
        self._features = {feat: {'numerical': False, 'categorical': False,
                                 'binary': False}
                          for feat in self._features}

        for feature in self._config['FEATURES']:
            if self._config['FEATURES'].getboolean(feature):
                # The config value is a Python expression giving the path of
                # the feature configuration file, hence the eval()
                feature_file = self._config['FEATURE_CONFIG'][feature]
                config = ConfigParser()
                config.read(eval(feature_file))
                for section in config:
                    if config.has_option(section, 'numerical'):
                        for feat in self._features:
                            if feat.split('_')[0] == section:
                                self._features[feat]['binary'] = config[section].getboolean('binary')
                                self._features[feat]['categorical'] = config[section].getboolean('categorical')
                                self._features[feat]['numerical'] = config[section].getboolean('numerical')

        self._binary_columns = [k for k in self._features if self._features[k]['binary']]
        self._categorical_columns = [k for k in self._features if self._features[k]['categorical']]
        self._numerical_columns = [k for k in self._features if self._features[k]['numerical']]

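    # A minimal sketch of the configuration the constructor reads (the section
    # and option names are the ones accessed above; the concrete values and
    # the 'meteofrance' feature name are purely illustrative):
    #
    #   [DATETIME]
    #   start = 01/01/2016 00:00:00
    #   end = 12/31/2018 23:00:00
    #   hourStep = 3
    #
    #   [FEATURES]
    #   meteofrance = True
    #
    #   [FEATURE_CONFIG]
    #   meteofrance = Path.cwd() / 'config' / 'features' / 'meteofrance.cfg'
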
    @property
    def start(self):
        return self._start

    @start.setter
    def start(self, x):
        self._start = x

    @property
    def end(self):
        return self._end

    @end.setter
    def end(self, x):
        self._end = x

    @property
    def timestep(self):
        return self._timestep

    @timestep.setter
    def timestep(self, x):
        self._timestep = x

    def _fill_dict(self):
        '''
        Add the datetime keys that are missing from the dated feature
        dictionary, with their features set to np.nan, and add missing
        features to the already existing datetimes.
        '''
        logger.info("Adding missing dates and filling missing features with NaN values")
        current = self._start
        while current <= self._end:
            self._datetimes.append(current)
            if current not in self._dict_features:
                self._dict_features[current] = {feature: np.nan
                                                for feature in self._features}
            else:
                null_dict = {feature: np.nan
                             for feature in self._features}
                null_dict.update(self._dict_features[current])
                self._dict_features[current] = null_dict
            current += self._timestep
        for k in self._dict_features:
            null_dict = {feature: np.nan
                         for feature in self._features}
            null_dict.update(self._dict_features[k])
            self._dict_features[k] = null_dict

        self._full_dict = {k: self._dict_features[k]
                           for k in sorted(self._dict_features.keys())}

    @property
    def full_dict(self):
        '''
        Returns the fully filled dated feature dictionary, ordered by datetime.
        '''
        if self._full_dict is None:
            self._fill_dict()
        return self._full_dict

    def _fill_nan(self):
        '''
        Fill NaN values, either by propagation or by interpolation (linear or splines)
        '''
        logger.info("Filling NaN numerical values in the feature dataframe")
        # Only numerical columns are interpolated (linearly or with splines);
        # the method and, for splines, the order come from the configuration
        if self._config['PREPROCESSING']['fill_method'] == 'propagate':
            self._dataframe[self._numerical_columns] =\
                self._dataframe[self._numerical_columns].ffill()
        elif self._config['PREPROCESSING']['fill_method'] == 'linear':
            self._dataframe[self._numerical_columns] =\
                self._dataframe[self._numerical_columns].interpolate()
        elif self._config['PREPROCESSING']['fill_method'] == 'spline':
            self._dataframe[self._numerical_columns] =\
                self._dataframe[self._numerical_columns].interpolate(method='spline',
                                                                     order=self._config['PREPROCESSING'].getint('order'))

        # For the categorical columns, NaN values are filled by duplicating
        # the last known value (forward fill method)
        logger.info("Filling NaN categorical values in the feature dataframe")
        self._dataframe[self._categorical_columns] =\
            self._dataframe[self._categorical_columns].ffill()

        # Uncomment this line to fill NaN values at the beginning of the
        # dataframe. This may not be a good idea, especially for features
        # that are available only for recent years, e.g., air quality
        #self._dataframe = self._dataframe.bfill()

        # Dropping rows that are not related to our datetime window (start /
        # step / end)
        logger.info("Dropping rows that are not related to our datetime window")
        dates = {(x.year, x.month, x.day, x.hour) for x in self._datetimes}
        self._dataframe['row_ok'] =\
            self._dataframe.apply(lambda x: (int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)) in dates,
                                  axis=1)
        self._dataframe = self._dataframe[self._dataframe['row_ok']]
        self._dataframe = self._dataframe.drop(['row_ok'], axis=1)
        logger.info("Rows dropped")

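    # For reference, the [PREPROCESSING] options read by _fill_nan above (the
    # option names and tested values come from the code; the concrete example
    # values are illustrative):
    #
    #   [PREPROCESSING]
    #   fill_method = spline    ; one of: propagate, linear, spline
    #   order = 3               ; spline order, only read when fill_method = spline
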
    def _add_history(self):
        '''
        Integrate the numbers of interventions at previous time steps as features.
        '''
        logger.info("Integrating previous nb of interventions as features")
        # The config value is evaluated as a Python expression yielding an integer
        nb_lines = eval(self._config['HISTORY_KNOWLEDGE']['nb_lines'])
        for k in range(1, nb_lines + 1):
            name = 'history_' + str(nb_lines - k + 1)
            # Target values shifted k time steps back, padded with NaN at the start
            self._dataframe[name] = [np.nan] * k + list(self._dict_target.values())[:-k]
            self._numerical_columns.append(name)
        # The first nb_lines rows have an incomplete history and are dropped
        self._dataframe = self._dataframe[nb_lines:]

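    # Worked example of the naming scheme in _add_history (derived from the
    # loop above): with nb_lines = 3, the added columns are
    #   history_3 = target value 1 time step back,
    #   history_2 = target value 2 time steps back,
    #   history_1 = target value 3 time steps back,
    # and the first 3 rows, whose history is incomplete, are dropped.
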
    def _standardize(self):
        '''
        Standardize numerical features (zero mean, unit variance).
        '''
        logger.info("Standardizing numerical values in the feature dataframe")
        # We operate only on numerical columns
        self._dataframe[self._numerical_columns] =\
            preprocessing.scale(self._dataframe[self._numerical_columns])

    def _one_hot_encoding(self):
        '''
        Apply one-hot encoding to the categorical features.
        '''
        logger.info("One-hot encoding of the categorical features")
        # We keep the numerical columns as they are
        df_out = pd.DataFrame()
        for col in self._numerical_columns:
            df_out[col] = self._dataframe[col]
        # Same for the binary features
        for col in self._binary_columns:
            df_out[col] = self._dataframe[col]
        # The one-hot encoding itself: each categorical column is replaced by
        # one indicator column per category
        for col in self._categorical_columns:
            pd1 = pd.get_dummies(self._dataframe[col], prefix=col)
            for col1 in pd1.columns:
                df_out[col1] = pd1[col1]
        self._dataframe = df_out

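    # Illustrative example of the encoding above (hypothetical feature name):
    # a categorical column 'weather' with values {'rain', 'snow', 'sun'} is
    # replaced by the indicator columns 'weather_rain', 'weather_snow' and
    # 'weather_sun' produced by pd.get_dummies(..., prefix=col).
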
    @property
    def dataframe(self):
        '''
        Returns the feature dataframe, after creating it if needed.
        '''
        if self._dataframe is None:
            logger.info("Creating feature dataframe from feature dictionary")
            self._dataframe = pd.DataFrame.from_dict(self.full_dict,
                                                     orient='index')
            # Dealing with NaN values
            self._fill_nan()
            # Adding previous (historical) nb of interventions as features
            self._add_history()
            # Standardizing numerical values
            self._standardize()
            # Dealing with categorical features
            self._one_hot_encoding()
        return self._dataframe

    @dataframe.setter
    def dataframe(self, df):
        self._dataframe = df
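

# Usage sketch (illustrative, not part of the original module). It assumes a
# configuration file named 'learn.cfg' with the sections read above
# ([DATETIME], [FEATURES], [FEATURE_CONFIG], [PREPROCESSING],
# [HISTORY_KNOWLEDGE]) and feature/target dictionaries keyed by datetime;
# the file name and the dictionaries are hypothetical.
#
#   from configparser import ConfigParser
#   config = ConfigParser()
#   config.read(Path.cwd() / 'config' / 'learn.cfg')
#
#   process = Preprocessing(config_file=config,
#                           dict_features=dict_features,
#                           dict_target=dict_target)
#   X = process.dataframe   # filled, standardized, one-hot encoded features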