1 from configparser import ConfigParser
2 from itertools import chain
3 from logging import getLogger
4 from logging.config import fileConfig
5 from pathlib import Path
6 from sklearn import preprocessing
# Configure the logging system from the ``logging.cfg`` file located in the
# ``config`` directory of the current working directory.
fileConfig(Path.cwd() / 'config' / 'logging.cfg')
17 Generate a pandas dataframe from a dictionary of features per datetime, which
18 respects the starting and ending dates of the study, and its precision (the
19 time step) as passed to the constructor. Missing feature values are completed.
21 - Missing datetimes are added first with np.NaN feature values,
22 - The dataframe is then constructed based on the filled feature dictionary,
23 - NaN values are then filled with last known values.
def __init__(self, config_file=None,
             start=None, end=None, timestep=None,
             dict_features=None, dict_target=None):
    """
    Constructor that defines all needed attributes and collects features.

    Args:
        config_file: parsed project configuration, indexed like a
            ``ConfigParser``. Its ``FEATURES`` section lists the feature
            sources and its ``FEATURE_CONFIG`` section gives, for each
            enabled source, an expression evaluating to the path of that
            source's own configuration file.
        start: first datetime of the study window.
        end: last datetime (inclusive) of the study window.
        timestep: increment separating two consecutive datetimes.
        dict_features: dictionary mapping each datetime to a dictionary
            ``{feature name: value}``.
        dict_target: dictionary whose values form the target series.
    """
    self._config = config_file

    self._start = start
    self._end = end
    self._timestep = timestep
    self._dict_features = dict_features
    self._dict_target = dict_target

    self._full_dict = None
    self._dataframe = None
    # Datetimes of the study window, appended to while the feature
    # dictionary is being filled.
    self._datetimes = []

    # Every feature name met at any datetime. Sorting makes the column
    # order reproducible from one run to the next (set order is not).
    per_datetime = (dict_features or {}).values()
    names = sorted(set(chain.from_iterable(per_datetime)))

    # All type flags start off, so that a feature described by no
    # configuration file is simply absent from the three column lists
    # below instead of raising a KeyError on the 'binary' lookup.
    self._features = {name: {'binary': False,
                             'categorical': False,
                             'numerical': False}
                      for name in names}

    sources = self._config['FEATURES'] if self._config is not None else {}
    for source in sources:
        # NOTE(review): with a ConfigParser the value is a string, so
        # 'False' is truthy here; confirm whether getboolean is intended.
        if not sources[source]:
            continue
        feature_file = self._config['FEATURE_CONFIG'][source]
        config = ConfigParser()
        # NOTE(review): eval() executes the expression stored in the
        # configuration; only safe as long as that file is trusted.
        # ConfigParser.read silently ignores a file it cannot open.
        config.read(eval(feature_file))
        for section in config:
            if not config.has_option(section, 'numerical'):
                continue
            # A section describes every feature whose name starts with
            # "<section>_" (or is exactly the section name).
            for name, flags in self._features.items():
                if name.split('_')[0] == section:
                    for kind in ('binary', 'categorical', 'numerical'):
                        flags[kind] = config[section].getboolean(
                            kind, fallback=False)

    self._binary_columns = [name for name, flags in self._features.items()
                            if flags['binary']]
    self._categorical_columns = [name for name, flags in self._features.items()
                                 if flags['categorical']]
    self._numerical_columns = [name for name, flags in self._features.items()
                               if flags['numerical']]
89 def timestep(self, x):
94 Add datetime keys in the dated feature dictionary that are missing. The
95 features are then set to np.NaN. Add missing features in existing datetimes
98 logger.info("Adding missing dates and filling missing features with NaN values")
100 while current <= self._end:
101 self._datetimes.append(current)
102 if current not in self._dict_features:
103 self._dict_features[current] = {feature: np.NaN
104 for feature in self._features}
106 null_dict = {feature: np.NaN
107 for feature in self._features}
108 null_dict.update(self._dict_features[current])
109 self._dict_features[current] = null_dict
110 current += self._timestep
111 for k in self._dict_features:
112 null_dict = {feature: np.NaN
113 for feature in self._features}
114 null_dict.update(self._dict_features[k])
115 self._dict_features[k] = null_dict
117 self._full_dict = {k: self._dict_features[k]
118 for k in sorted(self._dict_features.keys())}
123 Returns the fully filled dated feature dictionary, ordered by datetimes
125 if self._full_dict is None:
127 return self._full_dict
131 Fill NaN values, either by propagation or by interpolation (linear or splines)
133 logger.info("Filling NaN numerical values in the feature dataframe")
134 # We interpolate (linearly or with splines) only numerical columns
136 if self._config['PREPROCESSING']['fill_method'] == 'propagate':
137 self._dataframe[self._numerical_columns] =\
138 self._dataframe[self._numerical_columns].fillna(method='ffill')
139 elif self._config['PREPROCESSING']['fill_method'] == 'linear':
140 self._dataframe[self._numerical_columns] =\
141 self._dataframe[self._numerical_columns].interpolate()
142 elif self._config['PREPROCESSING']['fill_method'] == 'spline':
143 self._dataframe[self._numerical_columns] =\
144 self._dataframe[self._numerical_columns].interpolate(method='spline',
145 order=self._config['PREPROCESSING'].getint('order'))
147 # For the categorical columns, NaN values are filled by duplicating
148 # the last known value (forward fill method)
149 logger.info("Filling NaN categorical values in the feature dataframe")
150 self._dataframe[self._categorical_columns] =\
151 self._dataframe[self._categorical_columns].fillna(method='ffill')
153 # Uncomment this line to fill NaN values at the beginning of the
154 # dataframe. This may not be a good idea, especially for features
155 # that are available only for recent years, e.g., air quality
156 #self._dataframe = self._dataframe.fillna(method='bfill')
158 # Dropping rows that are not related to our datetime window (start/
160 logger.info("Dropping rows that are not related to our datetime window")
161 dates = tuple((x.year, x.month, x.day, x.hour) for x in self._datetimes)
162 self._dataframe['row_ok'] =\
163 self._dataframe.apply(lambda x: (int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)) in dates, axis=1)
164 self._dataframe = self._dataframe[self._dataframe['row_ok']]
165 self._dataframe = self._dataframe.drop(['row_ok'], axis=1)
166 logger.info("Rows dropped")
def _add_history(self):
    """
    Integrate the previous numbers of interventions as features.

    For each lag ``k`` from 1 to ``nb_lines`` (read from the
    ``HISTORY_KNOWLEDGE`` section of the configuration), a column named
    ``history_<nb_lines - k + 1>`` is added, holding the target values
    shifted down by ``k`` rows and padded with NaN at the top. Each new
    column is registered as numerical. The first ``nb_lines`` rows, which
    hold at least one padding NaN, are finally removed from the dataframe.
    """
    logger.info("Integrating previous nb of interventions as features")
    # NOTE(review): eval() executes the expression stored in the
    # configuration; only safe as long as that file is trusted.
    nb_lines = eval(self._config['HISTORY_KNOWLEDGE']['nb_lines'])
    # Built once: the target series does not change inside the loop.
    targets = list(self._dict_target.values())
    for k in range(1, nb_lines + 1):
        name = 'history_' + str(nb_lines - k + 1)
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
        self._dataframe[name] = [np.nan] * k + targets[:-k]
        self._numerical_columns.append(name)
    self._dataframe = self._dataframe[nb_lines:]
def _standardize(self):
    """
    Standardize the numerical features of the dataframe.

    The numerical columns are passed to ``preprocessing.scale`` and the
    result is written back into the same columns; the other columns are
    left as they are.
    """
    logger.info("Standardizing numerical values in the feature dataframe")
    # Only the numerical columns are rescaled
    columns = self._numerical_columns
    scaled = preprocessing.scale(self._dataframe[columns])
    self._dataframe[columns] = scaled
def _one_hot_encoding(self):
    """
    Replace each categorical feature by its one hot encoding.

    The resulting dataframe holds, in this order, the numerical columns,
    the binary columns, then the indicator columns produced by
    ``pd.get_dummies`` for every categorical column (prefixed by the name
    of the column they come from). It replaces ``self._dataframe``.
    """
    logger.info("One hot encoding for categorical feature")
    encoded = pd.DataFrame()
    # Numerical then binary features are copied over unchanged
    for kept in [*self._numerical_columns, *self._binary_columns]:
        encoded[kept] = self._dataframe[kept]
    # Each categorical feature is expanded into its indicator columns
    for categorical in self._categorical_columns:
        indicators = pd.get_dummies(self._dataframe[categorical],
                                    prefix=categorical)
        for indicator in indicators.columns:
            encoded[indicator] = indicators[indicator]
    self._dataframe = encoded
212 Returns the feature dataframe, after creating it if needed.
214 if self._dataframe is None:
215 logger.info("Creating feature dataframe from feature dictionary")
216 self._dataframe = pd.DataFrame.from_dict(self.full_dict,
218 # Dealing with NaN values
220 # Adding previous (historical) nb_interventions as features
222 # self._dataframe.to_csv('toto.csv')
224 # Normalizing numerical values
226 # Dealing with categorical features
227 self._one_hot_encoding()
228 return self._dataframe
231 def dataframe(self, df):