1 from configparser import ConfigParser
2 from csv import DictReader
3 from datetime import datetime, timedelta
4 from itertools import chain
5 from logging import getLogger
6 from logging.config import fileConfig
8 from pathlib import Path
9 from sklearn import preprocessing
14 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
20 Generate a pandas dataframe from a dictionary of features per datetime, which
21 respects the starting and ending dates of the study, and its precision (the
22 time step) as passed to the constructor. Missing feature values are completed.
24 - Missing datetimes are added first with np.NaN feature values,
25 - The dataframe is then constructed based on the filled feature dictionary,
26 - NaN values are then filled with last known values.
    def __init__(self, config_file=None,
                 dict_features=None, dict_target=None):
        """
        Constructor that defines all needed attributes and collects features.
        """
        # NOTE(review): despite its name, config_file is indexed directly
        # below, so it appears to be an already-parsed ConfigParser-like
        # mapping rather than a path — confirm at the call site.
        self._config = config_file
        # Study window boundaries from the [DATETIME] section.
        # (The strptime format-string arguments are on continuation lines
        # not visible in this excerpt.)
        self._start = datetime.strptime(self._config['DATETIME']['start'],
        self._end = datetime.strptime(self._config['DATETIME']['end'],
        # Sampling precision of the study, in (possibly fractional) hours.
        self._timestep = timedelta(hours=self._config['DATETIME'].getfloat('hourStep'))
        self._dict_features = dict_features
        self._dict_target = dict_target
        # Lazily built caches (see the full_dict / dataframe properties).
        self._full_dict = None
        self._dataframe = None
        # Union of every feature name appearing in any datetime entry.
        # NOTE(review): raises AttributeError if dict_features is left at
        # its None default — confirm callers always pass a dict.
        self._features = set(chain.from_iterable([tuple(u.keys())
                                                  for u in [*dict_features.values()]]))
        #feature_files = Path.cwd() / 'config' / 'features'
        # Every feature starts untyped (neither numerical nor categorical).
        # NOTE(review): 'binary' is not initialised here although it is read
        # unconditionally when building self._binary_columns below — a
        # feature matching no config section will raise KeyError there.
        self._features = {feat: {'numerical': False, 'categorical': False}
                          for feat in self._features}
        for feature in self._config['FEATURES']:
            if self._config['FEATURES'][feature]:
                feature_file = self._config['FEATURE_CONFIG'][feature]
                config = ConfigParser()
                # NOTE(review): eval() on a config value executes arbitrary
                # code from the configuration file — a code-injection risk;
                # a plain string path would be safer.
                config.read(eval(feature_file))
                for section in config:
                    if config.has_option(section, 'numerical'):
                        # NOTE(review): this inner loop variable shadows the
                        # outer 'feature' loop variable — works, but fragile.
                        for feature in self._features:
                            # Features are named '<section>_<suffix>'; tag
                            # every feature belonging to this section.
                            if feature.split('_')[0] == section:
                                self._features[feature]['binary'] = config[section].getboolean('binary')
                                self._features[feature]['categorical'] = config[section].getboolean('categorical')
                                self._features[feature]['numerical'] = config[section].getboolean('numerical')
        # Column groups reused later for NaN filling, scaling and encoding.
        self._binary_columns = [k for k in self._features if self._features[k]['binary']]
        self._categorical_columns = [k for k in self._features if self._features[k]['categorical']]
        self._numerical_columns = [k for k in self._features if self._features[k]['numerical']]
    def timestep(self, x):
        # NOTE(review): property setter for the sampling step; its body is
        # on lines not visible in this excerpt.
        """
        Add datetime keys in the dated feature dictionary that are missing. The
        features are then set to np.NaN. Add missing features in existing datetimes
        """
        logger.info("Adding missing dates and filling missing features with NaN values")
        # Walk the study window in self._timestep increments, recording the
        # expected datetimes and creating an all-NaN feature row for any
        # datetime absent from the collected features.
        current = self._start
        while current <= self._end:
            self._datetimes.append(current)
            if current not in self._dict_features:
                self._dict_features[current] = {feature: np.NaN
                                                for feature in self._features}
            # For datetimes that do exist, make sure every known feature has
            # an entry (defaulting to NaN) while keeping observed values.
            null_dict = {feature: np.NaN
                         for feature in self._features}
            null_dict.update(self._dict_features[current])
            self._dict_features[current] = null_dict
            current += self._timestep
        # Second pass over *all* keys (including datetimes outside the
        # window above) so every entry exposes the full feature set.
        for k in self._dict_features:
            null_dict = {feature: np.NaN
                         for feature in self._features}
            null_dict.update(self._dict_features[k])
            self._dict_features[k] = null_dict
        # Cache the completed dictionary, ordered chronologically.
        self._full_dict = {k: self._dict_features[k]
                           for k in sorted(self._dict_features.keys())}
        """
        Returns the fully filled dated feature dictionary, ordered by datetimes
        """
        if self._full_dict is None:
            # Built lazily on first access; the builder call itself is on a
            # line not visible in this excerpt.
        return self._full_dict
        """
        Fill NaN values, either by propagation or by interpolation (linear or splines)
        """
        logger.info("Filling NaN numerical values in the feature dataframe")
        # We interpolate (linearly or with splines) only numerical columns
        # The strategy is chosen by [PREPROCESSING] fill_method in the config.
        if self._config['PREPROCESSING']['fill_method'] == 'propagate':
            # NOTE(review): fillna(method='ffill') is deprecated in recent
            # pandas; .ffill() is the modern equivalent.
            self._dataframe[self._numerical_columns] =\
                self._dataframe[self._numerical_columns].fillna(method='ffill')
        elif self._config['PREPROCESSING']['fill_method'] == 'linear':
            self._dataframe[self._numerical_columns] =\
                self._dataframe[self._numerical_columns].interpolate()
        elif self._config['PREPROCESSING']['fill_method'] == 'spline':
            # Spline order comes from [PREPROCESSING] order in the config.
            self._dataframe[self._numerical_columns] =\
                self._dataframe[self._numerical_columns].interpolate(method='spline',
                                                                     order=self._config['PREPROCESSING'].getint('order'))

        # For the categorical columns, NaN values are filled by duplicating
        # the last known value (forward fill method)
        logger.info("Filling NaN categorical values in the feature dataframe")
        self._dataframe[self._categorical_columns] =\
            self._dataframe[self._categorical_columns].fillna(method='ffill')

        # Uncomment this line to fill NaN values at the beginning of the
        # dataframe. This may not be a good idea, especially for features
        # that are available only for recent years, e.g., air quality
        #self._dataframe = self._dataframe.fillna(method='bfill')

        # Dropping rows that are not related to our datetime window (start/
        logger.info("Dropping rows that are not related to our datetime window")
        # Keep only rows whose (year, month, day, hour) matches one of the
        # expected datetimes collected while completing the dictionary.
        dates = tuple((x.year, x.month, x.day, x.hour) for x in self._datetimes)
        # NOTE(review): membership tests against a tuple are O(n) per row —
        # a set would be faster. Also assumes the dataframe carries
        # year/month/dayInMonth/hour feature columns — confirm upstream.
        self._dataframe['row_ok'] =\
            self._dataframe.apply(lambda x: (int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)) in dates, axis=1)
        self._dataframe = self._dataframe[self._dataframe['row_ok']]
        self._dataframe = self._dataframe.drop(['row_ok'], axis=1)
        logger.info("Rows dropped")
172 def _add_history(self):
174 Integrating previous nb of interventions as features
176 logger.info("Integrating previous nb of interventions as features")
177 nb_lines = eval(self._config['HISTORY_KNOWLEDGE']['nb_lines'])
178 for k in range(1, nb_lines + 1):
179 name = 'history_' + str(nb_lines - k + 1)
180 self._dataframe[name] = [np.NaN] * k + list(self._dict_target.values())[:-k]
181 self._numerical_columns.append(name)
182 self._dataframe = self._dataframe[nb_lines:]
184 def _standardize(self):
186 Normalizing numerical features
188 logger.info("Standardizing numerical values in the feature dataframe")
189 # We operate only on numerical columns
190 self._dataframe[self._numerical_columns] =\
191 preprocessing.scale(self._dataframe[self._numerical_columns])
193 def _one_hot_encoding(self):
195 Apply a one hot encoding for category features
197 logger.info("One hot encoding for categorical feature")
198 # We store numerical columns
199 df_out = pd.DataFrame()
200 for col in self._numerical_columns:
201 df_out[col] = self._dataframe[col]
202 # Idem for binary features
203 for col in self._binary_columns:
204 df_out[col] = self._dataframe[col]
205 # The one hot encoding
206 for col in self._categorical_columns:
207 pd1 = pd.get_dummies(self._dataframe[col], prefix=col)
208 for col1 in pd1.columns:
209 df_out[col1] = pd1[col1]
210 self._dataframe = df_out
        """
        Returns the feature dataframe, after creating it if needed.
        """
        if self._dataframe is None:
            logger.info("Creating feature dataframe from feature dictionary")
            # The from_dict continuation (presumably orient='index') is on a
            # line not visible in this excerpt.
            self._dataframe = pd.DataFrame.from_dict(self.full_dict,
            # Dealing with NaN values
            # Adding previous (historical) nb_interventions as features
            # Normalizing numerical values
            # Dealing with categorical features
            # NOTE(review): the calls to the NaN-filling, history and
            # standardization steps are on lines missing from this excerpt;
            # only the one-hot encoding call is visible below.
            self._one_hot_encoding()
        return self._dataframe

    def dataframe(self, df):
        # NOTE(review): property setter; its body is on lines not visible
        # in this excerpt.