1 from configparser import ConfigParser
2 from csv import DictReader
3 from datetime import datetime, timedelta
4 from itertools import chain
5 from logging import getLogger
6 from logging.config import fileConfig
8 from pathlib import Path
9 from sklearn import preprocessing
# Configure logging as soon as this module is imported. The configuration
# file is looked up as config/logging.cfg under the current working
# directory, so the outcome depends on where the process was started.
fileConfig((Path.cwd() / 'config') / 'logging.cfg')
19 Generate a pandas dataframe from a dictionary of features per datetime, which
20 respects the starting and ending dates of the study, and its precision (the
21 time step) as passed to the constructor. Missing feature values are completed.
23 - Missing datetimes are added first with np.NaN feature values,
24 - The dataframe is then constructed based on the filled feature dictionary,
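- NaN values are then filled according to the configured fill method:
  numerical features by propagation or interpolation (linear or splines),
  categorical features with the last known value.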
28 def __init__(self, config_file = None, dict_features = None, features = None):
30 Constructor that defines all needed attributes and collects features.
32 self._config = ConfigParser()
33 self._config.read(config_file)
35 self._start = datetime.strptime(self._config['DATETIME']['start'],
37 self._end = datetime.strptime(self._config['DATETIME']['end'],
39 self._timestep = timedelta(hours =
40 self._config['DATETIME'].getfloat('hourStep'))
41 self._dict_features = dict_features
42 self._full_dict = None
43 self._dataframe = None
45 # If features are not provided to the constructor, then we collect
46 # any existing feature in the dictionary
48 self._features = features
50 self._features = set(chain.from_iterable([tuple(u.keys())
51 for u in [*dict_features.values()]]))
52 feature_files = Path.cwd() / 'config' / 'features'
53 self._features = {feat : {'numerical': False} for feat in self._features}
54 for feature_file in listdir(feature_files):
55 if feature_file.endswith('csv'):
56 with open(feature_files / feature_file , "r") as f:
57 reader = DictReader(f, delimiter=',')
58 typed_names = {row['name']: row['type'] for row in reader}
59 for feature in self._features:
60 if feature.split('_')[0] in typed_names:
61 self._features[feature]['type'] = int(typed_names[feature.split('_')[0]])
62 elif feature_file.endswith('cfg'):
63 config = ConfigParser()
64 config.read(feature_files / feature_file)
65 for section in config:
66 if config.has_option(section, 'numerical'):
67 self._features[section]['numerical'] = config[section].getboolean('numerical')
94 def timestep(self, x):
100 Add datetime keys in the dated feature dictionary that are missing. The
101 features are then set to np.NaN. Add missing features in existing datetimes
104 logger.info("Adding missing dates and filling missing features with NaN values")
105 current = self._start
106 while current <= self._end:
107 self._datetimes.append(current)
108 if current not in self._dict_features:
109 self._dict_features[current] = {feature:np.NaN
110 for feature in self._features}
112 null_dict = {feature:np.NaN
113 for feature in self._features}
114 null_dict.update(self._dict_features[current])
115 self._dict_features[current] = null_dict
116 current += self._timestep
117 for k in self._dict_features:
118 null_dict = {feature:np.NaN
119 for feature in self._features}
120 null_dict.update(self._dict_features[k])
121 self._dict_features[k] = null_dict
123 self._full_dict = {k: self._dict_features[k]
124 for k in sorted(self._dict_features.keys())}
131 Returns the fully filled dated feature dictionary, ordered by datetimes
133 if self._full_dict is None:
135 return self._full_dict
140 Fill NaN values, either by propagation or by interpolation (linear or splines)
142 logger.info("Filling NaN numerical values in the feature dataframe")
143 # We interpolate (linearly or with splines) only numerical columns
144 numerical_columns = [k for k in self._features if self._features[k]['type'] == 1
145 or (self._features[k]['type'] == 3 and self._features[k]['numerical'])]
147 if self._config['PREPROCESSING']['fill_method'] == 'propagate':
148 self._dataframe[numerical_columns] =\
149 self._dataframe[numerical_columns].fillna(method='ffill')
150 elif self._config['PREPROCESSING']['fill_method'] == 'linear':
151 self._dataframe[numerical_columns] =\
152 self._dataframe[numerical_columns].interpolate()
153 elif self._config['PREPROCESSING']['fill_method'] == 'spline':
154 self._dataframe[numerical_columns] =\
155 self._dataframe[numerical_columns].interpolate(method='spline',
156 order=self._config['PREPROCESSING'].getint('order'))
158 # For the categorical columns, NaN values are filled by duplicating
159 # the last known value (forward fill method)
160 logger.info("Filling NaN categorical values in the feature dataframe")
161 categorical_columns = [k for k in self._features if self._features[k]['type'] == 2
162 or (self._features[k]['type'] == 3 and not self._features[k]['numerical'])]
163 self._dataframe[categorical_columns] =\
164 self._dataframe[categorical_columns].fillna(method='ffill')
166 # Uncomment this line to fill NaN values at the beginning of the
167 # dataframe. This may not be a good idea, especially for features
168 # that are available only for recent years, e.g., air quality
169 #self._dataframe = self._dataframe.fillna(method='bfill')
171 # Dropping rows that are not related to our datetime window (start/
173 self._dataframe = self._dataframe.drop([k.to_pydatetime()
174 for k in self._dataframe.T
175 if k not in self._datetimes])
def _standardize(self):
    """
    Standardize the numerical columns of the feature dataframe in place.

    Numerical columns are the features whose 'type' is 1, or whose 'type'
    is 3 with a true 'numerical' flag. They are replaced in
    ``self._dataframe`` by the output of ``preprocessing.scale``; every
    other column is left untouched.

    When no feature is numerical the dataframe is left as is, because
    ``preprocessing.scale`` rejects an input that has no column.
    """
    logger.info("Standardizing numerical values in the feature dataframe")
    # We operate only on numerical columns
    numerical_columns = [
        name for name, spec in self._features.items()
        if spec['type'] == 1 or (spec['type'] == 3 and spec['numerical'])
    ]
    if not numerical_columns:
        # Nothing to scale: scikit-learn raises ValueError on a 0-column input.
        return
    self._dataframe[numerical_columns] = preprocessing.scale(
        self._dataframe[numerical_columns])
def _one_hot_encoding(self):
    """
    One hot encode the categorical columns of the feature dataframe.

    Categorical columns are the features whose 'type' is 2, or whose 'type'
    is 3 with a false 'numerical' flag. Each of them is replaced in
    ``self._dataframe`` by its indicator columns, named
    ``<column>_<category>``. All other columns are kept unchanged and come
    first in the resulting dataframe.
    """
    logger.info("One hot encoding for categorical feature")
    categorical_columns = [
        name for name, spec in self._features.items()
        if spec['type'] == 2 or (spec['type'] == 3 and not spec['numerical'])
    ]

    # Full disjunctive coding of the qualitative variables: one group of
    # indicator columns per categorical feature.
    encoded = [pd.get_dummies(self._dataframe[col], prefix=col)
               for col in categorical_columns]
    # Keep the non categorical columns: rebuilding the dataframe from the
    # indicator columns alone would throw away the numerical features (and
    # the index, when there is nothing to encode).
    kept = self._dataframe.drop(columns=categorical_columns)
    self._dataframe = pd.concat([kept, *encoded], axis=1)
    logger.debug("Feature dataframe after one hot encoding:\n%s",
                 self._dataframe.head())
211 Returns the feature dataframe, after creating it if needed.
213 if self._dataframe is None:
214 logger.info("Creating feature dataframe from feature dictionary")
215 self._dataframe = pd.DataFrame.from_dict(self.full_dict,
217 # Dealing with NaN values
219 # Normalizing numerical values
221 # Dealing with categorical features
222 self._one_hot_encoding()
223 return self._dataframe
227 def dataframe(self, df):