1 from configparser import ConfigParser
2 from datetime import datetime, timedelta
3 from itertools import chain
4 from logging import getLogger
5 from logging.config import fileConfig
6 from pathlib import Path
11 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
16 Generate a pandas dataframe from a dictionary of features per datetime, which
17 respects the starting and ending dates of the study, and its precision (the
18 time step) as passed to the constructor. Missing feature values are completed.
20 - Missing datetimes are added first with np.NaN feature values,
21 - The dataframe is then constructed based on the filled feature dictionary,
22 - NaN values are then filled according to the configured fill method (propagation of the last known value, linear or spline interpolation).
25 def __init__(self, config_file = None, dict_features = None, features = None):
27 Constructor that defines all needed attributes and collects features.
# Parse the configuration file.
# NOTE(review): config_file defaults to None, yet the lookups below need a
# [DATETIME] section, so a real configuration path is effectively required.
29 self._config = ConfigParser()
30 self._config.read(config_file)
# Study boundaries, parsed from the 'start' and 'end' options of [DATETIME].
# NOTE(review): the strptime format arguments sit on continuation lines that
# are not visible in this excerpt.
32 self._start = datetime.strptime(self._config['DATETIME']['start'],
34 self._end = datetime.strptime(self._config['DATETIME']['end'],
# Step between two consecutive datetimes: the float option 'hourStep',
# interpreted as a number of hours.
36 self._timestep = timedelta(hours =
37 self._config['DATETIME'].getfloat('hourStep'))
# Keep a reference to the caller's dictionary (no copy is made here). The
# filled dictionary and the dataframe both start as None.
38 self._dict_features = dict_features
39 self._full_dict = None
40 self._dataframe = None
42 # If features are not provided to the constructor, then we collect
43 # any existing feature in the dictionary
# NOTE(review): the condition selecting between the two assignments of
# self._features below is on lines not visible in this excerpt.
45 self._features = features
# Union of the keys of every per-datetime feature dictionary.
47 self._features = set(chain.from_iterable([tuple(u.keys())
48 for u in [*dict_features.values()]]))
74 def timestep(self, x):
80 Add the datetime keys that are missing from the dated feature dictionary. Their
81 features are then set to np.NaN. Add missing features in existing datetimes
84 logger.info("Adding missing dates and filling missing features with NaN values")
# Visit every datetime up to and including self._end, advancing by
# self._timestep on each iteration.
# NOTE(review): the initialisation of `current` is on a line not visible in
# this excerpt.
86 while current <= self._end:
# Record each visited datetime in self._datetimes.
87 self._datetimes.append(current)
# Datetime absent from the dictionary: create an entry with every known
# feature set to NaN. This writes into self._dict_features in place.
# NOTE(review): the `np.NaN` alias was removed in NumPy 2.0; `np.nan` is the
# supported spelling.
88 if current not in self._dict_features:
89 self._dict_features[current] = {feature:np.NaN
90 for feature in self._features}
# Build an all-NaN template and overlay the values already known for this
# datetime, so that features missing from the entry end up as NaN.
92 null_dict = {feature:np.NaN
93 for feature in self._features}
94 null_dict.update(self._dict_features[current])
95 self._dict_features[current] = null_dict
96 current += self._timestep
# Apply the same NaN completion to every key of the dictionary, whether or
# not it was visited by the loop above.
97 for k in self._dict_features:
98 null_dict = {feature:np.NaN
99 for feature in self._features}
100 null_dict.update(self._dict_features[k])
101 self._dict_features[k] = null_dict
# Rebuild the dictionary with its keys inserted in ascending order.
103 self._full_dict = {k: self._dict_features[k]
104 for k in sorted(self._dict_features.keys())}
111 Returns the fully filled dated feature dictionary, ordered by datetimes
# The filled dictionary is cached in self._full_dict.
# NOTE(review): the statement executed when the cache is still None is on a
# line not visible in this excerpt.
113 if self._full_dict is None:
115 return self._full_dict
122 Returns the feature dataframe, after creating it if needed.
# The dataframe is cached in self._dataframe once it has been built.
124 if self._dataframe is None:
125 logger.info("Creating feature dataframe from feature dictionary")
# NOTE(review): the remaining arguments of from_dict are on a continuation
# line not visible in this excerpt.
126 self._dataframe = pd.DataFrame.from_dict(self.full_dict,
128 logger.info("Filling NaN values in the feature dataframe")
# Fill strategy chosen by the 'fill_method' option of [PREPROCESSING]:
#   'propagate' -> forward fill,
#   'linear'    -> DataFrame.interpolate() with its default arguments,
#   'spline'    -> spline interpolation, degree read from the integer
#                  option 'order'.
# No other value is handled in the visible lines.
# NOTE(review): fillna(method=...) is deprecated since pandas 2.1; the
# dedicated DataFrame.ffill() / DataFrame.bfill() methods replace it.
130 if self._config['PREPROCESSING']['fill_method'] == 'propagate':
131 self._dataframe = self._dataframe.fillna(method='ffill')
132 elif self._config['PREPROCESSING']['fill_method'] == 'linear':
133 self._dataframe = self._dataframe.interpolate()
134 elif self._config['PREPROCESSING']['fill_method'] == 'spline':
135 self._dataframe = self._dataframe.interpolate(method='spline',
136 order=self._config['PREPROCESSING'].getint('order'))
# Back-fill: remaining NaN values take the next valid observation.
137 self._dataframe = self._dataframe.fillna(method='bfill')
# Drop every row whose label is not one of the datetimes recorded in
# self._datetimes. Iterating over the transposed frame yields its column
# labels, i.e. the row labels of self._dataframe.
# NOTE(review): `k not in self._datetimes` scans a list for each row; a set
# built once would make each membership test constant time.
139 self._dataframe = self._dataframe.drop([k.to_pydatetime()
140 for k in self._dataframe.T
141 if k not in self._datetimes])
142 return self._dataframe
146 def dataframe(self, df):