1 from configparser import ConfigParser
2 from csv import DictReader
3 from datetime import datetime, timedelta
4 from itertools import chain
5 from logging import getLogger
6 from logging.config import fileConfig
8 from pathlib import Path
# Configure the logging module from the 'logging.cfg' file located in the
# 'config' directory of the current working directory.
13 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
18 Generate a pandas dataframe from a dictionary of features per datetime, which
19 respects the starting and ending dates of the study, and its precision (the
20 time step), as read from the configuration file passed to the constructor.
20 Missing feature values are filled in.
22 - Missing datetimes are added first, with np.NaN feature values,
23 - The dataframe is then constructed from the filled feature dictionary,
24 - NaN values are then filled using the configured fill method (propagate,
24   linear or spline).
# Constructor: reads the configuration file, stores the study period and the
# time step, and builds the mapping feature name -> feature type.
#
# Parameters (all default to None):
#   config_file   -- handed to ConfigParser.read().
#   dict_features -- stored unchanged in self._dict_features; its values must
#                    expose .keys() (presumably a mapping
#                    datetime -> {feature: value} -- TODO confirm with callers).
#   features      -- assigned to self._features.
#
# NOTE(review): this listing is missing several source lines (the embedded
# numbers skip 28, 30, 33, 35, 37, 43, 46, 48 and 56), so some statements
# below are truncated and some branch/loop headers are not visible. The
# comments describe only what can be seen here.
27 def __init__(self, config_file = None, dict_features = None, features = None):
29 Constructor that defines all needed attributes and collects features.
31 self._config = ConfigParser()
32 self._config.read(config_file)
# Study boundaries, parsed from the 'start' and 'end' entries of the DATETIME
# section (the format argument of strptime is not visible in this listing).
34 self._start = datetime.strptime(self._config['DATETIME']['start'],
36 self._end = datetime.strptime(self._config['DATETIME']['end'],
# Step between two consecutive datetimes: the 'hourStep' entry, read as a
# float number of hours.
38 self._timestep = timedelta(hours =
39 self._config['DATETIME'].getfloat('hourStep'))
40 self._dict_features = dict_features
# Both results start as None.
41 self._full_dict = None
42 self._dataframe = None
44 # If features are not provided to the constructor, then we collect
45 # any existing feature in the dictionary
47 self._features = features
# Set of every key found in the dictionaries stored as values of dict_features.
49 self._features = set(chain.from_iterable([tuple(u.keys())
50 for u in [*dict_features.values()]]))
# Directory 'config/features' under the current working directory.
51 csv_files = Path.cwd() / 'config' / 'features'
# Turn the feature collection into a dict with every feature mapped to None.
52 self._features = {feat : None for feat in self._features}
# For each file of that directory, read it as comma-separated values; a row
# whose 'name' column is a known feature provides that feature's 'type'.
# NOTE(review): `listdir` is not among the imports visible in this listing
# (embedded line 7 is missing) -- confirm it is imported.
# NOTE(review): the csv module documentation recommends opening files with
# newline='' -- confirm whether that matters for these files.
53 for csv_file in listdir(csv_files):
54 with open(csv_files / csv_file, "r") as f:
55 reader = DictReader(f, delimiter=',')
57 if row['name'] in self._features:
58 self._features[row['name']] = row['type']
86 def timestep(self, x):
# Method body (its `def` line is not part of this listing): completes
# self._dict_features so that every visited datetime has an entry and every
# entry holds every known feature, then stores a key-sorted version of it in
# self._full_dict.
92 Add datetime keys in the dated feature dictionary that are missing. The
93 features are then set to np.NaN. Add missing features in existing datetimes
96 logger.info("Adding missing dates and filling missing features with NaN values")
# Walk up to self._end (inclusive) in steps of self._timestep, recording each
# visited datetime in self._datetimes.
# NOTE(review): the initialisation of `current` (embedded line 97) and of
# self._datetimes is not visible in this listing.
# NOTE(review): `np.NaN` is an alias that NumPy 2.0 removed; `np.nan` is the
# spelling available in every version -- confirm the supported NumPy range.
98 while current <= self._end:
99 self._datetimes.append(current)
100 if current not in self._dict_features:
# Unknown datetime: create an entry with every feature set to NaN.
101 self._dict_features[current] = {feature:np.NaN
102 for feature in self._features}
# Start from an all-NaN entry and overlay the values stored for this
# datetime, so that features absent from it end up as NaN.
104 null_dict = {feature:np.NaN
105 for feature in self._features}
106 null_dict.update(self._dict_features[current])
107 self._dict_features[current] = null_dict
108 current += self._timestep
# Apply the same NaN completion to every key of the dictionary, including
# any key the loop above did not visit.
109 for k in self._dict_features:
110 null_dict = {feature:np.NaN
111 for feature in self._features}
112 null_dict.update(self._dict_features[k])
113 self._dict_features[k] = null_dict
# Rebuild the dictionary with its keys inserted in sorted order.
115 self._full_dict = {k: self._dict_features[k]
116 for k in sorted(self._dict_features.keys())}
# Accessor body (its `def` line is not part of this listing): returns
# self._full_dict.
# NOTE(review): the statement executed when self._full_dict is still None
# (embedded line 126) is missing from this listing; presumably it triggers
# the filling step -- confirm against the full source.
123 Returns the fully filled dated feature dictionary, ordered by datetimes
125 if self._full_dict is None:
127 return self._full_dict
# Accessor body (its `def` line is not part of this listing): creates
# self._dataframe from self.full_dict when it is still None, fills NaN
# values according to the 'fill_method' entry of the PREPROCESSING section,
# drops the rows whose label is not in self._datetimes, and returns the
# dataframe.
# NOTE(review): indentation is lost in this listing, so it cannot be told
# from here whether the fill and drop steps run only when the dataframe is
# first created.
134 Returns the feature dataframe, after creating it if needed.
136 if self._dataframe is None:
137 logger.info("Creating feature dataframe from feature dictionary")
# The remaining arguments of from_dict (embedded line 139) are not visible.
138 self._dataframe = pd.DataFrame.from_dict(self.full_dict,
140 logger.info("Filling NaN values in the feature dataframe")
# fill_method dispatch:
#   'propagate' -> forward fill,
#   'linear'    -> interpolate() with its default arguments,
#   'spline'    -> spline interpolation, order taken from the 'order' entry.
# NOTE(review): no visible branch handles any other fill_method value, in
# which case NaN values would be left untouched -- confirm this is intended.
# NOTE(review): fillna(method=...) is deprecated since pandas 2.1 in favour
# of DataFrame.ffill() / DataFrame.bfill() -- confirm the pinned version.
142 if self._config['PREPROCESSING']['fill_method'] == 'propagate':
143 self._dataframe = self._dataframe.fillna(method='ffill')
144 elif self._config['PREPROCESSING']['fill_method'] == 'linear':
145 self._dataframe = self._dataframe.interpolate()
146 elif self._config['PREPROCESSING']['fill_method'] == 'spline':
147 self._dataframe = self._dataframe.interpolate(method='spline',
148 order=self._config['PREPROCESSING'].getint('order'))
150 # Uncomment this line to fill NaN values at the beginning of the
151 # dataframe. This may not be a good idea, especially for features
152 # that are available only for recent years, e.g., air quality
153 #self._dataframe = self._dataframe.fillna(method='bfill')
# Iterating over the transposed dataframe yields the row labels; every label
# that is not one of the datetimes recorded in self._datetimes is dropped.
155 self._dataframe = self._dataframe.drop([k.to_pydatetime()
156 for k in self._dataframe.T
157 if k not in self._datetimes])
158 return self._dataframe
162 def dataframe(self, df):