# Logger shared by everything in this module. getLogger() is called without a
# name; assuming it is the standard library's logging.getLogger (its import is
# not visible here - TODO confirm), this is the root logger.
logger = getLogger()
class Preprocessing:
+ '''
+ Generate a pandas dataframe from a dictionary of features per datetime, which
+ respects the starting and ending dates of the study, and its precision (the
+ time step) as passed to the constructor. Missing feature values are completed.
+
+ - Missing datetimes are added first with np.NaN feature values,
+ - The dataframe is then constructed based on the filled feature dictionary,
+ - NaN values are then filled with last known values.
+
+ '''
def __init__(self, dict_features,
start, end, timestep,
features = None):
+ '''
+ Constructor that defines all needed attributes and collects features.
+ '''
+ logger.info("Entering NaN values in the feature dataframe")
self._dict_features = dict_features
self._start = start
self._end = end
self._timestep = timestep
+ self._full_dict = None
self._dataframe = None
-
+ self._datetimes = []
+ # If features are not provided to the constructor, then we collect
+ # any existing feature in the dictionary
if features != None:
self._features = features
else:
def _fill_dict(self):
    '''
    Complete the dated feature dictionary over the whole study period.

    Every datetime from self._start to self._end (inclusive), stepping by
    self._timestep, is recorded in self._datetimes and receives an entry in
    self._dict_features. Every entry of the dictionary, including datetimes
    that were already present but are not on that grid, is then completed so
    that each feature of self._features has a value, np.nan standing for the
    missing ones. The result, ordered by datetime, is stored in
    self._full_dict.

    Note that self._dict_features is updated in place.

    Raises:
        ValueError: if adding the time step does not move a datetime
            forward, since the walk from start to end would never finish.
    '''
    logger.info("Adding missing dates and filling missing features with NaN values")

    # Rebuilt from scratch so that running this method a second time does not
    # record every datetime of the grid twice.
    self._datetimes = []
    current = self._start
    while current <= self._end:
        self._datetimes.append(current)
        # Only make sure the key exists here; the features of every entry are
        # completed in a single pass below.
        self._dict_features.setdefault(current, {})
        following = current + self._timestep
        if not following > current:
            raise ValueError("timestep must move datetimes strictly forward")
        current = following

    # np.nan rather than np.NaN: that alias no longer exists in NumPy 2.0.
    # Only values of existing keys are reassigned, which is safe while
    # iterating over the dictionary.
    for key, known in self._dict_features.items():
        completed = dict.fromkeys(self._features, np.nan)
        completed.update(known)
        self._dict_features[key] = completed

    self._full_dict = {key: self._dict_features[key]
                       for key in sorted(self._dict_features)}
+
@property
def full_dict(self):
    '''
    Return the fully filled dated feature dictionary, ordered by datetimes.

    The dictionary is built by self._fill_dict() on first access only; later
    accesses return the same cached object.
    '''
    if self._full_dict is None:
        self._fill_dict()
    return self._full_dict
+
@property
def dataframe(self):
    '''
    Return the feature dataframe, building and caching it on first access.

    The frame is built from self.full_dict with one row per datetime. NaN
    values are forward-filled, then backward-filled. Rows whose datetime is
    not in self._datetimes are dropped only after filling, so their values
    can still be propagated into the rows that are kept.
    '''
    if self._dataframe is None:
        logger.info("Creating feature dataframe from feature dictionary")
        frame = pd.DataFrame.from_dict(self.full_dict, orient='index')

        logger.info("Filling NaN values in the feature dataframe")
        #TODO: add other filling methods like linear interpolation
        # ffill()/bfill() instead of fillna(method=...), which recent pandas
        # versions deprecate.
        frame = frame.ffill().bfill()

        # Hashed lookup keeps the filtering linear in the number of rows.
        expected = set(self._datetimes)
        off_grid = [stamp for stamp in frame.index if stamp not in expected]

        # Cached only once fully built, so a failure above cannot leave a
        # half-processed frame behind.
        self._dataframe = frame.drop(index=off_grid)
    return self._dataframe
@dataframe.setter
def dataframe(self, df):
    '''
    Replace the cached feature dataframe with df.
    '''
    self._dataframe = df
- def fill_na(self):
- self.dataframe = self.dataframe.fillna(method='ffill')
\ No newline at end of file