X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/predictops.git/blobdiff_plain/910a056eaa0181df00d21fa836f3c68504051717..d6469a787c80df2c938f21d4ae107b84213e238f:/predictops/learn/preprocessing.py diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py index b58ffac..833e483 100644 --- a/predictops/learn/preprocessing.py +++ b/predictops/learn/preprocessing.py @@ -10,15 +10,32 @@ fileConfig((Path.cwd() / 'config') / 'logging.cfg') logger = getLogger() class Preprocessing: + ''' + Generate a pandas dataframe from a dictionary of features per datetime, which + respects the starting and ending dates of the study, and its precision (the + time step) as passed to the constructor. Missing feature values are completed. + + - Missing datetimes are added first with np.NaN feature values, + - The dataframe is then constructed based on the filled feature dictionary, + - NaN values are then filled with last known values. + + ''' def __init__(self, dict_features, start, end, timestep, features = None): + ''' + Constructor that defines all needed attributes and collects features. + ''' + logger.info("Entering NaN values in the feature dataframe") self._dict_features = dict_features self._start = start self._end = end self._timestep = timestep + self._full_dict = None self._dataframe = None - + self._datetimes = [] + # If features are not provided to the constructor, then we collect + # any existing feature in the dictionary if features != None: self._features = features else: @@ -27,27 +44,62 @@ class Preprocessing: def _fill_dict(self): + ''' + Add datetime keys in the dated feature dictionary that are missing. The + features are then set to np.NaN. Add missing features in existing datetimes + too. + ''' + logger.info("Adding missing dates and filling missing features with NaN values") current = self._start while current <= self._end: + self._datetimes.append(current) if current not in self._dict_features: - self._dict_features[current] = {feature:np.NaN for feature in self._features} + self._dict_features[current] = {feature:np.NaN + for feature in self._features} else: - null_dict = {feature:np.NaN for feature in self._features} + null_dict = {feature:np.NaN + for feature in self._features} null_dict.update(self._dict_features[current]) self._dict_features[current] = null_dict current += self._timestep + for k in self._dict_features: + null_dict = {feature:np.NaN + for feature in self._features} + null_dict.update(self._dict_features[k]) + self._dict_features[k] = null_dict + + self._full_dict = {k: self._dict_features[k] + for k in sorted(self._dict_features.keys())} + @property def full_dict(self): - self._fill_dict() - return {k: self._dict_features[k] for k in sorted(self._dict_features.keys())} + ''' + Returns the fully filled dated feature dictionary, ordered by datetimes + ''' + if self._full_dict is None: + self._fill_dict() + return self._full_dict + @property def dataframe(self): + ''' + Returns the feature dataframe, after creating it if needed. + ''' if self._dataframe is None: - self._dataframe = pd.DataFrame.from_dict(self.full_dict, orient='index') + logger.info("Creating feature dataframe from feature dictionary") + self._dataframe = pd.DataFrame.from_dict(self.full_dict, + orient='index') + logger.info("Filling NaN values in the feature dataframe") + #TODO: add other filling methods like linear interpolation + self._dataframe = self._dataframe.fillna(method='ffill') + self._dataframe = self._dataframe.fillna(method='bfill') + self._dataframe = self._dataframe.drop([k.to_pydatetime() + for k in self._dataframe.T + if k not in self._datetimes]) return self._dataframe @dataframe.setter @@ -55,5 +107,3 @@ class Preprocessing: self._dataframe = df - def fill_na(self): - self.dataframe = self.dataframe.fillna(method='ffill') \ No newline at end of file