X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/predictops.git/blobdiff_plain/288baa6ff06c1b815ec24d164770acc93ac80499..refs/heads/master:/predictops/learn/preprocessing.py diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py index 51ecb4e..55cffbd 100644 --- a/predictops/learn/preprocessing.py +++ b/predictops/learn/preprocessing.py @@ -1,10 +1,7 @@ from configparser import ConfigParser -from csv import DictReader -from datetime import datetime, timedelta from itertools import chain from logging import getLogger from logging.config import fileConfig -from os import listdir from pathlib import Path from sklearn import preprocessing @@ -14,6 +11,7 @@ import pandas as pd fileConfig((Path.cwd() / 'config') / 'logging.cfg') logger = getLogger() + class Preprocessing: ''' Generate a pandas dataframe from a dictionary of features per datetime, which @@ -25,19 +23,17 @@ class Preprocessing: - NaN values are then filled with last known values. ''' - def __init__(self, config_file = None, - dict_features = None, dict_target = None): + def __init__(self, config_file=None, + start=None, end=None, timestep=None, + dict_features=None, dict_target=None): ''' Constructor that defines all needed attributes and collects features. ''' self._config = config_file - self._start = datetime.strptime(self._config['DATETIME']['start'], - '%m/%d/%Y %H:%M:%S') - self._end = datetime.strptime(self._config['DATETIME']['end'], - '%m/%d/%Y %H:%M:%S') - self._timestep = timedelta(hours = - self._config['DATETIME'].getfloat('hourStep')) + self._start = start + self._end = end + self._timestep = timestep self._dict_features = dict_features self._dict_target = dict_target @@ -46,32 +42,28 @@ class Preprocessing: self._datetimes = [] self._features = set(chain.from_iterable([tuple(u.keys()) - for u in [*dict_features.values()]])) - - feature_files = Path.cwd() / 'config' / 'features' - self._features = {feat : {'numerical': False} for feat in self._features} - for feature_file in listdir(feature_files): - if feature_file.endswith('csv'): - with open(feature_files / feature_file , "r") as f: - reader = DictReader(f, delimiter=',') - typed_names = {row['name']: row['type'] for row in reader} - for feature in self._features: - if feature.split('_')[0] in typed_names: - self._features[feature]['type'] = int(typed_names[feature.split('_')[0]]) - elif feature_file.endswith('cfg'): + for u in [*dict_features.values()]])) + + #feature_files = Path.cwd() / 'config' / 'features' + self._features = {feat: {'numerical': False, 'categorical': False} + for feat in self._features} + + for feature in self._config['FEATURES']: + if self._config['FEATURES'][feature]: + feature_file = self._config['FEATURE_CONFIG'][feature] config = ConfigParser() - config.read(feature_files / feature_file) + config.read(eval(feature_file)) for section in config: if config.has_option(section, 'numerical'): - self._features[section]['numerical'] = config[section].getboolean('numerical') - - self._numerical_columns = [k for k in self._features if self._features[k]['type'] == 1 - or (self._features[k]['type'] == 3 and self._features[k]['numerical'])] - - self._categorical_columns = [k for k in self._features if self._features[k]['type'] == 2 - or (self._features[k]['type'] == 3 and not self._features[k]['numerical'])] - + for feature in self._features: + if feature.split('_')[0] == section: + self._features[feature]['binary'] = config[section].getboolean('binary') + self._features[feature]['categorical'] = config[section].getboolean('categorical') + self._features[feature]['numerical'] = config[section].getboolean('numerical') + self._binary_columns = [k for k in self._features if self._features[k]['binary']] + self._categorical_columns = [k for k in self._features if self._features[k]['categorical']] + self._numerical_columns = [k for k in self._features if self._features[k]['numerical']] @property def start(self): @@ -81,7 +73,6 @@ class Preprocessing: def start(self, x): self._start = x - @property def end(self): return self._end @@ -90,7 +81,6 @@ class Preprocessing: def end(self, x): self._end = x - @property def timestep(self): return self._timestep @@ -99,7 +89,6 @@ class Preprocessing: def timestep(self, x): self._timestep = x - def _fill_dict(self): ''' Add datetime keys in the dated feature dictionary that are missing. The @@ -111,16 +100,16 @@ class Preprocessing: while current <= self._end: self._datetimes.append(current) if current not in self._dict_features: - self._dict_features[current] = {feature:np.NaN + self._dict_features[current] = {feature: np.NaN for feature in self._features} else: - null_dict = {feature:np.NaN + null_dict = {feature: np.NaN for feature in self._features} null_dict.update(self._dict_features[current]) self._dict_features[current] = null_dict current += self._timestep for k in self._dict_features: - null_dict = {feature:np.NaN + null_dict = {feature: np.NaN for feature in self._features} null_dict.update(self._dict_features[k]) self._dict_features[k] = null_dict @@ -128,8 +117,6 @@ class Preprocessing: self._full_dict = {k: self._dict_features[k] for k in sorted(self._dict_features.keys())} - - @property def full_dict(self): ''' @@ -139,7 +126,6 @@ class Preprocessing: self._fill_dict() return self._full_dict - def _fill_nan(self): ''' Fill NaN values, either by propagation or by interpolation (linear or splines) @@ -156,7 +142,7 @@ class Preprocessing: elif self._config['PREPROCESSING']['fill_method'] == 'spline': self._dataframe[self._numerical_columns] =\ self._dataframe[self._numerical_columns].interpolate(method='spline', - order=self._config['PREPROCESSING'].getint('order')) + order=self._config['PREPROCESSING'].getint('order')) # For the categorical columns, NaN values are filled by duplicating # the last known value (forward fill method) @@ -171,25 +157,26 @@ class Preprocessing: # Dropping rows that are not related to our datetime window (start/ # step / end) - self._dataframe = self._dataframe.drop([k.to_pydatetime() - for k in self._dataframe.T - if k not in self._datetimes]) - + logger.info("Dropping rows that are not related to our datetime window") + dates = tuple((x.year, x.month, x.day, x.hour) for x in self._datetimes) + self._dataframe['row_ok'] =\ + self._dataframe.apply(lambda x: (int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)) in dates, axis=1) + self._dataframe = self._dataframe[self._dataframe['row_ok']] + self._dataframe = self._dataframe.drop(['row_ok'], axis=1) + logger.info("Rows dropped") def _add_history(self): ''' Integrating previous nb of interventions as features ''' logger.info("Integrating previous nb of interventions as features") - nb_lines = self._config['HISTORY_KNOWLEDGE'].getint('nb_lines') - for k in range(1,nb_lines+1): - name = 'history_'+str(nb_lines-k+1) - self._dataframe[name] = [np.NaN]*k + list(self._dict_target.values())[:-k] + nb_lines = eval(self._config['HISTORY_KNOWLEDGE']['nb_lines']) + for k in range(1, nb_lines + 1): + name = 'history_' + str(nb_lines - k + 1) + self._dataframe[name] = [np.NaN] * k + list(self._dict_target.values())[:-k] self._numerical_columns.append(name) self._dataframe = self._dataframe[nb_lines:] - - def _standardize(self): ''' Normalizing numerical features @@ -199,26 +186,26 @@ class Preprocessing: self._dataframe[self._numerical_columns] =\ preprocessing.scale(self._dataframe[self._numerical_columns]) - - def _one_hot_encoding(self): ''' Apply a one hot encoding for category features ''' logger.info("One hot encoding for categorical feature") - # We store numerical columns + df_out = pd.DataFrame() - for col in self._numerical_columns: + for col in self._numerical_columns: + df_out[col] = self._dataframe[col] + # Idem for binary features + for col in self._binary_columns: df_out[col] = self._dataframe[col] # The one hot encoding for col in self._categorical_columns: - pd1 = pd.get_dummies(self._dataframe[col],prefix=col) + pd1 = pd.get_dummies(self._dataframe[col], prefix=col) for col1 in pd1.columns: df_out[col1] = pd1[col1] self._dataframe = df_out - @property def dataframe(self): ''' @@ -232,13 +219,14 @@ class Preprocessing: self._fill_nan() # Adding previous (historical) nb_interventions as features self._add_history() + # self._dataframe.to_csv('toto.csv') + # exit() # Normalizing numerical values self._standardize() # Dealing with categorical features self._one_hot_encoding() return self._dataframe - @dataframe.setter def dataframe(self, df): self._dataframe = df