From 83fdad7cdd97734f036d464acceebaf3b0f0a44b Mon Sep 17 00:00:00 2001 From: Christophe Guyeux Date: Sun, 23 Feb 2020 09:39:21 +0100 Subject: [PATCH 1/1] Adding ramadan features, and binary category of feat. --- config/features/feature_ephemeris.cfg | 7 ++ config/features/feature_holidays.cfg | 12 ++- config/features/feature_meteo.cfg | 13 +++ config/features/feature_ramadan.cfg | 14 +++ config/learn.cfg | 19 ++-- config/learners/lightgbm.cfg | 10 ++ config/learners/xgboost.cfg | 2 +- main.py | 13 +-- predictops/engine.py | 132 ++++++++++++++++++-------- predictops/learn/learning.py | 118 +++++++++++++++++++---- predictops/learn/preprocessing.py | 65 ++++++------- predictops/source/holidays.py | 30 ++++-- predictops/source/meteofrance.py | 2 +- predictops/source/ramadan.py | 71 ++++++++++++++ predictops/target/target.py | 68 ++++++------- requirements.txt | 9 ++ 16 files changed, 424 insertions(+), 161 deletions(-) create mode 100644 config/features/feature_ramadan.cfg create mode 100644 config/learners/lightgbm.cfg create mode 100644 predictops/source/ramadan.py diff --git a/config/features/feature_ephemeris.cfg b/config/features/feature_ephemeris.cfg index decc787..3ed31c7 100644 --- a/config/features/feature_ephemeris.cfg +++ b/config/features/feature_ephemeris.cfg @@ -1,27 +1,34 @@ [hour] +binary = False categorical = True numerical = False [dayInWeek] +binary = False categorical = True numerical = False [dayInMonth] +binary = False categorical = True numerical = False [dayInYear] +binary = False categorical = True numerical = False [weekInYear] +binary = False categorical = True numerical = False [month] +binary = False categorical = False numerical = True [year] +binary = False categorical = False numerical = True \ No newline at end of file diff --git a/config/features/feature_holidays.cfg b/config/features/feature_holidays.cfg index c3b3063..ccccbd7 100644 --- a/config/features/feature_holidays.cfg +++ b/config/features/feature_holidays.cfg @@ -2,17 +2,21 @@ name = Besançon [bankHolidays] -categorical = True +binary = True +categorical = False numerical = False [bankHolidaysEve] -categorical = True +binary = True +categorical = False numerical = False [holidays] -categorical = True +binary = True +categorical = False numerical = False [holidaysEve] -categorical = True +binary = True +categorical = False numerical = False diff --git a/config/features/feature_meteo.cfg b/config/features/feature_meteo.cfg index 04f6c62..5b694e5 100644 --- a/config/features/feature_meteo.cfg +++ b/config/features/feature_meteo.cfg @@ -11,65 +11,78 @@ nb_stations = 3 [temperature] abbreviation = t +binary = False categorical = False numerical = True [pressure] abbreviation = pres +binary = False categorical = False numerical = True [pressureVariation] abbreviation = tend +binary = False categorical = False numerical = True [barometricTrend] abbreviation = cod_tend +binary = False categorical = True numerical = False [humidity] abbreviation = u +binary = False categorical = False numerical = True [dewPoint] abbreviation = td +binary = False categorical = False numerical = True [lastHourRainfall] abbreviation = rr1 +binary = False categorical = False numerical = True [last3hHourRainfall] abbreviation = rr3 +binary = False categorical = False numerical = True [meanWindSpeed10min] abbreviation = ff +binary = False categorical = False numerical = True [meanWindDirection10min] abbreviation = dd +binary = False categorical = False numerical = True [gustsOverAPeriod] abbreviation = rafper +binary = False 
categorical = False numerical = True [horizontalVisibility] abbreviation = vv +binary = False categorical = False numerical = True [currentWeather] abbreviation = ww +binary = False categorical = True numerical = False \ No newline at end of file diff --git a/config/features/feature_ramadan.cfg b/config/features/feature_ramadan.cfg new file mode 100644 index 0000000..b9dc2c8 --- /dev/null +++ b/config/features/feature_ramadan.cfg @@ -0,0 +1,14 @@ +[ramadanEve] +binary = True +categorical = False +numerical = False + +[ramadan] +binary = True +categorical = False +numerical = False + +[ramadanDayAfter] +binary = True +categorical = False +numerical = False \ No newline at end of file diff --git a/config/learn.cfg b/config/learn.cfg index 82c67dd..73379cf 100644 --- a/config/learn.cfg +++ b/config/learn.cfg @@ -1,33 +1,36 @@ [DATETIME] -start = 01/01/2016 00:00:00 -end = 12/31/2018 23:00:00 +start = 01/01/2006 00:00:00 +end = 12/31/2019 23:00:00 hourStep = 3 [FEATURES] -meteofrance = True ephemeris = True holidays = True +meteofrance = True +ramadan = True [FEATURE_CONFIG] -meteofrance = (Path.cwd() / 'config') / 'features' / 'feature_meteo.cfg' ephemeris = (Path.cwd() / 'config') / 'features' / 'feature_ephemeris.cfg' holidays = (Path.cwd() / 'config') / 'features' / 'feature_holidays.cfg' +meteofrance = (Path.cwd() / 'config') / 'features' / 'feature_meteo.cfg' +ramadan = (Path.cwd() / 'config') / 'features' / 'feature_ramadan.cfg' [PREPROCESSING] -fill_method = spline +fill_method = linear order = 3 [HISTORY_KNOWLEDGE] -nb_lines = 5 +nb_lines = 24//3*7*4 [TARGET] -config = (Path.cwd() / 'config') / 'targets' / 'sdis25.cfg' - +config = (Path.cwd() / 'config') / 'targets' / 'sdis25.cfg' +cumulative = True +horizon = 1 [LEARNER] config = (Path.cwd() / 'config') / 'learners' / 'xgboost.cfg' \ No newline at end of file diff --git a/config/learners/lightgbm.cfg b/config/learners/lightgbm.cfg new file mode 100644 index 0000000..ef062b3 --- /dev/null +++ b/config/learners/lightgbm.cfg @@ -0,0 +1,10 @@ +[MODEL] +method = lightgbm + +[HYPERPARAMETERS] +learning_rate = 0.1 +metric = auc +num_iterations = 100 +num_round = 10 +num_leaves = 31 +objective = poisson diff --git a/config/learners/xgboost.cfg b/config/learners/xgboost.cfg index d099aa4..f9e5329 100644 --- a/config/learners/xgboost.cfg +++ b/config/learners/xgboost.cfg @@ -6,6 +6,6 @@ method = xgboost learning_rate = 0.01 max_depth = 7 random_state = 42 -n_estimators = 1000 +n_estimators = 10000 n_jobs = -1 objective = 'count:poisson' \ No newline at end of file diff --git a/main.py b/main.py index 27f502a..d451534 100644 --- a/main.py +++ b/main.py @@ -11,14 +11,11 @@ logger = getLogger() if __name__ == '__main__': config = (Path.cwd() / 'config') / 'learn.cfg' - engine = Engine(config_file = config) - - engine.add_features() - engine.add_target() - - engine.add_preprocessing() - - engine.learn() + with Engine(config_file = config) as e: + e.add_features() + e.add_target() + e.add_preprocessing() + e.learn() '''target = toarea(stream_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv') diff --git a/predictops/engine.py b/predictops/engine.py index e7bbf1c..a63ef29 100644 --- a/predictops/engine.py +++ b/predictops/engine.py @@ -5,97 +5,150 @@ from logging.config import fileConfig from pathlib import Path from shutil import rmtree +import os + +from .learn.learning import Learning +from .learn.preprocessing import Preprocessing from .source.ephemeris import Ephemeris from .source.holidays import Holidays +from 
.source.ramadan import Ramadan from .source.meteofrance import MeteoFrance -from .learn.learning import Learning -from .learn.preprocessing import Preprocessing from .target.target import Target fileConfig((Path.cwd() / 'config') / 'logging.cfg') logger = getLogger() -class Engine: +class Engine(object): - def __init__(self, config_file = (Path.cwd() / 'config') / 'learn.cfg'): + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + with open(str(self._file_name / os.path.basename(self._file_name)) + '.cfg', 'w') as f: + f.write(self._config_text) + + def __init__(self, config_file=(Path.cwd() / 'config') / 'learn.cfg'): self._config = ConfigParser() self._config.read(config_file) + launching_time = datetime.strftime(datetime.now(), '%Y_%m_%d_%H_%M') + self._name = os.path.splitext(os.path.basename(eval(self._config['TARGET']['config'])))[0] + self._file_name = f"{self._name}-{launching_time}" + p = Path.cwd() / 'results' / self._name + p.mkdir(exist_ok=True, parents=True) + self._file_name = p / self._file_name + + self._config_text = '' + with open(config_file) as f: + self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n" + self._config_text += f.read() + '\n\n' + self._start = datetime.strptime(self._config['DATETIME']['start'], '%m/%d/%Y %H:%M:%S') self._end = datetime.strptime(self._config['DATETIME']['end'], - '%m/%d/%Y %H:%M:%S') + '%m/%d/%Y %H:%M:%S') - self._timestep = timedelta(hours = - self._config['DATETIME'].getfloat('hourStep')) + self._timestep = timedelta(hours=self._config['DATETIME'].getfloat('hourStep')) self._X = {} - - def clean(self): # Cleaning the data directory logger.info("Cleaning and restoring data directory") - directory = Path.cwd() / 'data' + directory = Path.cwd() / 'data' if directory.is_dir(): rmtree(directory) p = Path(Path.cwd() / 'data') p.mkdir() - def add_features(self): - if self._config['FEATURES'].getboolean('meteofrance'): - meteofeature = MeteoFrance(config_file = - eval(self._config['FEATURE_CONFIG']['meteofrance'])) - - meteofeature.start = self._start - meteofeature.end = self._end - - meteofeature.update() - dated_features = meteofeature.dated_features - for date in dated_features: - self._X.setdefault(date,{}).update(dated_features[date]) - if self._config['FEATURES'].getboolean('ephemeris'): - ephemerides = Ephemeris(config_file = - eval(self._config['FEATURE_CONFIG']['ephemeris'])) + config_file = eval(self._config['FEATURE_CONFIG']['ephemeris']) + with open(config_file) as f: + self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n" + self._config_text += f.read() + '\n\n' + + ephemerides = Ephemeris(config_file=config_file) ephemerides.start = self._start ephemerides.end = self._end dated_features = ephemerides.dated_features for date in dated_features: - self._X.setdefault(date,{}).update(dated_features[date]) + self._X.setdefault(date, {}).update(dated_features[date]) if self._config['FEATURES'].getboolean('holidays'): - holidays = Holidays(config_file = - eval(self._config['FEATURE_CONFIG']['holidays'])) + config_file = eval(self._config['FEATURE_CONFIG']['holidays']) + with open(config_file) as f: + self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n" + self._config_text += f.read() + '\n\n' + + holidays = Holidays(config_file=config_file) holidays.start = self._start holidays.end = self._end dated_features = holidays.dated_features for date in dated_features: - self._X.setdefault(date,{}).update(dated_features[date]) + 
self._X.setdefault(date, {}).update(dated_features[date]) + + if self._config['FEATURES'].getboolean('meteofrance'): + config_file = eval(self._config['FEATURE_CONFIG']['meteofrance']) + with open(config_file) as f: + self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n" + self._config_text += f.read() + '\n\n' + meteofeature = MeteoFrance(config_file=config_file) + + meteofeature.start = self._start + meteofeature.end = self._end + + meteofeature.update() + dated_features = meteofeature.dated_features + for date in dated_features: + self._X.setdefault(date, {}).update(dated_features[date]) + + if self._config['FEATURES'].getboolean('ramadan'): + config_file = eval(self._config['FEATURE_CONFIG']['ramadan']) + with open(config_file) as f: + self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n" + self._config_text += f.read() + '\n\n' + + ramadan = Ramadan(config_file=config_file) + + ramadan.start = self._start + ramadan.end = self._end + + dated_features = ramadan.dated_features + for date in dated_features: + self._X.setdefault(date, {}).update(dated_features[date]) def add_target(self): - self._target = Target(config_file = eval(self._config['TARGET']['config']), - start = self._start, end = self._end, - timestep = self._timestep) + config_file = eval(self._config['TARGET']['config']) + cumulative = self._config['TARGET'].getboolean('cumulative') + with open(config_file) as f: + self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n" + self._config_text += f.read() + '\n\n' + self._target = Target(config_file=config_file, + start=self._start, end=self._end, + timestep=self._timestep, cumulative=cumulative) def add_preprocessing(self): - self._preproc = Preprocessing(config_file = self._config, - dict_features = self.X, - dict_target = self.y) - + self._preproc = Preprocessing(config_file=self._config, + dict_features=self.X, + dict_target=self.y) def learn(self): - history = self._config['HISTORY_KNOWLEDGE'].getint('nb_lines') - self._learner = Learning(config_file = eval(self._config['LEARNER']['config']), - X = self._preproc.dataframe, y = list(self.y.values())[history:]) + config_file = eval(self._config['LEARNER']['config']) + with open(config_file) as f: + self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n" + self._config_text += f.read() + '\n\n' + history = eval(self._config['HISTORY_KNOWLEDGE']['nb_lines']) + self._learner = Learning(config_file=config_file, file_name=self._file_name, + X=self._preproc.dataframe, y=list(self.y.values())[history:], + horizon=self._config['TARGET'].getint('horizon')) @property def X(self): @@ -105,7 +158,6 @@ class Engine: def X(self, x): self._X = x - @property def y(self): return self._target.y diff --git a/predictops/learn/learning.py b/predictops/learn/learning.py index 9a5860a..959271d 100644 --- a/predictops/learn/learning.py +++ b/predictops/learn/learning.py @@ -1,44 +1,120 @@ from configparser import ConfigParser +from logging import getLogger +from logging.config import fileConfig from math import sqrt +from pathlib import Path from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.model_selection import train_test_split +from statistics import mean, stdev +import lightgbm as lgb +import matplotlib +import os +import pylab as P import xgboost +fileConfig((Path.cwd() / 'config') / 'logging.cfg') +logger = getLogger() + + class Learning: - def __init__(self, config_file = None, - X = None, y = None): + def __init__(self, 
config_file=None, file_name=None, + X=None, y=None, horizon=0): self._config = ConfigParser() self._config.read(config_file) + self._file_name = file_name + logger.info("Dealing with the horizon of prediction") + self._X = X[:-horizon] + self._y = y[horizon:] + self._learn() + self._evaluate() - df = X - df['cible'] = y - - train_val_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42) - train_set, val_set = train_test_split(train_val_set, test_size = 0.2, random_state = 42) + def _learn(self): + logger.info("Generation of learning sets") + self._df = self._X + self._df['cible'] = self._y + train_val_set, test_set = train_test_split(self._df, test_size=0.2, random_state=42) + train_set, val_set = train_test_split(train_val_set, test_size=0.2, random_state=42) - X_test = test_set.drop('cible', axis = 1) - y_test = test_set['cible'].copy() + self._X_test = test_set.drop('cible', axis=1) + self._y_test = test_set['cible'].copy() X_train = train_set.drop('cible', axis=1) y_train = train_set['cible'].copy() X_val = val_set.drop('cible', axis=1) y_val = val_set['cible'].copy() - + logger.info("Start learning") if self._config['MODEL']['method'] == 'xgboost': + logger.info("Using xgboost regressor") + self._reg = xgboost.XGBRegressor(learning_rate=self._config['HYPERPARAMETERS'].getfloat('learning_rate'), + max_depth=self._config['HYPERPARAMETERS'].getint('max_depth'), + random_state=self._config['HYPERPARAMETERS'].getint('random_state'), + n_estimators=self._config['HYPERPARAMETERS'].getint('n_estimators'), + n_jobs=self._config['HYPERPARAMETERS'].getint('n_jobs'), + objective='count:poisson') + + self._reg.fit(X_train, y_train, + eval_set=[(X_val, y_val)], + early_stopping_rounds=10) + elif self._config['MODEL']['method'] == 'lightgbm': + train_data = lgb.Dataset(X_train, label=y_train) + val_data = lgb.Dataset(X_val, label=y_val) + num_round = self._config['HYPERPARAMETERS'].getint('num_round') + param = { + 'learning_rate': self._config['HYPERPARAMETERS'].getfloat('learning_rate'), + 'metric': self._config['HYPERPARAMETERS'].get('metric'), + 'num_iterations': self._config['HYPERPARAMETERS'].getint('num_iterations'), + 'num_leaves': self._config['HYPERPARAMETERS'].getint('num_leaves'), + 'objective': self._config['HYPERPARAMETERS'].get('objective') + } + self._reg = lgb.train(param, train_data, num_round, valid_sets=[val_data]) + + def _evaluate(self): + logger.info("Evaluation of the learner") + y_test_pred = self._reg.predict(self._X_test) + txt = f"Average interventions per time unit: {mean(self._df.cible)}\n" + txt += f"Standard deviation: {stdev(self._df.cible)}\n\n" - xgb_reg = xgboost.XGBRegressor(learning_rate = self._config['HYPERPARAMETERS'].getfloat('learning_rate'), - max_depth = self._config['HYPERPARAMETERS'].getint('max_depth'), - random_state = self._config['HYPERPARAMETERS'].getint('random_state'), - n_estimators = self._config['HYPERPARAMETERS'].getint('n_estimators'), - n_jobs = self._config['HYPERPARAMETERS'].getint('n_jobs'), - objective = 'count:poisson') + txt += f"Mean absolute error: {mean_absolute_error(y_test_pred, self._y_test)}\n" + txt += f"Root mean squared error: {sqrt(mean_squared_error(y_test_pred, self._y_test))}\n\n" - xgb_reg.fit(X_train, y_train, - eval_set=[(X_val, y_val)], - early_stopping_rounds=10) + for k in range(10): + txt += f"Percentage of errors lower than {k}: {[abs(int(u-v))<=k for u,v in zip(self._y_test.values, y_test_pred)].count(True)/len(self._y_test)*100}\n" - y_test_pred = xgb_reg.predict(X_test) - 
print(sqrt(mean_squared_error(y_test_pred, y_test)), mean_absolute_error(y_test_pred,y_test)) \ No newline at end of file + print(txt) + rep = (Path.cwd() / self._file_name) + rep.mkdir() + filename = str(self._file_name / os.path.basename(self._file_name)) + with open(filename + ".result", 'w') as f: + f.write(txt) + + y_true = self._df[self._df.year == self._df.year.max()].cible + x_true = self._df[self._df.year == self._df.year.max()].drop('cible', axis=1) + + yy_test_pred = self._reg.predict(x_true) + P.figure(figsize=(36, 16)) + P.plot(list(y_true)[:300], color='blue', label='actual') + P.plot(yy_test_pred[:300], color='red', label='predicted') + P.title('Predictions for 2018') + P.xlabel('Hour in the year') + P.ylabel('Number of cumulated interventions') + P.legend() + P.savefig(filename + ".png") + + yy_test_pred = self._reg.predict(self._X_test) + P.figure(figsize=(36, 16)) + P.plot(list(self._y_test)[:300], color='blue', label='actual') + P.plot(yy_test_pred[:300], color='red', label='predicted') + P.title('Predictions for test set') + P.xlabel('Hour in the year') + P.ylabel('Number of cumulated interventions') + P.legend() + P.savefig(filename + "-test.png") + + if self._config['MODEL']['method'] == 'xgboost': + xgboost.plot_importance(self._reg) + fig = matplotlib.pyplot.gcf() + fig.set_size_inches(15, 130) + fig.savefig(filename + '-feat_importance.pdf') diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py index 885aad3..9bc09ad 100644 --- a/predictops/learn/preprocessing.py +++ b/predictops/learn/preprocessing.py @@ -14,6 +14,7 @@ import pandas as pd fileConfig((Path.cwd() / 'config') / 'logging.cfg') logger = getLogger() + class Preprocessing: ''' Generate a pandas dataframe from a dictionary of features per datetime, which @@ -25,8 +26,8 @@ class Preprocessing: - NaN values are then filled with last known values. ''' - def __init__(self, config_file = None, - dict_features = None, dict_target = None): + def __init__(self, config_file=None, + dict_features=None, dict_target=None): ''' Constructor that defines all needed attributes and collects features. 
''' @@ -35,9 +36,8 @@ class Preprocessing: self._start = datetime.strptime(self._config['DATETIME']['start'], '%m/%d/%Y %H:%M:%S') self._end = datetime.strptime(self._config['DATETIME']['end'], - '%m/%d/%Y %H:%M:%S') - self._timestep = timedelta(hours = - self._config['DATETIME'].getfloat('hourStep')) + '%m/%d/%Y %H:%M:%S') + self._timestep = timedelta(hours=self._config['DATETIME'].getfloat('hourStep')) self._dict_features = dict_features self._dict_target = dict_target @@ -46,26 +46,28 @@ class Preprocessing: self._datetimes = [] self._features = set(chain.from_iterable([tuple(u.keys()) - for u in [*dict_features.values()]])) + for u in [*dict_features.values()]])) #feature_files = Path.cwd() / 'config' / 'features' - self._features = {feat : {'numerical': False, 'categorical': False} + self._features = {feat: {'numerical': False, 'categorical': False} for feat in self._features} for feature in self._config['FEATURES']: if self._config['FEATURES'][feature]: feature_file = self._config['FEATURE_CONFIG'][feature] config = ConfigParser() - config.read(feature_file) + config.read(eval(feature_file)) for section in config: if config.has_option(section, 'numerical'): - self._features[section]['numerical'] = config[section].getboolean('numerical') - self._features[section]['categorical'] = config[section].getboolean('categorical') + for feature in self._features: + if feature.split('_')[0] == section: + self._features[feature]['binary'] = config[section].getboolean('binary') + self._features[feature]['categorical'] = config[section].getboolean('categorical') + self._features[feature]['numerical'] = config[section].getboolean('numerical') - self._numerical_columns = [k for k in self._features if self._features[k]['numerical']] + self._binary_columns = [k for k in self._features if self._features[k]['binary']] self._categorical_columns = [k for k in self._features if self._features[k]['categorical']] - - + self._numerical_columns = [k for k in self._features if self._features[k]['numerical']] @property def start(self): @@ -75,7 +77,6 @@ class Preprocessing: def start(self, x): self._start = x - @property def end(self): return self._end @@ -84,7 +85,6 @@ class Preprocessing: def end(self, x): self._end = x - @property def timestep(self): return self._timestep @@ -93,7 +93,6 @@ class Preprocessing: def timestep(self, x): self._timestep = x - def _fill_dict(self): ''' Add datetime keys in the dated feature dictionary that are missing. 
The @@ -105,16 +104,16 @@ class Preprocessing: while current <= self._end: self._datetimes.append(current) if current not in self._dict_features: - self._dict_features[current] = {feature:np.NaN + self._dict_features[current] = {feature: np.NaN for feature in self._features} else: - null_dict = {feature:np.NaN + null_dict = {feature: np.NaN for feature in self._features} null_dict.update(self._dict_features[current]) self._dict_features[current] = null_dict current += self._timestep for k in self._dict_features: - null_dict = {feature:np.NaN + null_dict = {feature: np.NaN for feature in self._features} null_dict.update(self._dict_features[k]) self._dict_features[k] = null_dict @@ -122,8 +121,6 @@ class Preprocessing: self._full_dict = {k: self._dict_features[k] for k in sorted(self._dict_features.keys())} - - @property def full_dict(self): ''' @@ -133,7 +130,6 @@ class Preprocessing: self._fill_dict() return self._full_dict - def _fill_nan(self): ''' Fill NaN values, either by propagation or by interpolation (linear or splines) @@ -150,7 +146,7 @@ class Preprocessing: elif self._config['PREPROCESSING']['fill_method'] == 'spline': self._dataframe[self._numerical_columns] =\ self._dataframe[self._numerical_columns].interpolate(method='spline', - order=self._config['PREPROCESSING'].getint('order')) + order=self._config['PREPROCESSING'].getint('order')) # For the categorical columns, NaN values are filled by duplicating # the last known value (forward fill method) @@ -173,21 +169,18 @@ class Preprocessing: self._dataframe = self._dataframe.drop(['row_ok'], axis=1) logger.info("Rows dropped") - def _add_history(self): ''' Integrating previous nb of interventions as features ''' logger.info("Integrating previous nb of interventions as features") - nb_lines = self._config['HISTORY_KNOWLEDGE'].getint('nb_lines') - for k in range(1,nb_lines+1): - name = 'history_'+str(nb_lines-k+1) - self._dataframe[name] = [np.NaN]*k + list(self._dict_target.values())[:-k] + nb_lines = eval(self._config['HISTORY_KNOWLEDGE']['nb_lines']) + for k in range(1, nb_lines + 1): + name = 'history_' + str(nb_lines - k + 1) + self._dataframe[name] = [np.NaN] * k + list(self._dict_target.values())[:-k] self._numerical_columns.append(name) self._dataframe = self._dataframe[nb_lines:] - - def _standardize(self): ''' Normalizing numerical features @@ -197,26 +190,25 @@ class Preprocessing: self._dataframe[self._numerical_columns] =\ preprocessing.scale(self._dataframe[self._numerical_columns]) - - def _one_hot_encoding(self): ''' Apply a one hot encoding for category features ''' logger.info("One hot encoding for categorical feature") - # We store numerical columns df_out = pd.DataFrame() - for col in self._numerical_columns: + for col in self._numerical_columns: + df_out[col] = self._dataframe[col] + # Idem for binary features + for col in self._binary_columns: df_out[col] = self._dataframe[col] # The one hot encoding for col in self._categorical_columns: - pd1 = pd.get_dummies(self._dataframe[col],prefix=col) + pd1 = pd.get_dummies(self._dataframe[col], prefix=col) for col1 in pd1.columns: df_out[col1] = pd1[col1] self._dataframe = df_out - @property def dataframe(self): ''' @@ -236,7 +228,6 @@ class Preprocessing: self._one_hot_encoding() return self._dataframe - @dataframe.setter def dataframe(self, df): self._dataframe = df diff --git a/predictops/source/holidays.py b/predictops/source/holidays.py index 1a536fe..6893db0 100644 --- a/predictops/source/holidays.py +++ b/predictops/source/holidays.py @@ -3,6 +3,7 @@ from 
datetime import datetime, timedelta from jours_feries_france.compute import JoursFeries from logging import getLogger from logging.config import fileConfig +from pathlib import Path from vacances_scolaires_france import SchoolHolidayDates import itertools @@ -90,17 +91,28 @@ class Holidays: bankHolidaysEve = tuple(u-timedelta(days=1) for u in bankHolidays) name = self._config['ZONE']['name'] date = self._start + Date = datetime.date(date) + tomorrow = date + timedelta(days=1) + Tomorrow = datetime.date(tomorrow) d = SchoolHolidayDates() + dict_hour = { + 'bankHolidays' : Date in bankHolidays, + 'bankHolidaysEve': Date in bankHolidaysEve, + 'holidays': d.is_holiday_for_zone(Date, self._get_academic_zone(name, date)), + 'holidaysEve': d.is_holiday_for_zone(Tomorrow, self._get_academic_zone(name, tomorrow)) + } while date <= self._end: - Date = datetime.date(date) - tomorrow = date + timedelta(days=1) - Tomorrow = datetime.date(tomorrow) - dict_hour = { - 'bankHolidays' : Date in bankHolidays, - 'bankHolidaysEve': Date in bankHolidaysEve, - 'holidays': d.is_holiday_for_zone(Date, self._get_academic_zone(name, date)), - 'holidaysEve': d.is_holiday_for_zone(Tomorrow, self._get_academic_zone(name, tomorrow)) - } self._dated_features[date] = dict_hour + current = date date += timedelta(hours=1) + if date.day != current.day: + Date = datetime.date(date) + tomorrow = date + timedelta(days=1) + Tomorrow = datetime.date(tomorrow) + dict_hour = { + 'bankHolidays' : Date in bankHolidays, + 'bankHolidaysEve': Date in bankHolidaysEve, + 'holidays': d.is_holiday_for_zone(Date, self._get_academic_zone(name, date)), + 'holidaysEve': d.is_holiday_for_zone(Tomorrow, self._get_academic_zone(name, tomorrow)) + } return self._dated_features \ No newline at end of file diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py index ff6a238..0edd49f 100644 --- a/predictops/source/meteofrance.py +++ b/predictops/source/meteofrance.py @@ -247,7 +247,7 @@ class MeteoFrance: if (date >= self._start and date <= self._end)\ or (date.year == self._start.year and date.month == self._start.month)\ or (date.year == self._end.year and date.month == self._end.month): - logger.info(f'Inserting {csv_meteo} in intervention dictionary') + logger.info(f'Adding meteofrance features from {csv_meteo}') with open(dir_data / csv_meteo, "r") as f: reader = DictReader(f, delimiter=';') for row in reader: diff --git a/predictops/source/ramadan.py b/predictops/source/ramadan.py new file mode 100644 index 0000000..6836df1 --- /dev/null +++ b/predictops/source/ramadan.py @@ -0,0 +1,71 @@ +from configparser import ConfigParser +from convertdate import islamic +from datetime import datetime, timedelta +from logging import getLogger +from logging.config import fileConfig +from pathlib import Path + + +fileConfig((Path.cwd() / 'config') / 'logging.cfg') +logger = getLogger() + + +class Ramadan: + + _start = None + _end = None + + def __init__(self, config_file): + + self._config = ConfigParser() + self._config.read(config_file) + + # Collecting holidays features + self._features = [section for section in self._config + if self._config[section].getboolean('numerical') + or self._config[section].getboolean('categorical')] + + self._dated_features = {} + + @property + def start(self): + return self._start + + @start.setter + def start(self, x): + self._start = x + + @property + def end(self): + return self._end + + @end.setter + def end(self, x): + self._end = x + + @property + def dated_features(self): + if 
self._dated_features == {}: + logger.info("Adding Ramadan features") + date = self._start + while date <= self._end: + year, month, day = date.year, date.month, date.day + eve = datetime(year, month, day) - timedelta(days=1) + tomorrow = datetime(year, month, day) + timedelta(days=1) + Hegirian_month = islamic.from_gregorian(year, month, day)[1] + dict_hour = { + 'ramadanEve': False, + 'ramadan': False, + 'ramadanDayAfter': False + } + if Hegirian_month == 8 and\ + islamic.from_gregorian(tomorrow.year, tomorrow.month, tomorrow.day)[1] == 9: + dict_hour['ramadanEve'] = True + elif Hegirian_month == 9: + dict_hour['ramadan'] = True + elif Hegirian_month == 10 and\ + islamic.from_gregorian(eve.year, eve.month, eve.day)[1] == 9: + dict_hour['ramadanDayAfter'] = True + self._dated_features[date] = dict_hour + date += timedelta(hours=1) + return self._dated_features diff --git a/predictops/target/target.py b/predictops/target/target.py index b998120..9e3d86d 100644 --- a/predictops/target/target.py +++ b/predictops/target/target.py @@ -12,7 +12,7 @@ logger = getLogger() class Target: def __init__(self, config_file = None, - start = None, end = None, timestep = None): + start = None, end = None, timestep = None, cumulative = None): self._config = ConfigParser() self._config.read(config_file) @@ -20,6 +20,7 @@ class Target: self._start = start self._end = end self._timestep = timestep + self._cumulative = cumulative logger.info('Initialization of target variable') self._y = {} @@ -68,36 +69,39 @@ class Target: logger.info('Integrating interventions for the whole area') with open(self._stream_file) as f: reader = DictReader(f, delimiter=',') - for row in reader: - if row['start'] != '': - start_interv = datetime.strptime(row['start'], '%d/%m/%Y %H:%M:%S') - start_interv = start_interv.replace(minute=0) - end_interv = datetime.strptime(row['end'], '%d/%m/%Y %H:%M:%S') - end_interv = end_interv.replace(minute=0) - if not (start_interv > self._end or end_interv < self._start): - if start_interv < self._start and end_interv <= self._end: - current = self._start - while current <= end_interv: - self._y[current] += 1 - current += self._timestep - elif start_interv >= self._start and end_interv > self._end: - current = start_interv - while current not in self._y: - current -= timedelta(hours=1) - while current <= self._end: - self._y[current] += 1 - current += self._timestep - elif start_interv >= self._start and end_interv <= self._end: - current = start_interv - while current not in self._y: - current -= timedelta(hours=1) - while current <= end_interv: - self._y[current] += 1 - current += self._timestep - - - - - + if self._cumulative: + for row in reader: + if row['start'] != '': + start_interv = datetime.strptime(row['start'], '%d/%m/%Y %H:%M:%S') + start_interv = start_interv.replace(minute=0) + end_interv = datetime.strptime(row['end'], '%d/%m/%Y %H:%M:%S') + end_interv = end_interv.replace(minute=0) + if not (start_interv > self._end or end_interv < self._start): + if start_interv < self._start and end_interv <= self._end: + current = self._start + while current <= end_interv: + self._y[current] += 1 + current += self._timestep + elif start_interv >= self._start and end_interv > self._end: + current = start_interv + while current not in self._y: + current -= timedelta(hours=1) + while current <= self._end: + self._y[current] += 1 + current += self._timestep + elif start_interv >= self._start and end_interv <= self._end: + current = start_interv + while current not in self._y: + current -= 
timedelta(hours=1) + while current <= end_interv: + self._y[current] += 1 + current += self._timestep + else: + for row in reader: + if row['start'] != '': + start_interv = datetime.strptime(row['start'], '%d/%m/%Y %H:%M:%S') + start_interv = start_interv.replace(minute=0) + if start_interv in self._y: + self._y[start_interv] += 1 diff --git a/requirements.txt b/requirements.txt index 3e40e81..6615f1f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,14 +2,22 @@ attrs==19.3.0 Click==7.0 click-plugins==1.1.1 cligj==0.5.0 +convertdate==2.2.0 +cycler==0.10.0 Fiona==1.8.13 geographiclib==1.50 geopandas==0.6.3 geopy==1.21.0 joblib==0.14.1 +jours-feries-france==0.5.1 +kiwisolver==1.1.0 +lightgbm==2.3.1 +matplotlib==3.1.3 munch==2.5.0 numpy==1.18.1 pandas==1.0.1 +PyMeeus==0.3.6 +pyparsing==2.4.6 pyproj==2.4.2.post1 python-dateutil==2.8.1 pytz==2019.3 @@ -17,5 +25,6 @@ scikit-learn==0.22.1 scipy==1.4.1 Shapely==1.7.0 six==1.14.0 +vacances-scolaires-france==0.7.0 xgboost==0.90 xlrd==1.2.0 -- 2.39.5
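
Note (illustration only, not part of the patch): the new Ramadan flags in predictops/source/ramadan.py rely on convertdate's islamic module, pinned above as convertdate==2.2.0. islamic.from_gregorian(year, month, day) returns a (year, month, day) tuple in the Hijri calendar, and month 9 is Ramadan. The sketch below restates that check as a small standalone function; the function name, the dict layout and the sample date are illustrative assumptions, not code from the patch.

from datetime import date, timedelta
from convertdate import islamic

def ramadan_flags(d: date) -> dict:
    # Hijri month of the given Gregorian date (index 1 of the (y, m, d) tuple).
    hijri_month = islamic.from_gregorian(d.year, d.month, d.day)[1]
    nxt = d + timedelta(days=1)
    prv = d - timedelta(days=1)
    return {
        # Inside Ramadan: Hijri month 9.
        'ramadan': hijri_month == 9,
        # Eve of Ramadan: last day of Sha'ban, i.e. tomorrow falls in month 9.
        'ramadanEve': hijri_month == 8
                      and islamic.from_gregorian(nxt.year, nxt.month, nxt.day)[1] == 9,
        # Day after Ramadan: first day of Shawwal, i.e. yesterday fell in month 9.
        'ramadanDayAfter': hijri_month == 10
                           and islamic.from_gregorian(prv.year, prv.month, prv.day)[1] == 9,
    }

print(ramadan_flags(date(2019, 5, 10)))  # a date expected to fall inside Ramadan 1440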