X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/predictops.git/blobdiff_plain/288baa6ff06c1b815ec24d164770acc93ac80499..refs/heads/master:/predictops/learn/learning.py?ds=inline

diff --git a/predictops/learn/learning.py b/predictops/learn/learning.py
index 4164500..f13c3a6 100644
--- a/predictops/learn/learning.py
+++ b/predictops/learn/learning.py
@@ -1,45 +1,126 @@
 from configparser import ConfigParser
+from logging import getLogger
+from logging.config import fileConfig
 from math import sqrt
+from pathlib import Path
 
 from sklearn.metrics import mean_squared_error, mean_absolute_error
 from sklearn.model_selection import train_test_split
+from statistics import mean, stdev
 
+import lightgbm as lgb
+import matplotlib
+import os
+import pandas as pd
+import pylab as P
 import xgboost
 
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+
 class Learning:
 
-    def __init__(self, config_file = None,
-                 X = None, y = None):
+    def __init__(self, config_file=None, file_name=None,
+                 X=None, y=None, horizon=0):
         self._config = ConfigParser()
         self._config.read(config_file)
+        self._file_name = file_name
+        logger.info("Dealing with the horizon of prediction")
+        if horizon:
+            self._X = X[:-horizon]
+            self._y = y[horizon:]
+        else:
+            self._X = X
+            self._y = y
+        self._learn()
+        self._evaluate()
 
-        df = X
-        df['cible'] = y
-
-        print(df.head())
-
-        train_val_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)
-        train_set, val_set = train_test_split(train_val_set, test_size = 0.2, random_state = 42)
+    def _learn(self):
+        logger.info("Generation of learning sets")
+        self._df = self._X
+        self._df['cible'] = self._y
 
+        train_val_set, test_set = train_test_split(self._df, test_size=0.2, random_state=42)
+        train_set, val_set = train_test_split(train_val_set, test_size=0.2, random_state=42)
 
-        X_test = test_set.drop('cible', axis = 1)
-        y_test = test_set['cible'].copy()
+        self._X_test = test_set.drop('cible', axis=1)
+        self._y_test = test_set['cible'].copy()
 
         X_train = train_set.drop('cible', axis=1)
         y_train = train_set['cible'].copy()
 
         X_val = val_set.drop('cible', axis=1)
         y_val = val_set['cible'].copy()
 
+        logger.info("Start learning")
+        if self._config['MODEL']['method'] == 'xgboost':
+            logger.info("Using xgboost regressor")
+            self._reg = xgboost.XGBRegressor(learning_rate=self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
+                                             max_depth=self._config['HYPERPARAMETERS'].getint('max_depth'),
+                                             random_state=self._config['HYPERPARAMETERS'].getint('random_state'),
+                                             n_estimators=self._config['HYPERPARAMETERS'].getint('n_estimators'),
+                                             n_jobs=self._config['HYPERPARAMETERS'].getint('n_jobs'),
+                                             objective='count:poisson')
+
+            self._reg.fit(X_train, y_train,
+                          eval_set=[(X_val, y_val)],
+                          early_stopping_rounds=10)
+        elif self._config['MODEL']['method'] == 'lightgbm':
+            train_data = lgb.Dataset(X_train, label=y_train)
+            val_data = lgb.Dataset(X_val, label=y_val)
+            num_round = self._config['HYPERPARAMETERS'].getint('num_round')
+            param = {
+                'learning_rate': self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
+                'metric': self._config['HYPERPARAMETERS'].get('metric'),
+                'num_iterations': self._config['HYPERPARAMETERS'].getint('num_iterations'),
+                'num_leaves': self._config['HYPERPARAMETERS'].getint('num_leaves'),
+                'objective': self._config['HYPERPARAMETERS'].get('objective')
+            }
+            self._reg = lgb.train(param, train_data, num_round, valid_sets=[val_data])
+
+    def _evaluate(self):
+        logger.info("Evaluation of the learner")
+        y_test_pred = self._reg.predict(self._X_test)
+
+        txt = f"Average interventions per time unit: {mean(self._df.cible)}\n"
+        txt += f"Standard deviation: {stdev(self._df.cible)}\n\n"
+
+        txt += f"Mean absolute error: {mean_absolute_error(y_test_pred, self._y_test)}\n"
+        txt += f"Root mean squared error: {sqrt(mean_squared_error(y_test_pred, self._y_test))}\n\n"
+
+        for k in range(10):
+            txt += f"Percentage of errors lower than {k}: {[abs(int(u-v))<=k for u,v in zip(self._y_test.values, y_test_pred)].count(True)/len(self._y_test)*100}\n"
+
+        rep = (Path.cwd() / self._file_name)
+        rep.mkdir()
+        self._filename = str(self._file_name / os.path.basename(self._file_name))
+
+        print(txt)
+        with open(self._filename + ".result", 'w') as f:
+            f.write(txt)
+
+        y_true = self._df[self._df.year == self._df.year.max()].cible
+        x_true = self._df[self._df.year == self._df.year.max()].drop('cible', axis=1)
+
+        yy_test_pred = self._reg.predict(x_true)
+        P.figure(figsize=(36, 16))
+        P.plot(list(y_true)[:300], color='blue', label='actual')
+        P.plot(yy_test_pred[:300], color='red', label='predicted')
+        P.title('Predictions for 2018')
+        P.xlabel('Hour in the year')
+        P.ylabel('Number of cumulated interventions')
+        P.legend()
+        P.savefig(self._filename + ".png")
+
+        yy_test_pred = self._reg.predict(self._X_test)
+        P.figure(figsize=(36, 16))
+        P.plot(list(self._y_test)[:300], color='blue', label='actual')
+        P.plot(yy_test_pred[:300], color='red', label='predicted')
+        P.title('Predictions for test set')
+        P.xlabel('Hour in the year')
+        P.ylabel('Number of cumulated interventions')
+        P.legend()
+        P.savefig(self._filename + "-test.png")
+
         if self._config['MODEL']['method'] == 'xgboost':
-            xgb_reg = xgboost.XGBRegressor(learning_rate = 0.01,
-                                           max_depth = 10,
-                                           random_state=42,
-                                           n_estimators = 173,
-                                           n_jobs=-1,
-                                           objective = 'count:poisson')
-
-            xgb_reg.fit(X_train, y_train,
-                        eval_set=[(X_val, y_val)],
-                        early_stopping_rounds=10)
-
-            y_test_pred = xgb_reg.predict(X_test)
-            print(sqrt(mean_squared_error(y_test_pred, y_test)), mean_absolute_error(y_test_pred,y_test))
\ No newline at end of file
+            xgboost.plot_importance(self._reg)
+            fig = matplotlib.pyplot.gcf()
+            fig.set_size_inches(15, 130)
+            fig.savefig(self._filename + '-feat_importance.pdf')
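For reference, a minimal sketch of how the revised class might be driven. It is not part of the commit: the config file name, the synthetic data, and the 'results' output directory are illustrative assumptions; only the [MODEL]/[HYPERPARAMETERS] sections and the Learning(...) signature come from the diff above. Note that the module calls fileConfig() at import time, so a config/logging.cfg file must exist under the working directory.

# Hypothetical driver for predictops.learn.learning.Learning -- a sketch, not part of the commit.
from configparser import ConfigParser
from pathlib import Path

import numpy as np
import pandas as pd

from predictops.learn.learning import Learning

# Write an illustrative config with the sections and keys the class reads.
# With method = xgboost, only the keys used by the XGBRegressor branch are needed;
# the lightgbm branch would instead expect num_round, metric, num_iterations,
# num_leaves and objective.
cfg = ConfigParser()
cfg['MODEL'] = {'method': 'xgboost'}
cfg['HYPERPARAMETERS'] = {
    'learning_rate': '0.01',
    'max_depth': '10',
    'random_state': '42',
    'n_estimators': '173',
    'n_jobs': '-1',
}
with open('learning.cfg', 'w') as f:
    cfg.write(f)

# Synthetic hourly features and a Poisson-like target; _evaluate() expects
# a 'year' column, which it uses to plot the most recent year.
rng = np.random.default_rng(42)
X = pd.DataFrame({'year': 2018,
                  'hour': range(2000),
                  'temperature': rng.normal(10.0, 5.0, 2000)})
y = pd.Series(rng.poisson(3, 2000).astype(float))

# file_name serves both as the output directory and as the basename of the
# .result/.png files written by _evaluate(); 'results' must not already exist,
# since the code calls mkdir() without exist_ok. horizon is left at its
# default of 0, i.e. no shift between features and targets.
Learning(config_file='learning.cfg', file_name=Path('results'), X=X, y=y)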