predictops/learn/learning.py

   1 from configparser import ConfigParser
   2 from logging import getLogger
   3 from logging.config import fileConfig
   4 from math import sqrt
   5 from pathlib import Path
   6 from sklearn.metrics import mean_squared_error, mean_absolute_error
   7 from sklearn.model_selection import train_test_split
   8 from statistics import mean, stdev
   9
  10 import lightgbm as lgb
  11 import matplotlib
  12 import os
  13 import pylab as P
  14 import xgboost
  15
  16 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
  17 logger = getLogger()
  18
  19
  20 class Learning:
  21
  22     def __init__(self, config_file=None, file_name=None,
  23                  X=None, y=None, horizon=0):
  24         self._config = ConfigParser()
  25         self._config.read(config_file)
  26         self._file_name = file_name
  27         logger.info("Dealing with the horizon of prediction")
  28         self._X = X[:-horizon]
  29         self._y = y[horizon:]
  30         self._learn()
  31         self._evaluate()
  32
  33     def _learn(self):
  34         logger.info("Generation of learning sets")
  35         self._df = self._X
  36         self._df['cible'] = self._y
  37         train_val_set, test_set = train_test_split(self._df, test_size=0.2, random_state=42)
  38         train_set, val_set = train_test_split(train_val_set, test_size=0.2, random_state=42)
  39
  40         self._X_test = test_set.drop('cible', axis=1)
  41         self._y_test = test_set['cible'].copy()
  42
  43         X_train = train_set.drop('cible', axis=1)
  44         y_train = train_set['cible'].copy()
  45         X_val = val_set.drop('cible', axis=1)
  46         y_val = val_set['cible'].copy()
  47
  48         logger.info("Start learning")
  49         if self._config['MODEL']['method'] == 'xgboost':
  50             logger.info("Using xgboost regressor")
  51             self._reg = xgboost.XGBRegressor(learning_rate=self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
  52                                              max_depth=self._config['HYPERPARAMETERS'].getint('max_depth'),
  53                                              random_state=self._config['HYPERPARAMETERS'].getint('random_state'),
  54                                              n_estimators=self._config['HYPERPARAMETERS'].getint('n_estimators'),
  55                                              n_jobs=self._config['HYPERPARAMETERS'].getint('n_jobs'),
  56                                              objective='count:poisson')
  57
  58             self._reg.fit(X_train, y_train,
  59                           eval_set=[(X_val, y_val)],
  60                           early_stopping_rounds=10)
  61         elif self._config['MODEL']['method'] == 'lightgbm':
  62             train_data = lgb.Dataset(X_train, label=y_train)
  63             val_data = lgb.Dataset(X_val, label=y_val)
  64             num_round = self._config['HYPERPARAMETERS'].getint('num_round')
  65             param = {
  66                 'learning_rate': self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
  67                 'metric': self._config['HYPERPARAMETERS'].get('metric'),
  68                 'num_iterations': self._config['HYPERPARAMETERS'].getint('num_iterations'),
  69                 'num_leaves': self._config['HYPERPARAMETERS'].getint('num_leaves'),
  70                 'objective': self._config['HYPERPARAMETERS'].get('objective')
  71             }
  72             self._reg = lgb.train(param, train_data, num_round, valid_sets=[val_data])
  73
  74     def _evaluate(self):
  75         logger.info("Evaluation of the learner")
  76         y_test_pred = self._reg.predict(self._X_test)
  77         txt = f"Average interventions per time unit: {mean(self._df.cible)}\n"
  78         txt += f"Standard deviation: {stdev(self._df.cible)}\n\n"
  79
  80         txt += f"Mean absolute error: {mean_absolute_error(y_test_pred, self._y_test)}\n"
  81         txt += f"Root mean squared error: {sqrt(mean_squared_error(y_test_pred, self._y_test))}\n\n"
  82
  83         for k in range(10):
  84             txt += f"Percentage of errors lower than {k}: {[abs(int(u-v))<=k for u,v in zip(self._y_test.values, y_test_pred)].count(True)/len(self._y_test)*100}\n"
  85
  86         print(txt)
  87         rep = (Path.cwd() / self._file_name)
  88         rep.mkdir()
  89         filename = str(self._file_name / os.path.basename(self._file_name))
  90         with open(filename + ".result", 'w') as f:
  91             f.write(txt)
  92
  93         y_true = self._df[self._df.year == self._df.year.max()].cible
  94         x_true = self._df[self._df.year == self._df.year.max()].drop('cible', axis=1)
  95
  96         yy_test_pred = self._reg.predict(x_true)
  97         P.figure(figsize=(36, 16))
  98         P.plot(list(y_true)[:300], color='blue', label='actual')
  99         P.plot(yy_test_pred[:300], color='red', label='predicted')
 100         P.title('Predictions for 2018')
 101         P.xlabel('Hour in the year')
 102         P.ylabel('Number of cumulated interventions')
 103         P.legend()
 104         P.savefig(filename + ".png")
 105
 106         yy_test_pred = self._reg.predict(self._X_test)
 107         P.figure(figsize=(36, 16))
 108         P.plot(list(self._y_test)[:300], color='blue', label='actual')
 109         P.plot(yy_test_pred[:300], color='red', label='predicted')
 110         P.title('Predictions for test set')
 111         P.xlabel('Hour in the year')
 112         P.ylabel('Number of cumulated interventions')
 113         P.legend()
 114         P.savefig(filename + "-test.png")
 115
 116         if self._config['MODEL']['method'] == 'xgboost':
 117             xgboost.plot_importance(self._reg)
 118             fig = matplotlib.pyplot.gcf()
 119             fig.set_size_inches(15, 130)
 120             fig.savefig(filename + '-feat_importance.pdf')