AND Private Git Repository - predictops.git/blob - predictops/learn/learning.py
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
fb3675d56d7e007c557d93aeedd64bcc514a8df0
[predictops.git] / predictops / learn / learning.py
1 from configparser import ConfigParser
2 from logging import getLogger
3 from logging.config import fileConfig
4 from math import sqrt
5 from pathlib import Path
6 from sklearn.metrics import mean_squared_error, mean_absolute_error
7 from sklearn.model_selection import train_test_split
8 from statistics import mean, stdev
9
10 import lightgbm as lgb
11 import matplotlib
12 import os
13 import pandas as pd
14 import pylab as P
15 import xgboost
16
# Load the logging configuration found at <current working dir>/config/logging.cfg
# at import time, then use the root logger for every message emitted by this module.
fileConfig(Path.cwd().joinpath('config', 'logging.cfg'))
logger = getLogger()
19
20
class Learning:
    """Train a gradient-boosting regressor and write an evaluation report.

    Instantiating the class runs the whole pipeline: the features are saved
    as CSV, a model is fitted (``xgboost`` or ``lightgbm`` depending on the
    ``[MODEL] method`` entry of the configuration file) and text/plot
    reports are written next to the CSV.

    All outputs go to ``<file_name>/<basename of file_name>`` followed by
    ``.csv``, ``.result``, ``.png``, ``-test.png`` and, for xgboost only,
    ``-feat_importance.pdf``.
    """

    def __init__(self, config_file=None, file_name=None,
                 X=None, y=None, horizon=0):
        """Prepare the data, then learn and evaluate.

        Args:
            config_file: path of an INI file read with ``ConfigParser``; the
                ``MODEL`` and ``HYPERPARAMETERS`` sections are used.
            file_name: output directory (``str`` or ``Path``), resolved
                against the current working directory; it is created here.
            X: feature DataFrame. It must contain a ``year`` column, which
                the evaluation step uses to select the rows to plot.
            y: target values, one per row of ``X``.
            horizon: number of rows by which the target is shifted forward
                relative to the features (0 means no shift).

        Raises:
            ValueError: if ``file_name``, ``X`` or ``y`` is missing, if
                ``horizon`` is negative, or if the configured method is
                unknown.
            FileExistsError: if the output directory already exists.
        """
        if file_name is None or X is None or y is None:
            raise ValueError("file_name, X and y are all required")
        if horizon and horizon < 0:
            raise ValueError("horizon must be non-negative")

        self._config = ConfigParser()
        self._config.read(config_file)
        self._file_name = file_name
        logger.info("Dealing with the horizon of prediction")
        self._X, self._y = self._apply_horizon(X, y, horizon)

        rep = (Path.cwd() / self._file_name)
        rep.mkdir()
        # Wrap in Path so that a plain string file_name works as well
        # (str / str would raise TypeError).
        out_dir = Path(self._file_name)
        self._filename = str(out_dir / out_dir.name)
        self._X.to_csv(self._filename + '.csv')
        self._learn()
        self._evaluate()

    @staticmethod
    def _apply_horizon(X, y, horizon):
        """Return (X, y) with y shifted ``horizon`` rows ahead of X."""
        if horizon:
            return X[:-horizon], y[horizon:]
        return X, y

    @staticmethod
    def _attach_target(X, y):
        """Return a copy of X with y added positionally as column 'cible'.

        The copy keeps the caller's DataFrame untouched. The target is
        converted to a bare array first: assigning a Series would align on
        the index, which undoes the horizon shift and introduces NaN.
        """
        frame = X.copy()
        frame['cible'] = pd.Series(y).values
        return frame

    def _learn(self):
        """Split the data into train/validation/test sets and fit the model."""
        logger.info("Generation of learning sets")
        self._df = self._attach_target(self._X, self._y)
        # 80/20 split for the test set, then 80/20 again for validation.
        train_val_set, test_set = train_test_split(self._df, test_size=0.2, random_state=42)
        train_set, val_set = train_test_split(train_val_set, test_size=0.2, random_state=42)

        self._X_test = test_set.drop('cible', axis=1)
        self._y_test = test_set['cible'].copy()

        X_train = train_set.drop('cible', axis=1)
        y_train = train_set['cible'].copy()
        X_val = val_set.drop('cible', axis=1)
        y_val = val_set['cible'].copy()

        logger.info("Start learning")
        method = self._config['MODEL']['method']
        hp = self._config['HYPERPARAMETERS']
        if method == 'xgboost':
            logger.info("Using xgboost regressor")
            self._reg = xgboost.XGBRegressor(learning_rate=hp.getfloat('learning_rate'),
                                             max_depth=hp.getint('max_depth'),
                                             random_state=hp.getint('random_state'),
                                             n_estimators=hp.getint('n_estimators'),
                                             n_jobs=hp.getint('n_jobs'),
                                             objective='count:poisson')

            # NOTE(review): recent xgboost releases expect early_stopping_rounds
            # in the constructor rather than in fit() - confirm against the
            # version pinned by the project before upgrading.
            self._reg.fit(X_train, y_train,
                          eval_set=[(X_val, y_val)],
                          early_stopping_rounds=10)
        elif method == 'lightgbm':
            logger.info("Using lightgbm regressor")
            train_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_val, label=y_val)
            num_round = hp.getint('num_round')
            param = {
                'learning_rate': hp.getfloat('learning_rate'),
                'metric': hp.get('metric'),
                'num_iterations': hp.getint('num_iterations'),
                'num_leaves': hp.getint('num_leaves'),
                'objective': hp.get('objective')
            }
            self._reg = lgb.train(param, train_data, num_round, valid_sets=[val_data])
        else:
            # Fail here rather than with an AttributeError on self._reg later.
            raise ValueError(f"Unknown learning method: {method!r}")

    @staticmethod
    def _save_comparison_plot(actual, predicted, title, path):
        """Plot the first 300 actual vs. predicted values and save to path."""
        fig = P.figure(figsize=(36, 16))
        P.plot(list(actual)[:300], color='blue', label='actual')
        P.plot(predicted[:300], color='red', label='predicted')
        P.title(title)
        P.xlabel('Hour in the year')
        P.ylabel('Number of cumulated interventions')
        P.legend()
        P.savefig(path)
        # Release the figure; otherwise figures pile up across instances.
        P.close(fig)

    def _evaluate(self):
        """Print and save error metrics, then save the diagnostic plots."""
        logger.info("Evaluation of the learner")
        y_test_pred = self._reg.predict(self._X_test)

        lines = [
            f"Average interventions per time unit: {mean(self._df.cible)}\n",
            f"Standard deviation: {stdev(self._df.cible)}\n\n",
            f"Mean absolute error: {mean_absolute_error(self._y_test, y_test_pred)}\n",
            f"Root mean squared error: {sqrt(mean_squared_error(self._y_test, y_test_pred))}\n\n",
        ]

        # Absolute errors truncated to integers, computed once for all thresholds.
        abs_errors = [abs(int(u - v)) for u, v in zip(self._y_test.values, y_test_pred)]
        n_test = len(self._y_test)
        for k in range(10):
            within = sum(err <= k for err in abs_errors)
            lines.append(f"Percentage of errors lower than {k}: {within / n_test * 100}\n")

        txt = "".join(lines)
        print(txt)
        with open(self._filename + ".result", 'w') as f:
            f.write(txt)

        # Rows of the most recent year present in the data.
        last_year = self._df[self._df.year == self._df.year.max()]
        y_true = last_year.cible
        x_true = last_year.drop('cible', axis=1)

        # NOTE(review): the title hard-codes 2018 although the plotted rows
        # are those of the maximum value of the 'year' column.
        self._save_comparison_plot(y_true, self._reg.predict(x_true),
                                   'Predictions for 2018',
                                   self._filename + ".png")

        # The test-set predictions computed above are reused here.
        self._save_comparison_plot(self._y_test, y_test_pred,
                                   'Predictions for test set',
                                   self._filename + "-test.png")

        if self._config['MODEL']['method'] == 'xgboost':
            xgboost.plot_importance(self._reg)
            fig = P.gcf()
            fig.set_size_inches(15, 130)
            fig.savefig(self._filename + '-feat_importance.pdf')
            P.close(fig)