1 from configparser import ConfigParser
2 from logging import getLogger
3 from logging.config import fileConfig
5 from pathlib import Path
6 from sklearn.metrics import mean_squared_error, mean_absolute_error
7 from sklearn.model_selection import train_test_split
8 from statistics import mean, stdev
10 import lightgbm as lgb
# Configure the logging module from 'config/logging.cfg'. The path is resolved
# against the current working directory (not against this module's location),
# and this runs as a side effect of importing the module.
16 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
# Constructor: loads the configuration file, remembers the output file name and
# stores the feature data truncated by the prediction horizon.
# NOTE(review): the remainder of the constructor (where `y` is presumably
# consumed and self._y / self._df are set) is not visible in this excerpt, and
# `logger` is not defined in the visible lines.
22 def __init__(self, config_file=None, file_name=None,
23 X=None, y=None, horizon=0):
24 self._config = ConfigParser()
# ConfigParser.read() skips files it cannot open without raising, so a wrong
# path only shows up later as a missing section/option.
# NOTE(review): the default config_file=None is not a usable value for read()
# (it expects a path or an iterable of paths) - confirm callers always pass one.
25 self._config.read(config_file)
26 self._file_name = file_name
27 logger.info("Dealing with the horizon of prediction")
# Drop the last `horizon` rows of X.
# NOTE(review): with the default horizon=0 this is X[:-0] == X[:0], i.e. an
# empty selection rather than "all rows", and the default X=None is not
# sliceable. Confirm callers always pass horizon >= 1 and a real X, or guard
# this line.
28 self._X = X[:-horizon]
# Build the train / validation / test partitions from self._df.
# NOTE(review): the enclosing `def` line and the assignments of self._df and
# self._y are outside the visible excerpt.
34 logger.info("Generation of learning sets")
# The target is attached as an extra column named 'cible' (French for
# "target") so that features and target are split together.
36 self._df['cible'] = self._y
# First split: 80 % (train + validation) / 20 % test. Second split: the 80 %
# part is divided 80 / 20 again, giving roughly 64 % train, 16 % validation and
# 20 % test overall. random_state=42 makes both splits reproducible.
# NOTE(review): train_test_split shuffles rows by default; if the rows are
# time-ordered (a `year` column is used elsewhere in this file), confirm that a
# random split is what is wanted.
37 train_val_set, test_set = train_test_split(self._df, test_size=0.2, random_state=42)
38 train_set, val_set = train_test_split(train_val_set, test_size=0.2, random_state=42)
# The test features/target are kept on the instance; they are read again by
# the evaluation and plotting code.
40 self._X_test = test_set.drop('cible', axis=1)
41 self._y_test = test_set['cible'].copy()
# Training and validation sets stay local; the fitting code uses them.
43 X_train = train_set.drop('cible', axis=1)
44 y_train = train_set['cible'].copy()
45 X_val = val_set.drop('cible', axis=1)
46 y_val = val_set['cible'].copy()
# Fit the regressor chosen by the 'method' option of the [MODEL] section.
# Hyper-parameters come from the [HYPERPARAMETERS] section; the section-proxy
# getters (get / getint / getfloat) return None for a missing option instead of
# raising, so an incomplete configuration is passed on as None values.
# NOTE(review): no fallback branch is visible for any other 'method' value, in
# which case self._reg would not be assigned by this code.
48 logger.info("Start learning")
49 if self._config['MODEL']['method'] == 'xgboost':
50 logger.info("Using xgboost regressor")
# NOTE(review): `xgboost` is not among the imports visible in this excerpt.
# 'count:poisson' is xgboost's Poisson-regression objective for count targets.
51 self._reg = xgboost.XGBRegressor(learning_rate=self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
52 max_depth=self._config['HYPERPARAMETERS'].getint('max_depth'),
53 random_state=self._config['HYPERPARAMETERS'].getint('random_state'),
54 n_estimators=self._config['HYPERPARAMETERS'].getint('n_estimators'),
55 n_jobs=self._config['HYPERPARAMETERS'].getint('n_jobs'),
56 objective='count:poisson')
# The validation set is supplied as eval_set together with an early-stopping
# patience of 10 rounds.
# NOTE(review): recent xgboost releases expect early_stopping_rounds on the
# estimator constructor rather than on fit() - verify against the pinned
# xgboost version.
58 self._reg.fit(X_train, y_train,
59 eval_set=[(X_val, y_val)],
60 early_stopping_rounds=10)
61 elif self._config['MODEL']['method'] == 'lightgbm':
# LightGBM's native API takes its own Dataset wrappers.
62 train_data = lgb.Dataset(X_train, label=y_train)
63 val_data = lgb.Dataset(X_val, label=y_val)
64 num_round = self._config['HYPERPARAMETERS'].getint('num_round')
# NOTE(review): the lines that open and close this parameter mapping
# (presumably `param = {` and `}`) are missing from this excerpt; the five
# entries below are its visible content.
66 'learning_rate': self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
67 'metric': self._config['HYPERPARAMETERS'].get('metric'),
68 'num_iterations': self._config['HYPERPARAMETERS'].getint('num_iterations'),
69 'num_leaves': self._config['HYPERPARAMETERS'].getint('num_leaves'),
70 'objective': self._config['HYPERPARAMETERS'].get('objective')
# num_round is passed positionally as the number of boosting rounds.
# NOTE(review): 'num_iterations' in the parameters looks like a second setting
# for the same quantity - confirm which of the two is meant to apply.
72 self._reg = lgb.train(param, train_data, num_round, valid_sets=[val_data])
# Evaluate the fitted model on the held-out test set and assemble a plain-text
# report in `txt`.
# NOTE(review): the enclosing `def` line is not visible, and `sqrt` / `os` are
# not among the imports visible in this excerpt.
75 logger.info("Evaluation of the learner")
76 y_test_pred = self._reg.predict(self._X_test)
# Descriptive statistics of the target over the whole of self._df (not only
# the test part), computed with the pure-Python `statistics` module.
77 txt = f"Average interventions per time unit: {mean(self._df.cible)}\n"
78 txt += f"Standard deviation: {stdev(self._df.cible)}\n\n"
# Test-set error metrics. scikit-learn documents the argument order as
# (y_true, y_pred); it is swapped here, which does not change MAE or MSE
# because both are symmetric in their two arguments.
80 txt += f"Mean absolute error: {mean_absolute_error(y_test_pred, self._y_test)}\n"
81 txt += f"Root mean squared error: {sqrt(mean_squared_error(y_test_pred, self._y_test))}\n\n"
# Share (in percent) of test samples whose error is at most k.
# The difference u - v is truncated toward zero by int() before abs(), so for
# example an error of 1.9 is counted as "<= 1".
# NOTE(review): `k` is not bound in the visible lines - the loop header that
# presumably iterates over thresholds is missing from this excerpt.
84 txt += f"Percentage of errors lower than {k}: {[abs(int(u-v))<=k for u,v in zip(self._y_test.values, y_test_pred)].count(True)/len(self._y_test)*100}\n"
# Output location derived from the configured file name.
# NOTE(review): `rep` is not used again in the visible lines.
87 rep = (Path.cwd() / self._file_name)
# Common prefix for every output file written below
# ('.result', '.png', '-test.png', '-feat_importance.pdf').
# NOTE(review): the `/` operator here requires self._file_name to be a
# pathlib.Path (it fails for a plain str), and the prefix is built from
# self._file_name itself rather than from `rep` - confirm this is intended.
89 filename = str(self._file_name / os.path.basename(self._file_name))
# Open the text report for writing.
# NOTE(review): the body of this `with` block (presumably writing `txt`) is
# not visible in this excerpt.
90 with open(filename + ".result", 'w') as f:
# Plot actual vs. predicted target for the most recent year found in self._df
# and save it as '<filename>.png'.
# NOTE(review): `P` is not among the imports visible in this excerpt
# (presumably a pylab/pyplot alias - confirm).
# Rows are selected from the whole of self._df, so they include samples the
# model was trained on; this figure is therefore not a pure out-of-sample view.
93 y_true = self._df[self._df.year == self._df.year.max()].cible
94 x_true = self._df[self._df.year == self._df.year.max()].drop('cible', axis=1)
96 yy_test_pred = self._reg.predict(x_true)
97 P.figure(figsize=(36, 16))
# Only the first 300 points of each series are drawn.
98 P.plot(list(y_true)[:300], color='blue', label='actual')
99 P.plot(yy_test_pred[:300], color='red', label='predicted')
# NOTE(review): the title hard-codes 2018 although the year is selected
# dynamically as the maximum of the `year` column.
100 P.title('Predictions for 2018')
101 P.xlabel('Hour in the year')
102 P.ylabel('Number of cumulated interventions')
# NOTE(review): both curves carry labels, but no legend call is visible in
# this excerpt.
104 P.savefig(filename + ".png")
# Same figure as above, but for the held-out test set; saved as
# '<filename>-test.png'.
106 yy_test_pred = self._reg.predict(self._X_test)
107 P.figure(figsize=(36, 16))
# Only the first 300 test samples are drawn.
108 P.plot(list(self._y_test)[:300], color='blue', label='actual')
109 P.plot(yy_test_pred[:300], color='red', label='predicted')
110 P.title('Predictions for test set')
# NOTE(review): the test set comes from a shuffled train_test_split, so the
# x axis is a sample index within the test set rather than an hour of the
# year - confirm whether the label should be changed.
111 P.xlabel('Hour in the year')
112 P.ylabel('Number of cumulated interventions')
114 P.savefig(filename + "-test.png")
# For the xgboost model, additionally export the feature-importance chart as
# '<filename>-feat_importance.pdf'.
# NOTE(review): indentation is lost in this excerpt; presumably all four
# statements below belong to the `if` body. `xgboost` and `matplotlib` are not
# among the imports visible here.
116 if self._config['MODEL']['method'] == 'xgboost':
117 xgboost.plot_importance(self._reg)
# Fetch the current figure (the one plot_importance has just drawn on) so it
# can be resized and saved.
118 fig = matplotlib.pyplot.gcf()
# 15 x 130 inches: a very tall page, presumably so that one bar per feature
# stays readable - confirm against the number of features.
119 fig.set_size_inches(15, 130)
120 fig.savefig(filename + '-feat_importance.pdf')