lightgbm is now working

[predictops.git] / predictops / learn / learning.py
diff --git a/predictops/learn/learning.py b/predictops/learn/learning.py

index 416450047e8cddda860da6be9296e11c17e0004c..fb3675d56d7e007c557d93aeedd64bcc514a8df0 100644 (file)
--- a/predictops/learn/learning.py
+++ b/predictops/learn/learning.py
@@ -1,45 +1,126 @@
  from configparser import ConfigParser
  from configparser import ConfigParser
+from logging import getLogger
+from logging.config import fileConfig
  from math import sqrt
  from math import sqrt
+from pathlib import Path
  from sklearn.metrics import mean_squared_error, mean_absolute_error
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import mean_squared_error, mean_absolute_error
  from sklearn.model_selection import train_test_split
+from statistics import mean, stdev
  
  
+import lightgbm as lgb
+import matplotlib
+import os
+import pandas as pd
+import pylab as P
  import xgboost
  
  import xgboost
  
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+
  class Learning:
  
  class Learning:
  
-    def __init__(self, config_file = None,
-                 X = None, y = None):
+    def __init__(self, config_file=None, file_name=None,
+                 X=None, y=None, horizon=0):
          self._config = ConfigParser()
          self._config.read(config_file)
          self._config = ConfigParser()
          self._config.read(config_file)
+        self._file_name = file_name
+        logger.info("Dealing with the horizon of prediction")
+        if horizon:
+            self._X = X[:-horizon]
+            self._y = y[horizon:]
+        else:
+            self._X = X
+            self._y = y
+        rep = (Path.cwd() / self._file_name)
+        rep.mkdir()
+        self._filename = str(self._file_name / os.path.basename(self._file_name))
+        self._X.to_csv(self._filename + '.csv')
+        self._learn()
+        self._evaluate()
  
  
-        df = X
-        df['cible'] = y
-
-        print(df.head())
-
-        train_val_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)
-        train_set, val_set = train_test_split(train_val_set, test_size = 0.2, random_state = 42)
+    def _learn(self):
+        logger.info("Generation of learning sets")
+        self._df = self._X
+        self._df['cible'] = self._y
+        train_val_set, test_set = train_test_split(self._df, test_size=0.2, random_state=42)
+        train_set, val_set = train_test_split(train_val_set, test_size=0.2, random_state=42)
  
  
-        X_test = test_set.drop('cible', axis = 1)
-        y_test = test_set['cible'].copy()
+        self._X_test = test_set.drop('cible', axis=1)
+        self._y_test = test_set['cible'].copy()
  
          X_train = train_set.drop('cible', axis=1)
          y_train = train_set['cible'].copy()
          X_val = val_set.drop('cible', axis=1)
          y_val = val_set['cible'].copy()
  
  
          X_train = train_set.drop('cible', axis=1)
          y_train = train_set['cible'].copy()
          X_val = val_set.drop('cible', axis=1)
          y_val = val_set['cible'].copy()
  
+        logger.info("Start learning")
+        if self._config['MODEL']['method'] == 'xgboost':
+            logger.info("Using xgboost regressor")
+            self._reg = xgboost.XGBRegressor(learning_rate=self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
+                                             max_depth=self._config['HYPERPARAMETERS'].getint('max_depth'),
+                                             random_state=self._config['HYPERPARAMETERS'].getint('random_state'),
+                                             n_estimators=self._config['HYPERPARAMETERS'].getint('n_estimators'),
+                                             n_jobs=self._config['HYPERPARAMETERS'].getint('n_jobs'),
+                                             objective='count:poisson')
+
+            self._reg.fit(X_train, y_train,
+                          eval_set=[(X_val, y_val)],
+                          early_stopping_rounds=10)
+        elif self._config['MODEL']['method'] == 'lightgbm':
+            train_data = lgb.Dataset(X_train, label=y_train)
+            val_data = lgb.Dataset(X_val, label=y_val)
+            num_round = self._config['HYPERPARAMETERS'].getint('num_round')
+            param = {
+                'learning_rate': self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
+                'metric': self._config['HYPERPARAMETERS'].get('metric'),
+                'num_iterations': self._config['HYPERPARAMETERS'].getint('num_iterations'),
+                'num_leaves': self._config['HYPERPARAMETERS'].getint('num_leaves'),
+                'objective': self._config['HYPERPARAMETERS'].get('objective')
+            }
+            self._reg = lgb.train(param, train_data, num_round, valid_sets=[val_data])
+
+    def _evaluate(self):
+        logger.info("Evaluation of the learner")
+        y_test_pred = self._reg.predict(self._X_test)
+        txt = f"Average interventions per time unit: {mean(self._df.cible)}\n"
+        txt += f"Standard deviation: {stdev(self._df.cible)}\n\n"
+
+        txt += f"Mean absolute error: {mean_absolute_error(y_test_pred, self._y_test)}\n"
+        txt += f"Root mean squared error: {sqrt(mean_squared_error(y_test_pred, self._y_test))}\n\n"
+
+        for k in range(10):
+            txt += f"Percentage of errors lower than {k}: {[abs(int(u-v))<=k for u,v in zip(self._y_test.values, y_test_pred)].count(True)/len(self._y_test)*100}\n"
+
+        print(txt)
+        with open(self._filename + ".result", 'w') as f:
+            f.write(txt)
+
+        y_true = self._df[self._df.year == self._df.year.max()].cible
+        x_true = self._df[self._df.year == self._df.year.max()].drop('cible', axis=1)
+
+        yy_test_pred = self._reg.predict(x_true)
+        P.figure(figsize=(36, 16))
+        P.plot(list(y_true)[:300], color='blue', label='actual')
+        P.plot(yy_test_pred[:300], color='red', label='predicted')
+        P.title('Predictions for 2018')
+        P.xlabel('Hour in the year')
+        P.ylabel('Number of cumulated interventions')
+        P.legend()
+        P.savefig(self._filename + ".png")
+
+        yy_test_pred = self._reg.predict(self._X_test)
+        P.figure(figsize=(36, 16))
+        P.plot(list(self._y_test)[:300], color='blue', label='actual')
+        P.plot(yy_test_pred[:300], color='red', label='predicted')
+        P.title('Predictions for test set')
+        P.xlabel('Hour in the year')
+        P.ylabel('Number of cumulated interventions')
+        P.legend()
+        P.savefig(self._filename + "-test.png")
  
          if self._config['MODEL']['method'] == 'xgboost':
  
          if self._config['MODEL']['method'] == 'xgboost':
-            xgb_reg = xgboost.XGBRegressor(learning_rate = 0.01,
-                                                   max_depth = 10,
-                                                   random_state=42,
-                                                   n_estimators = 173,
-                                                   n_jobs=-1,
-                                                   objective = 'count:poisson')
-
-            xgb_reg.fit(X_train, y_train,
-                        eval_set=[(X_val, y_val)],
-                        early_stopping_rounds=10)
-
-            y_test_pred = xgb_reg.predict(X_test)
-            print(sqrt(mean_squared_error(y_test_pred, y_test)), mean_absolute_error(y_test_pred,y_test))
-\ No newline at end of file
+            xgboost.plot_importance(self._reg)
+            fig = matplotlib.pyplot.gcf()
+            fig.set_size_inches(15, 130)
+            fig.savefig(self._filename + '-feat_importance.pdf')