Adding Ramadan features and a binary category for features.

author Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>

Sun, 23 Feb 2020 08:39:21 +0000 (09:39 +0100)

committer Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>

Sun, 23 Feb 2020 08:39:21 +0000 (09:39 +0100)
author Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Sun, 23 Feb 2020 08:39:21 +0000 (09:39 +0100)
committer Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Sun, 23 Feb 2020 08:39:21 +0000 (09:39 +0100)
diff --git a/config/features/feature_ephemeris.cfg b/config/features/feature_ephemeris.cfg

index decc7871efc82357e6b9469d787f07c25fbd5474..3ed31c76c6b3e5fa2124886b3adc5e0233d2841b 100644 (file)
--- a/config/features/feature_ephemeris.cfg
+++ b/config/features/feature_ephemeris.cfg
@@ -1,27 +1,34 @@
  [hour]
+binary      = False
  categorical = True
  numerical   = False
  
  [dayInWeek]
+binary      = False
  categorical = True
  numerical   = False
  
  [dayInMonth]
+binary      = False
  categorical = True
  numerical   = False
  
  [dayInYear]
+binary      = False
  categorical = True
  numerical   = False
  
  [weekInYear]
+binary      = False
  categorical = True
  numerical   = False
  
  [month]
+binary      = False
  categorical = False
  numerical   = True
  
  [year]
+binary      = False
  categorical = False
  numerical   = True
 \ No newline at end of file
diff --git a/config/features/feature_holidays.cfg b/config/features/feature_holidays.cfg

index c3b3063c99e6ab7283423965f8626f007c88ff10..ccccbd729182ac9c3f0ebf6c68ebbf44a091dc45 100644 (file)
--- a/config/features/feature_holidays.cfg
+++ b/config/features/feature_holidays.cfg
@@ -2,17 +2,21 @@
  name = Besançon
  
  [bankHolidays]
-categorical = True
+binary      = True
+categorical = False
  numerical   = False
  
  [bankHolidaysEve]
-categorical = True
+binary      = True
+categorical = False
  numerical   = False
  
  [holidays]
-categorical = True
+binary      = True
+categorical = False
  numerical   = False
  
  [holidaysEve]
-categorical = True
+binary      = True
+categorical = False
  numerical   = False
diff --git a/config/features/feature_meteo.cfg b/config/features/feature_meteo.cfg

index 04f6c620eb65d7f621c5fd22f90588d6be4beee4..5b694e5f021ad85d7da6c012c885fef1af514a13 100644 (file)
--- a/config/features/feature_meteo.cfg
+++ b/config/features/feature_meteo.cfg
@@ -11,65 +11,78 @@ nb_stations = 3
  
  [temperature]
  abbreviation = t
+binary       = False
  categorical  = False
  numerical    = True
  
  [pressure]
  abbreviation = pres
+binary       = False
  categorical  = False
  numerical    = True
  
  [pressureVariation]
  abbreviation = tend
+binary       = False
  categorical  = False
  numerical    = True
  
  [barometricTrend]
  abbreviation = cod_tend
+binary       = False
  categorical  = True
  numerical    = False
  
  [humidity]
  abbreviation = u
+binary       = False
  categorical  = False
  numerical    = True
  
  [dewPoint]
  abbreviation = td
+binary       = False
  categorical  = False
  numerical    = True
  
  [lastHourRainfall]
  abbreviation = rr1
+binary       = False
  categorical  = False
  numerical    = True
  
  [last3hHourRainfall]
  abbreviation = rr3
+binary       = False
  categorical  = False
  numerical    = True
  
  [meanWindSpeed10min]
  abbreviation = ff
+binary       = False
  categorical  = False
  numerical    = True
  
  [meanWindDirection10min]
  abbreviation = dd
+binary       = False
  categorical  = False
  numerical    = True
  
  [gustsOverAPeriod]
  abbreviation = rafper
+binary       = False
  categorical  = False
  numerical    = True
  
  [horizontalVisibility]
  abbreviation = vv
+binary       = False
  categorical  = False
  numerical    = True
  
  [currentWeather]
  abbreviation = ww
+binary       = False
  categorical  = True
  numerical    = False
 \ No newline at end of file
diff --git a/config/features/feature_ramadan.cfg b/config/features/feature_ramadan.cfg

new file mode 100644 (file)

index 0000000..b9dc2c8
--- /dev/null
+++ b/config/features/feature_ramadan.cfg
@@ -0,0 +1,14 @@
+[ramadanEve]
+binary      = True
+categorical = False
+numerical   = False
+
+[ramadan]
+binary      = True
+categorical = False
+numerical   = False
+
+[ramadanDayAfter]
+binary      = True
+categorical = False
+numerical   = False
+\ No newline at end of file
diff --git a/config/learn.cfg b/config/learn.cfg

index 82c67ddf5cc81797dc8edcb5ffe1178966565387..73379cf605405d0a8bccf95089daf343064ad5ee 100644 (file)
--- a/config/learn.cfg
+++ b/config/learn.cfg
@@ -1,33 +1,36 @@
  [DATETIME]
-start    = 01/01/2016 00:00:00
-end      = 12/31/2018 23:00:00
+start    = 01/01/2006 00:00:00
+end      = 12/31/2019 23:00:00
  hourStep = 3
  
  
  [FEATURES]
-meteofrance = True
  ephemeris   = True
  holidays    = True
+meteofrance = True
+ramadan     = True
  
  
  [FEATURE_CONFIG]
-meteofrance = (Path.cwd() / 'config') / 'features' / 'feature_meteo.cfg'
  ephemeris   = (Path.cwd() / 'config') / 'features' / 'feature_ephemeris.cfg'
  holidays    = (Path.cwd() / 'config') / 'features' / 'feature_holidays.cfg'
+meteofrance = (Path.cwd() / 'config') / 'features' / 'feature_meteo.cfg'
+ramadan     = (Path.cwd() / 'config') / 'features' / 'feature_ramadan.cfg'
  
  
  [PREPROCESSING]
-fill_method = spline
+fill_method = linear
  order       = 3
  
  
  [HISTORY_KNOWLEDGE]
-nb_lines = 5
+nb_lines = 24//3*7*4
  
  
  [TARGET]
-config = (Path.cwd() / 'config') / 'targets' / 'sdis25.cfg'
-
+config      = (Path.cwd() / 'config') / 'targets' / 'sdis25.cfg'
+cumulative  = True
+horizon     = 1
  
  [LEARNER]
  config =  (Path.cwd() / 'config') / 'learners' / 'xgboost.cfg'
 \ No newline at end of file
diff --git a/config/learners/lightgbm.cfg b/config/learners/lightgbm.cfg

new file mode 100644 (file)

index 0000000..ef062b3
--- /dev/null
+++ b/config/learners/lightgbm.cfg
@@ -0,0 +1,10 @@
+[MODEL]
+method = lightgbm
+
+[HYPERPARAMETERS]
+learning_rate  = 0.1
+metric         = auc
+num_iterations = 100
+num_round      = 10
+num_leaves     = 31
+objective      = poisson
diff --git a/config/learners/xgboost.cfg b/config/learners/xgboost.cfg

index d099aa417039d0fa98fb53e9591e582fa6bae86a..f9e53298ef6586768c7c5d7d3a55e697f604670b 100644 (file)
--- a/config/learners/xgboost.cfg
+++ b/config/learners/xgboost.cfg
@@ -6,6 +6,6 @@ method = xgboost
  learning_rate = 0.01
  max_depth     = 7
  random_state  = 42
-n_estimators  = 1000
+n_estimators  = 10000
  n_jobs        = -1
  objective     = 'count:poisson'
 \ No newline at end of file
diff --git a/main.py b/main.py

index 27f502aeaa676fd0a0500e2ca4f20a295f5d0ce9..d451534d130211ace744abc93379b9be5c7de0ef 100644 (file)
--- a/main.py
+++ b/main.py
@@ -11,14 +11,11 @@ logger = getLogger()
  if __name__ == '__main__':
  
      config = (Path.cwd() / 'config') / 'learn.cfg'
-    engine = Engine(config_file = config)
-
-    engine.add_features()
-    engine.add_target()
-
-    engine.add_preprocessing()
-
-    engine.learn()
+    with Engine(config_file = config) as e:
+        e.add_features()
+        e.add_target()
+        e.add_preprocessing()
+        e.learn()
  
      '''target = toarea(stream_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv')
  
diff --git a/predictops/engine.py b/predictops/engine.py

index e7bbf1c5aa58221da7a8aaa71788cf0339258cbc..a63ef2932aee9086fb23ede8902ec4ea0820adb8 100644 (file)
--- a/predictops/engine.py
+++ b/predictops/engine.py
@@ -5,97 +5,150 @@ from logging.config import fileConfig
  from pathlib import Path
  from shutil import rmtree
  
+import os
+
+from .learn.learning import Learning
+from .learn.preprocessing import Preprocessing
  from .source.ephemeris import Ephemeris
  from .source.holidays import Holidays
+from .source.ramadan import Ramadan
  from .source.meteofrance import MeteoFrance
-from .learn.learning import Learning
-from .learn.preprocessing import Preprocessing
  from .target.target import Target
  
  fileConfig((Path.cwd() / 'config') / 'logging.cfg')
  logger = getLogger()
  
  
-class Engine:
+class Engine(object):
  
-    def __init__(self, config_file = (Path.cwd() / 'config') / 'learn.cfg'):
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        with open(str(self._file_name / os.path.basename(self._file_name)) + '.cfg', 'w') as f:
+            f.write(self._config_text)
+
+    def __init__(self, config_file=(Path.cwd() / 'config') / 'learn.cfg'):
          self._config = ConfigParser()
          self._config.read(config_file)
+        launching_time = datetime.strftime(datetime.now(), '%Y_%m_%d_%H_%M')
+        self._name = os.path.splitext(os.path.basename(eval(self._config['TARGET']['config'])))[0]
+        self._file_name = f"{self._name}-{launching_time}"
+        p = Path.cwd() / 'results' / self._name
+        p.mkdir(exist_ok=True, parents=True)
+        self._file_name = p / self._file_name
+
+        self._config_text = ''
+        with open(config_file) as f:
+            self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n"
+            self._config_text += f.read() + '\n\n'
+
          self._start = datetime.strptime(self._config['DATETIME']['start'],
                                          '%m/%d/%Y %H:%M:%S')
          self._end = datetime.strptime(self._config['DATETIME']['end'],
-                                        '%m/%d/%Y %H:%M:%S')
+                                      '%m/%d/%Y %H:%M:%S')
  
-        self._timestep = timedelta(hours =
-                                   self._config['DATETIME'].getfloat('hourStep'))
+        self._timestep = timedelta(hours=self._config['DATETIME'].getfloat('hourStep'))
  
          self._X = {}
  
-
-
      def clean(self):
          # Cleaning the data directory
          logger.info("Cleaning and restoring data directory")
-        directory  = Path.cwd() / 'data'
+        directory = Path.cwd() / 'data'
          if directory.is_dir():
              rmtree(directory)
          p = Path(Path.cwd() / 'data')
          p.mkdir()
  
-
      def add_features(self):
-        if self._config['FEATURES'].getboolean('meteofrance'):
-            meteofeature = MeteoFrance(config_file =
-                                       eval(self._config['FEATURE_CONFIG']['meteofrance']))
-
-            meteofeature.start = self._start
-            meteofeature.end = self._end
-
-            meteofeature.update()
-            dated_features = meteofeature.dated_features
-            for date in dated_features:
-                self._X.setdefault(date,{}).update(dated_features[date])
-
          if self._config['FEATURES'].getboolean('ephemeris'):
-            ephemerides = Ephemeris(config_file =
-                                    eval(self._config['FEATURE_CONFIG']['ephemeris']))
+            config_file = eval(self._config['FEATURE_CONFIG']['ephemeris'])
+            with open(config_file) as f:
+                self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n"
+                self._config_text += f.read() + '\n\n'
+
+            ephemerides = Ephemeris(config_file=config_file)
  
              ephemerides.start = self._start
              ephemerides.end = self._end
  
              dated_features = ephemerides.dated_features
              for date in dated_features:
-                self._X.setdefault(date,{}).update(dated_features[date])
+                self._X.setdefault(date, {}).update(dated_features[date])
  
          if self._config['FEATURES'].getboolean('holidays'):
-            holidays = Holidays(config_file =
-                                eval(self._config['FEATURE_CONFIG']['holidays']))
+            config_file = eval(self._config['FEATURE_CONFIG']['holidays'])
+            with open(config_file) as f:
+                self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n"
+                self._config_text += f.read() + '\n\n'
+
+            holidays = Holidays(config_file=config_file)
  
              holidays.start = self._start
              holidays.end = self._end
  
              dated_features = holidays.dated_features
              for date in dated_features:
-                self._X.setdefault(date,{}).update(dated_features[date])
+                self._X.setdefault(date, {}).update(dated_features[date])
+
+        if self._config['FEATURES'].getboolean('meteofrance'):
+            config_file = eval(self._config['FEATURE_CONFIG']['meteofrance'])
+            with open(config_file) as f:
+                self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n"
+                self._config_text += f.read() + '\n\n'
  
+            meteofeature = MeteoFrance(config_file=config_file)
+
+            meteofeature.start = self._start
+            meteofeature.end = self._end
+
+            meteofeature.update()
+            dated_features = meteofeature.dated_features
+            for date in dated_features:
+                self._X.setdefault(date, {}).update(dated_features[date])
+
+        if self._config['FEATURES'].getboolean('ramadan'):
+            config_file = eval(self._config['FEATURE_CONFIG']['ramadan'])
+            with open(config_file) as f:
+                self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n"
+                self._config_text += f.read() + '\n\n'
+
+            ramadan = Ramadan(config_file=config_file)
+
+            ramadan.start = self._start
+            ramadan.end = self._end
+
+            dated_features = ramadan.dated_features
+            for date in dated_features:
+                self._X.setdefault(date, {}).update(dated_features[date])
  
      def add_target(self):
-        self._target = Target(config_file = eval(self._config['TARGET']['config']),
-                              start = self._start, end = self._end,
-                              timestep = self._timestep)
+        config_file = eval(self._config['TARGET']['config'])
+        cumulative = self._config['TARGET'].getboolean('cumulative')
+        with open(config_file) as f:
+            self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n"
+            self._config_text += f.read() + '\n\n'
  
+        self._target = Target(config_file=config_file,
+                              start=self._start, end=self._end,
+                              timestep=self._timestep, cumulative=cumulative)
  
      def add_preprocessing(self):
-        self._preproc = Preprocessing(config_file = self._config,
-                                      dict_features = self.X,
-                                      dict_target = self.y)
-
+        self._preproc = Preprocessing(config_file=self._config,
+                                      dict_features=self.X,
+                                      dict_target=self.y)
  
      def learn(self):
-        history = self._config['HISTORY_KNOWLEDGE'].getint('nb_lines')
-        self._learner = Learning(config_file = eval(self._config['LEARNER']['config']),
-                                 X = self._preproc.dataframe, y = list(self.y.values())[history:])
+        config_file = eval(self._config['LEARNER']['config'])
+        with open(config_file) as f:
+            self._config_text += f"{'='*10} {os.path.basename(config_file)} {'='*10}\n\n"
+            self._config_text += f.read() + '\n\n'
  
+        history = eval(self._config['HISTORY_KNOWLEDGE']['nb_lines'])
+        self._learner = Learning(config_file=config_file, file_name=self._file_name,
+                                 X=self._preproc.dataframe, y=list(self.y.values())[history:],
+                                 horizon=self._config['TARGET'].getint('horizon'))
  
      @property
      def X(self):
@@ -105,7 +158,6 @@ class Engine:
      def X(self, x):
          self._X = x
  
-
      @property
      def y(self):
          return self._target.y
diff --git a/predictops/learn/learning.py b/predictops/learn/learning.py

index 9a5860afaed8657890140c6dbffa79073bbe6787..959271ded77ad22aea436d98632ab7e97cf272d4 100644 (file)
--- a/predictops/learn/learning.py
+++ b/predictops/learn/learning.py
@@ -1,44 +1,120 @@
  from configparser import ConfigParser
+from logging import getLogger
+from logging.config import fileConfig
  from math import sqrt
+from pathlib import Path
  from sklearn.metrics import mean_squared_error, mean_absolute_error
  from sklearn.model_selection import train_test_split
+from statistics import mean, stdev
  
+import lightgbm as lgb
+import matplotlib
+import os
+import pylab as P
  import xgboost
  
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+
  class Learning:
  
-    def __init__(self, config_file = None,
-                 X = None, y = None):
+    def __init__(self, config_file=None, file_name=None,
+                 X=None, y=None, horizon=0):
          self._config = ConfigParser()
          self._config.read(config_file)
+        self._file_name = file_name
+        logger.info("Dealing with the horizon of prediction")
+        self._X = X[:-horizon]
+        self._y = y[horizon:]
+        self._learn()
+        self._evaluate()
  
-        df = X
-        df['cible'] = y
-
-        train_val_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)
-        train_set, val_set = train_test_split(train_val_set, test_size = 0.2, random_state = 42)
+    def _learn(self):
+        logger.info("Generation of learning sets")
+        self._df = self._X
+        self._df['cible'] = self._y
+        train_val_set, test_set = train_test_split(self._df, test_size=0.2, random_state=42)
+        train_set, val_set = train_test_split(train_val_set, test_size=0.2, random_state=42)
  
-        X_test = test_set.drop('cible', axis = 1)
-        y_test = test_set['cible'].copy()
+        self._X_test = test_set.drop('cible', axis=1)
+        self._y_test = test_set['cible'].copy()
  
          X_train = train_set.drop('cible', axis=1)
          y_train = train_set['cible'].copy()
          X_val = val_set.drop('cible', axis=1)
          y_val = val_set['cible'].copy()
  
-
+        logger.info("Start learning")
          if self._config['MODEL']['method'] == 'xgboost':
+            logger.info("Using xgboost regressor")
+            self._reg = xgboost.XGBRegressor(learning_rate=self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
+                                             max_depth=self._config['HYPERPARAMETERS'].getint('max_depth'),
+                                             random_state=self._config['HYPERPARAMETERS'].getint('random_state'),
+                                             n_estimators=self._config['HYPERPARAMETERS'].getint('n_estimators'),
+                                             n_jobs=self._config['HYPERPARAMETERS'].getint('n_jobs'),
+                                             objective='count:poisson')
+
+            self._reg.fit(X_train, y_train,
+                          eval_set=[(X_val, y_val)],
+                          early_stopping_rounds=10)
+        elif self._config['MODEL']['method'] == 'lightgbm':
+            train_data = lgb.Dataset(X_train, label=y_train)
+            val_data = lgb.Dataset(X_val, label=y_val)
+            num_round = self._config['HYPERPARAMETERS'].getint('num_round')
+            param = {
+                'learning_rate': self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
+                'metric': self._config['HYPERPARAMETERS'].get('metric'),
+                'num_iterations': self._config['HYPERPARAMETERS'].getint('num_iterations'),
+                'num_leaves': self._config['HYPERPARAMETERS'].getint('num_leaves'),
+                'objective': self._config['HYPERPARAMETERS'].get('objective')
+            }
+            self._reg = lgb.train(param, train_data, num_round, valid_sets=[val_data])
+
+    def _evaluate(self):
+        logger.info("Evaluation of the learner")
+        y_test_pred = self._reg.predict(self._X_test)
+        txt = f"Average interventions per time unit: {mean(self._df.cible)}\n"
+        txt += f"Standard deviation: {stdev(self._df.cible)}\n\n"
  
-            xgb_reg = xgboost.XGBRegressor(learning_rate = self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
-                                           max_depth     = self._config['HYPERPARAMETERS'].getint('max_depth'),
-                                           random_state  = self._config['HYPERPARAMETERS'].getint('random_state'),
-                                           n_estimators  = self._config['HYPERPARAMETERS'].getint('n_estimators'),
-                                           n_jobs        = self._config['HYPERPARAMETERS'].getint('n_jobs'),
-                                           objective     = 'count:poisson')
+        txt += f"Mean absolute error: {mean_absolute_error(y_test_pred, self._y_test)}\n"
+        txt += f"Root mean squared error: {sqrt(mean_squared_error(y_test_pred, self._y_test))}\n\n"
  
-            xgb_reg.fit(X_train, y_train,
-                        eval_set=[(X_val, y_val)],
-                        early_stopping_rounds=10)
+        for k in range(10):
+            txt += f"Percentage of errors lower than {k}: {[abs(int(u-v))<=k for u,v in zip(self._y_test.values, y_test_pred)].count(True)/len(self._y_test)*100}\n"
  
-            y_test_pred = xgb_reg.predict(X_test)
-            print(sqrt(mean_squared_error(y_test_pred, y_test)), mean_absolute_error(y_test_pred,y_test))
-\ No newline at end of file
+        print(txt)
+        rep = (Path.cwd() / self._file_name)
+        rep.mkdir()
+        filename = str(self._file_name / os.path.basename(self._file_name))
+        with open(filename + ".result", 'w') as f:
+            f.write(txt)
+
+        y_true = self._df[self._df.year == self._df.year.max()].cible
+        x_true = self._df[self._df.year == self._df.year.max()].drop('cible', axis=1)
+
+        yy_test_pred = self._reg.predict(x_true)
+        P.figure(figsize=(36, 16))
+        P.plot(list(y_true)[:300], color='blue', label='actual')
+        P.plot(yy_test_pred[:300], color='red', label='predicted')
+        P.title('Predictions for 2018')
+        P.xlabel('Hour in the year')
+        P.ylabel('Number of cumulated interventions')
+        P.legend()
+        P.savefig(filename + ".png")
+
+        yy_test_pred = self._reg.predict(self._X_test)
+        P.figure(figsize=(36, 16))
+        P.plot(list(self._y_test)[:300], color='blue', label='actual')
+        P.plot(yy_test_pred[:300], color='red', label='predicted')
+        P.title('Predictions for test set')
+        P.xlabel('Hour in the year')
+        P.ylabel('Number of cumulated interventions')
+        P.legend()
+        P.savefig(filename + "-test.png")
+
+        if self._config['MODEL']['method'] == 'xgboost':
+            xgboost.plot_importance(self._reg)
+            fig = matplotlib.pyplot.gcf()
+            fig.set_size_inches(15, 130)
+            fig.savefig(filename + '-feat_importance.pdf')
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py

index 885aad3393979b897e3e0d8c40f3378dbba08e5a..9bc09ad2eca2759c22b6047c3ded8ab747e015de 100644 (file)
--- a/predictops/learn/preprocessing.py
+++ b/predictops/learn/preprocessing.py
@@ -14,6 +14,7 @@ import pandas as pd
  fileConfig((Path.cwd() / 'config') / 'logging.cfg')
  logger = getLogger()
  
+
  class Preprocessing:
      '''
      Generate a pandas dataframe from a dictionary of features per datetime, which
@@ -25,8 +26,8 @@ class Preprocessing:
       - NaN values are then filled with last known values.
      '''
  
-    def __init__(self, config_file = None,
-                 dict_features = None, dict_target = None):
+    def __init__(self, config_file=None,
+                 dict_features=None, dict_target=None):
          '''
          Constructor that defines all needed attributes and collects features.
          '''
@@ -35,9 +36,8 @@ class Preprocessing:
          self._start = datetime.strptime(self._config['DATETIME']['start'],
                                          '%m/%d/%Y %H:%M:%S')
          self._end = datetime.strptime(self._config['DATETIME']['end'],
-                                        '%m/%d/%Y %H:%M:%S')
-        self._timestep = timedelta(hours =
-                                   self._config['DATETIME'].getfloat('hourStep'))
+                                      '%m/%d/%Y %H:%M:%S')
+        self._timestep = timedelta(hours=self._config['DATETIME'].getfloat('hourStep'))
          self._dict_features = dict_features
          self._dict_target = dict_target
  
@@ -46,26 +46,28 @@ class Preprocessing:
          self._datetimes = []
  
          self._features = set(chain.from_iterable([tuple(u.keys())
-                                                      for u in [*dict_features.values()]]))
+                                                  for u in [*dict_features.values()]]))
  
          #feature_files = Path.cwd() / 'config' / 'features'
-        self._features = {feat : {'numerical': False, 'categorical': False}
+        self._features = {feat: {'numerical': False, 'categorical': False}
                            for feat in self._features}
  
          for feature in self._config['FEATURES']:
              if self._config['FEATURES'][feature]:
                  feature_file = self._config['FEATURE_CONFIG'][feature]
                  config = ConfigParser()
-                config.read(feature_file)
+                config.read(eval(feature_file))
                  for section in config:
                      if config.has_option(section, 'numerical'):
-                        self._features[section]['numerical'] = config[section].getboolean('numerical')
-                        self._features[section]['categorical'] = config[section].getboolean('categorical')
+                        for feature in self._features:
+                            if feature.split('_')[0] == section:
+                                self._features[feature]['binary'] = config[section].getboolean('binary')
+                                self._features[feature]['categorical'] = config[section].getboolean('categorical')
+                                self._features[feature]['numerical'] = config[section].getboolean('numerical')
  
-        self._numerical_columns = [k for k in self._features if self._features[k]['numerical']]
+        self._binary_columns = [k for k in self._features if self._features[k]['binary']]
          self._categorical_columns = [k for k in self._features if self._features[k]['categorical']]
-
-
+        self._numerical_columns = [k for k in self._features if self._features[k]['numerical']]
  
      @property
      def start(self):
@@ -75,7 +77,6 @@ class Preprocessing:
      def start(self, x):
          self._start = x
  
-
      @property
      def end(self):
          return self._end
@@ -84,7 +85,6 @@ class Preprocessing:
      def end(self, x):
          self._end = x
  
-
      @property
      def timestep(self):
          return self._timestep
@@ -93,7 +93,6 @@ class Preprocessing:
      def timestep(self, x):
          self._timestep = x
  
-
      def _fill_dict(self):
          '''
          Add datetime keys in the dated feature dictionary that are missing. The
@@ -105,16 +104,16 @@ class Preprocessing:
          while current <= self._end:
              self._datetimes.append(current)
              if current not in self._dict_features:
-                self._dict_features[current] = {feature:np.NaN
+                self._dict_features[current] = {feature: np.NaN
                                                  for feature in self._features}
              else:
-                null_dict = {feature:np.NaN
+                null_dict = {feature: np.NaN
                               for feature in self._features}
                  null_dict.update(self._dict_features[current])
                  self._dict_features[current] = null_dict
              current += self._timestep
          for k in self._dict_features:
-            null_dict = {feature:np.NaN
+            null_dict = {feature: np.NaN
                           for feature in self._features}
              null_dict.update(self._dict_features[k])
              self._dict_features[k] = null_dict
@@ -122,8 +121,6 @@ class Preprocessing:
          self._full_dict = {k: self._dict_features[k]
                             for k in sorted(self._dict_features.keys())}
  
-
-
      @property
      def full_dict(self):
          '''
@@ -133,7 +130,6 @@ class Preprocessing:
              self._fill_dict()
          return self._full_dict
  
-
      def _fill_nan(self):
          '''
          Fill NaN values, either by propagation or by interpolation (linear or splines)
@@ -150,7 +146,7 @@ class Preprocessing:
          elif self._config['PREPROCESSING']['fill_method'] == 'spline':
              self._dataframe[self._numerical_columns] =\
                  self._dataframe[self._numerical_columns].interpolate(method='spline',
-                     order=self._config['PREPROCESSING'].getint('order'))
+                                                                     order=self._config['PREPROCESSING'].getint('order'))
  
          # For the categorical columns, NaN values are filled by duplicating
          # the last known value (forward fill method)
@@ -173,21 +169,18 @@ class Preprocessing:
          self._dataframe = self._dataframe.drop(['row_ok'], axis=1)
          logger.info("Rows dropped")
  
-
      def _add_history(self):
          '''
          Integrating previous nb of interventions as features
          '''
          logger.info("Integrating previous nb of interventions as features")
-        nb_lines = self._config['HISTORY_KNOWLEDGE'].getint('nb_lines')
-        for k in range(1,nb_lines+1):
-            name = 'history_'+str(nb_lines-k+1)
-            self._dataframe[name] = [np.NaN]*k + list(self._dict_target.values())[:-k]
+        nb_lines = eval(self._config['HISTORY_KNOWLEDGE']['nb_lines'])
+        for k in range(1, nb_lines + 1):
+            name = 'history_' + str(nb_lines - k + 1)
+            self._dataframe[name] = [np.NaN] * k + list(self._dict_target.values())[:-k]
              self._numerical_columns.append(name)
          self._dataframe = self._dataframe[nb_lines:]
  
-
-
      def _standardize(self):
          '''
          Normalizing numerical features
@@ -197,26 +190,25 @@ class Preprocessing:
          self._dataframe[self._numerical_columns] =\
              preprocessing.scale(self._dataframe[self._numerical_columns])
  
-
-
      def _one_hot_encoding(self):
          '''
          Apply a one hot encoding for category features
          '''
          logger.info("One hot encoding for categorical feature")
-
          # We store numerical columns
          df_out = pd.DataFrame()
-        for col in  self._numerical_columns:
+        for col in self._numerical_columns:
+            df_out[col] = self._dataframe[col]
+        # Idem for binary features
+        for col in self._binary_columns:
              df_out[col] = self._dataframe[col]
          # The one hot encoding
          for col in self._categorical_columns:
-            pd1 = pd.get_dummies(self._dataframe[col],prefix=col)
+            pd1 = pd.get_dummies(self._dataframe[col], prefix=col)
              for col1 in pd1.columns:
                  df_out[col1] = pd1[col1]
          self._dataframe = df_out
  
-
      @property
      def dataframe(self):
          '''
@@ -236,7 +228,6 @@ class Preprocessing:
              self._one_hot_encoding()
          return self._dataframe
  
-
      @dataframe.setter
      def dataframe(self, df):
          self._dataframe = df
diff --git a/predictops/source/holidays.py b/predictops/source/holidays.py

index 1a536fe102e52b930314105006b8f3950fbd9f64..6893db03ef5ef7eb804d3f1d9ec391cf18a2dcaf 100644 (file)
--- a/predictops/source/holidays.py
+++ b/predictops/source/holidays.py
@@ -3,6 +3,7 @@ from datetime import datetime, timedelta
  from jours_feries_france.compute import JoursFeries
  from logging import getLogger
  from logging.config import fileConfig
+from pathlib import Path
  from vacances_scolaires_france import SchoolHolidayDates
  
  import itertools
@@ -90,17 +91,28 @@ class Holidays:
              bankHolidaysEve = tuple(u-timedelta(days=1) for u in bankHolidays)
              name = self._config['ZONE']['name']
              date = self._start
+            Date = datetime.date(date)
+            tomorrow = date + timedelta(days=1)
+            Tomorrow = datetime.date(tomorrow)
              d = SchoolHolidayDates()
+            dict_hour = {
+                'bankHolidays' : Date in bankHolidays,
+                'bankHolidaysEve': Date in bankHolidaysEve,
+                'holidays': d.is_holiday_for_zone(Date, self._get_academic_zone(name, date)),
+                'holidaysEve': d.is_holiday_for_zone(Tomorrow, self._get_academic_zone(name, tomorrow))
+            }
              while date <= self._end:
-                Date = datetime.date(date)
-                tomorrow = date + timedelta(days=1)
-                Tomorrow = datetime.date(tomorrow)
-                dict_hour = {
-                    'bankHolidays' : Date in bankHolidays,
-                    'bankHolidaysEve': Date in bankHolidaysEve,
-                    'holidays': d.is_holiday_for_zone(Date, self._get_academic_zone(name, date)),
-                    'holidaysEve': d.is_holiday_for_zone(Tomorrow, self._get_academic_zone(name, tomorrow))
-                }
                  self._dated_features[date] = dict_hour
+                current = date
                  date += timedelta(hours=1)
+                if date.day != current.day:
+                    Date = datetime.date(date)
+                    tomorrow = date + timedelta(days=1)
+                    Tomorrow = datetime.date(tomorrow)
+                    dict_hour = {
+                        'bankHolidays' : Date in bankHolidays,
+                        'bankHolidaysEve': Date in bankHolidaysEve,
+                        'holidays': d.is_holiday_for_zone(Date, self._get_academic_zone(name, date)),
+                        'holidaysEve': d.is_holiday_for_zone(Tomorrow, self._get_academic_zone(name, tomorrow))
+                    }
          return self._dated_features
 \ No newline at end of file
diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py

index ff6a238c534a1d4aa5ab807e0b85a43554722c86..0edd49f544f33ed07b17c93dc7b1e493a6d0e6e5 100644 (file)
--- a/predictops/source/meteofrance.py
+++ b/predictops/source/meteofrance.py
@@ -247,7 +247,7 @@ class MeteoFrance:
                  if (date >= self._start and date <= self._end)\
                  or (date.year == self._start.year and date.month == self._start.month)\
                  or (date.year == self._end.year and date.month == self._end.month):
-                    logger.info(f'Inserting {csv_meteo} in intervention dictionary')
+                    logger.info(f'Adding meteofrance features from {csv_meteo}')
                      with open(dir_data / csv_meteo, "r") as f:
                          reader = DictReader(f, delimiter=';')
                          for row in reader:
diff --git a/predictops/source/ramadan.py b/predictops/source/ramadan.py

new file mode 100644 (file)

index 0000000..6836df1
--- /dev/null
+++ b/predictops/source/ramadan.py
@@ -0,0 +1,71 @@
+from configparser import ConfigParser
+from convertdate import islamic
+from datetime import datetime, timedelta
+from logging import getLogger
+from logging.config import fileConfig
+from pathlib import Path
+
+
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+
+class Ramadan:
+
+    _start = None
+    _end = None
+
+    def __init__(self, config_file):
+
+        self._config = ConfigParser()
+        self._config.read(config_file)
+
+        # Collecting Ramadan features
+        self._features = [section for section in self._config
+                          if self._config[section].getboolean('numerical')
+                          or self._config[section].getboolean('categorical')]
+
+        self._dated_features = {}
+
+    @property
+    def start(self):
+        return self._start
+
+    @start.setter
+    def start(self, x):
+        self._start = x
+
+    @property
+    def end(self):
+        return self._end
+
+    @end.setter
+    def end(self, x):
+        self._end = x
+
+    @property
+    def dated_features(self):
+        if self._dated_features == {}:
+            logger.info("Adding Ramadan features")
+            date = self._start
+            while date <= self._end:
+                year, month, day = date.year, date.month, date.day
+                eve = datetime(year, month, day) - timedelta(days=1)
+                tomorrow = datetime(year, month, day) + timedelta(days=1)
+                Hegirian_month = islamic.from_gregorian(year, month, day)[1]
+                dict_hour = {
+                    'ramadanEve': False,
+                    'ramadan': False,
+                    'ramadanDayAfter': False
+                }
+                if Hegirian_month == 8 and\
+                   islamic.from_gregorian(tomorrow.year, tomorrow.month, tomorrow.day)[1] == 9:
+                    dict_hour['ramadanEve'] = True
+                elif Hegirian_month == 9:
+                    dict_hour['ramadan'] = True
+                elif Hegirian_month == 10 and\
+                        islamic.from_gregorian(eve.year, eve.month, eve.day)[1] == 9:
+                    dict_hour['ramadanDayAfter'] = True
+                self._dated_features[date] = dict_hour
+                date += timedelta(hours=1)
+        return self._dated_features
diff --git a/predictops/target/target.py b/predictops/target/target.py

index b998120b1be6b3b5524e8263c155370b6edceb98..9e3d86d31de82f24ad8a8c2bd6abf6e742f446fe 100644 (file)
--- a/predictops/target/target.py
+++ b/predictops/target/target.py
@@ -12,7 +12,7 @@ logger = getLogger()
  class Target:
  
      def __init__(self, config_file = None,
-                 start = None, end = None, timestep = None):
+                 start = None, end = None, timestep = None, cumulative = None):
  
          self._config = ConfigParser()
          self._config.read(config_file)
@@ -20,6 +20,7 @@ class Target:
          self._start = start
          self._end = end
          self._timestep = timestep
+        self._cumulative = cumulative
  
          logger.info('Initialization of target variable')
          self._y = {}
@@ -68,36 +69,39 @@ class Target:
              logger.info('Integrating interventions for the whole area')
              with open(self._stream_file) as f:
                  reader = DictReader(f, delimiter=',')
-                for row in reader:
-                    if row['start'] != '':
-                        start_interv = datetime.strptime(row['start'], '%d/%m/%Y %H:%M:%S')
-                        start_interv = start_interv.replace(minute=0)
-                        end_interv = datetime.strptime(row['end'], '%d/%m/%Y %H:%M:%S')
-                        end_interv = end_interv.replace(minute=0)
-                        if not (start_interv > self._end or end_interv < self._start):
-                            if start_interv < self._start and end_interv <= self._end:
-                                current = self._start
-                                while current <= end_interv:
-                                    self._y[current] += 1
-                                    current += self._timestep
-                            elif start_interv >= self._start and end_interv > self._end:
-                                current = start_interv
-                                while current not in self._y:
-                                    current -= timedelta(hours=1)
-                                while current <= self._end:
-                                    self._y[current] += 1
-                                    current += self._timestep
-                            elif start_interv >= self._start and end_interv <= self._end:
-                                current = start_interv
-                                while current not in self._y:
-                                    current -= timedelta(hours=1)
-                                while current <= end_interv:
-                                    self._y[current] += 1
-                                    current += self._timestep
-
-
-
-
-
+                if self._cumulative:
+                    for row in reader:
+                        if row['start'] != '':
+                            start_interv = datetime.strptime(row['start'], '%d/%m/%Y %H:%M:%S')
+                            start_interv = start_interv.replace(minute=0)
+                            end_interv = datetime.strptime(row['end'], '%d/%m/%Y %H:%M:%S')
+                            end_interv = end_interv.replace(minute=0)
+                            if not (start_interv > self._end or end_interv < self._start):
+                                if start_interv < self._start and end_interv <= self._end:
+                                    current = self._start
+                                    while current <= end_interv:
+                                        self._y[current] += 1
+                                        current += self._timestep
+                                elif start_interv >= self._start and end_interv > self._end:
+                                    current = start_interv
+                                    while current not in self._y:
+                                        current -= timedelta(hours=1)
+                                    while current <= self._end:
+                                        self._y[current] += 1
+                                        current += self._timestep
+                                elif start_interv >= self._start and end_interv <= self._end:
+                                    current = start_interv
+                                    while current not in self._y:
+                                        current -= timedelta(hours=1)
+                                    while current <= end_interv:
+                                        self._y[current] += 1
+                                        current += self._timestep
+                else:
+                    for row in reader:
+                        if row['start'] != '':
+                            start_interv = datetime.strptime(row['start'], '%d/%m/%Y %H:%M:%S')
+                            start_interv = start_interv.replace(minute=0)
+                            if start_interv in self._y:
+                                self._y[start_interv] += 1
  
  
diff --git a/requirements.txt b/requirements.txt

index 3e40e81e4f54c2c996a71f4146c13a9ff589b8f8..6615f1fc4c5a72ac226089c20e20c6f9babcaef1 100644 (file)
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,14 +2,22 @@ attrs==19.3.0
  Click==7.0
  click-plugins==1.1.1
  cligj==0.5.0
+convertdate==2.2.0
+cycler==0.10.0
  Fiona==1.8.13
  geographiclib==1.50
  geopandas==0.6.3
  geopy==1.21.0
  joblib==0.14.1
+jours-feries-france==0.5.1
+kiwisolver==1.1.0
+lightgbm==2.3.1
+matplotlib==3.1.3
  munch==2.5.0
  numpy==1.18.1
  pandas==1.0.1
+PyMeeus==0.3.6
+pyparsing==2.4.6
  pyproj==2.4.2.post1
  python-dateutil==2.8.1
  pytz==2019.3
@@ -17,5 +25,6 @@ scikit-learn==0.22.1
  scipy==1.4.1
  Shapely==1.7.0
  six==1.14.0
+vacances-scolaires-france==0.7.0
  xgboost==0.90
  xlrd==1.2.0
author	Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
	Sun, 23 Feb 2020 08:39:21 +0000 (09:39 +0100)
committer	Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
	Sun, 23 Feb 2020 08:39:21 +0000 (09:39 +0100)
config/features/feature_ephemeris.cfg		patch \| blob \| history
config/features/feature_holidays.cfg		patch \| blob \| history
config/features/feature_meteo.cfg		patch \| blob \| history
config/features/feature_ramadan.cfg	[new file with mode: 0644]	patch \| blob
config/learn.cfg		patch \| blob \| history
config/learners/lightgbm.cfg	[new file with mode: 0644]	patch \| blob
config/learners/xgboost.cfg		patch \| blob \| history
main.py		patch \| blob \| history
predictops/engine.py		patch \| blob \| history
predictops/learn/learning.py		patch \| blob \| history
predictops/learn/preprocessing.py		patch \| blob \| history
predictops/source/holidays.py		patch \| blob \| history
predictops/source/meteofrance.py		patch \| blob \| history
predictops/source/ramadan.py	[new file with mode: 0644]	patch \| blob
predictops/target/target.py		patch \| blob \| history
requirements.txt		patch \| blob \| history