Learning process: first version

author Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>

Tue, 18 Feb 2020 12:39:49 +0000 (13:39 +0100)

committer Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>

Tue, 18 Feb 2020 12:39:49 +0000 (13:39 +0100)
author Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Tue, 18 Feb 2020 12:39:49 +0000 (13:39 +0100)
committer Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Tue, 18 Feb 2020 12:39:49 +0000 (13:39 +0100)
diff --git a/config/learn.cfg b/config/learn.cfg

index 1a9566eee07f32774bf5d8655cab5c3eb4b591c2..53c62deeb541804097e3a30cb8ee04449038aff2 100644 (file)
--- a/config/learn.cfg
+++ b/config/learn.cfg
@@ -1,7 +1,7 @@
  [DATETIME]
-start    = 01/01/2010 01:00:00
-end      = 12/31/2010 23:00:00
-hourStep = 6
+start    = 01/01/2006 00:00:00
+end      = 12/31/2017 23:00:00
+hourStep = 1
  
  
  [FEATURES]
@@ -25,3 +25,7 @@ nb_lines = 5
  
  [TARGET]
  config = (Path.cwd() / 'config') / 'targets' / 'sdis25.cfg'
+
+
+[LEARNER]
+config =  (Path.cwd() / 'config') / 'learners' / 'xgboost.cfg'
+\ No newline at end of file
diff --git a/config/learners/xgboost.cfg b/config/learners/xgboost.cfg

new file mode 100644 (file)

index 0000000..61975c2
--- /dev/null
+++ b/config/learners/xgboost.cfg
@@ -0,0 +1,2 @@
+[MODEL]
+method = xgboost
+\ No newline at end of file
diff --git a/main.py b/main.py

index cf8fe81d13940c996d9abb1344c9a296d6f199f1..27f502aeaa676fd0a0500e2ca4f20a295f5d0ce9 100644 (file)
--- a/main.py
+++ b/main.py
@@ -18,6 +18,7 @@ if __name__ == '__main__':
  
      engine.add_preprocessing()
  
+    engine.learn()
  
      '''target = toarea(stream_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv')
  
diff --git a/predictops/engine.py b/predictops/engine.py

index 44ab9c4c08bc089fd53038f5cd84fe0b4d557ef1..f87e82e833fbd87bcc091f4dc568108a9bf86a21 100644 (file)
--- a/predictops/engine.py
+++ b/predictops/engine.py
@@ -7,6 +7,7 @@ from shutil import rmtree
  
  from .source.ephemeris import Ephemeris
  from .source.meteofrance import MeteoFrance
+from .learn.learning import Learning
  from .learn.preprocessing import Preprocessing
  from .target.target import Target
  
@@ -73,12 +74,16 @@ class Engine:
  
  
      def add_preprocessing(self):
-        process = Preprocessing(config_file = self._config,
-                                dict_features = self.X,
-                                dict_target = self.y)
-        print(process.dataframe.head(n=2))
+        self._preproc = Preprocessing(config_file = self._config,
+                                      dict_features = self.X,
+                                      dict_target = self.y)
  
  
+    def learn(self):
+        history = self._config['HISTORY_KNOWLEDGE'].getint('nb_lines')
+        self._learner = Learning(config_file = eval(self._config['LEARNER']['config']),
+                                 X = self._preproc.dataframe, y = list(self.y.values())[history:])
+
  
      @property
      def X(self):
diff --git a/predictops/learn/learning.py b/predictops/learn/learning.py

new file mode 100644 (file)

index 0000000..4164500
--- /dev/null
+++ b/predictops/learn/learning.py
@@ -0,0 +1,45 @@
+from configparser import ConfigParser
+from math import sqrt
+from sklearn.metrics import mean_squared_error, mean_absolute_error
+from sklearn.model_selection import train_test_split
+
+import xgboost
+
+class Learning:
+
+    def __init__(self, config_file = None,
+                 X = None, y = None):
+        self._config = ConfigParser()
+        self._config.read(config_file)
+
+        df = X
+        df['cible'] = y
+
+        print(df.head())
+
+        train_val_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)
+        train_set, val_set = train_test_split(train_val_set, test_size = 0.2, random_state = 42)
+
+        X_test = test_set.drop('cible', axis = 1)
+        y_test = test_set['cible'].copy()
+
+        X_train = train_set.drop('cible', axis=1)
+        y_train = train_set['cible'].copy()
+        X_val = val_set.drop('cible', axis=1)
+        y_val = val_set['cible'].copy()
+
+
+        if self._config['MODEL']['method'] == 'xgboost':
+            xgb_reg = xgboost.XGBRegressor(learning_rate = 0.01,
+                                                   max_depth = 10,
+                                                   random_state=42,
+                                                   n_estimators = 173,
+                                                   n_jobs=-1,
+                                                   objective = 'count:poisson')
+
+            xgb_reg.fit(X_train, y_train,
+                        eval_set=[(X_val, y_val)],
+                        early_stopping_rounds=10)
+
+            y_test_pred = xgb_reg.predict(X_test)
+            print(sqrt(mean_squared_error(y_test_pred, y_test)), mean_absolute_error(y_test_pred,y_test))
+\ No newline at end of file
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py

index 187a5b73b664da58031d45d55839548a10ec1be1..51ecb4e162ff77b804a6a9e4790c4e9da34e1410 100644 (file)
--- a/predictops/learn/preprocessing.py
+++ b/predictops/learn/preprocessing.py
@@ -182,15 +182,11 @@ class Preprocessing:
          '''
          logger.info("Integrating previous nb of interventions as features")
          nb_lines = self._config['HISTORY_KNOWLEDGE'].getint('nb_lines')
-        print(len(self._dataframe))
-        print(self._dataframe.head(4))
          for k in range(1,nb_lines+1):
              name = 'history_'+str(nb_lines-k+1)
              self._dataframe[name] = [np.NaN]*k + list(self._dict_target.values())[:-k]
              self._numerical_columns.append(name)
          self._dataframe = self._dataframe[nb_lines:]
-        print(self._dataframe.head(4))
-        print(len(self._dataframe))
  
  
  
diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py

index 6bd23edc1435b857c5a2a00150778870ad30ebd7..b26c6bf6525f0a87d1cba73d251e7937c89beac6 100644 (file)
--- a/predictops/source/meteofrance.py
+++ b/predictops/source/meteofrance.py
@@ -58,6 +58,9 @@ class MeteoFrance(Source):
          self._config = ConfigParser()
          self._config.read(config_file)
  
+        self._latitude = self._config['POSITION'].getfloat('latitude')
+        self._longitude = self._config['POSITION'].getfloat('longitude')
+
          self._data_directory = (Path.cwd() / 'data') / 'features' / 'meteo_france'
  
          self._dated_features = None
diff --git a/requirements.txt b/requirements.txt

index 6249a161c878fe217583f52d727df621886c4b38..3e40e81e4f54c2c996a71f4146c13a9ff589b8f8 100644 (file)
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,4 +17,5 @@ scikit-learn==0.22.1
  scipy==1.4.1
  Shapely==1.7.0
  six==1.14.0
+xgboost==0.90
  xlrd==1.2.0
author	Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
	Tue, 18 Feb 2020 12:39:49 +0000 (13:39 +0100)
committer	Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
	Tue, 18 Feb 2020 12:39:49 +0000 (13:39 +0100)
config/learn.cfg		patch | blob | history
config/learners/xgboost.cfg	[new file with mode: 0644]	patch | blob
main.py		patch | blob | history
predictops/engine.py		patch | blob | history
predictops/learn/learning.py	[new file with mode: 0644]	patch | blob
predictops/learn/preprocessing.py		patch | blob | history
predictops/source/meteofrance.py		patch | blob | history
requirements.txt		patch | blob | history