From: Christophe Guyeux Date: Tue, 18 Feb 2020 12:39:49 +0000 (+0100) Subject: Learning process: first version X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/predictops.git/commitdiff_plain/288baa6ff06c1b815ec24d164770acc93ac80499 Learning process: first version --- diff --git a/config/learn.cfg b/config/learn.cfg index 1a9566e..53c62de 100644 --- a/config/learn.cfg +++ b/config/learn.cfg @@ -1,7 +1,7 @@ [DATETIME] -start = 01/01/2010 01:00:00 -end = 12/31/2010 23:00:00 -hourStep = 6 +start = 01/01/2006 00:00:00 +end = 12/31/2017 23:00:00 +hourStep = 1 [FEATURES] @@ -25,3 +25,7 @@ nb_lines = 5 [TARGET] config = (Path.cwd() / 'config') / 'targets' / 'sdis25.cfg' + + +[LEARNER] +config = (Path.cwd() / 'config') / 'learners' / 'xgboost.cfg' \ No newline at end of file diff --git a/config/learners/xgboost.cfg b/config/learners/xgboost.cfg new file mode 100644 index 0000000..61975c2 --- /dev/null +++ b/config/learners/xgboost.cfg @@ -0,0 +1,2 @@ +[MODEL] +method = xgboost \ No newline at end of file diff --git a/main.py b/main.py index cf8fe81..27f502a 100644 --- a/main.py +++ b/main.py @@ -18,6 +18,7 @@ if __name__ == '__main__': engine.add_preprocessing() + engine.learn() '''target = toarea(stream_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv') diff --git a/predictops/engine.py b/predictops/engine.py index 44ab9c4..f87e82e 100644 --- a/predictops/engine.py +++ b/predictops/engine.py @@ -7,6 +7,7 @@ from shutil import rmtree from .source.ephemeris import Ephemeris from .source.meteofrance import MeteoFrance +from .learn.learning import Learning from .learn.preprocessing import Preprocessing from .target.target import Target @@ -73,12 +74,16 @@ class Engine: def add_preprocessing(self): - process = Preprocessing(config_file = self._config, - dict_features = self.X, - dict_target = self.y) - print(process.dataframe.head(n=2)) + self._preproc = Preprocessing(config_file = self._config, + dict_features = self.X, + dict_target = self.y) + def learn(self): + history = self._config['HISTORY_KNOWLEDGE'].getint('nb_lines') + self._learner = Learning(config_file = eval(self._config['LEARNER']['config']), + X = self._preproc.dataframe, y = list(self.y.values())[history:]) + @property def X(self): diff --git a/predictops/learn/learning.py b/predictops/learn/learning.py new file mode 100644 index 0000000..4164500 --- /dev/null +++ b/predictops/learn/learning.py @@ -0,0 +1,45 @@ +from configparser import ConfigParser +from math import sqrt +from sklearn.metrics import mean_squared_error, mean_absolute_error +from sklearn.model_selection import train_test_split + +import xgboost + +class Learning: + + def __init__(self, config_file = None, + X = None, y = None): + self._config = ConfigParser() + self._config.read(config_file) + + df = X + df['cible'] = y + + print(df.head()) + + train_val_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42) + train_set, val_set = train_test_split(train_val_set, test_size = 0.2, random_state = 42) + + X_test = test_set.drop('cible', axis = 1) + y_test = test_set['cible'].copy() + + X_train = train_set.drop('cible', axis=1) + y_train = train_set['cible'].copy() + X_val = val_set.drop('cible', axis=1) + y_val = val_set['cible'].copy() + + + if self._config['MODEL']['method'] == 'xgboost': + xgb_reg = xgboost.XGBRegressor(learning_rate = 0.01, + max_depth = 10, + random_state=42, + n_estimators = 173, + n_jobs=-1, + objective = 'count:poisson') + + xgb_reg.fit(X_train, y_train, + eval_set=[(X_val, y_val)], + early_stopping_rounds=10) + + y_test_pred = xgb_reg.predict(X_test) + print(sqrt(mean_squared_error(y_test_pred, y_test)), mean_absolute_error(y_test_pred,y_test)) \ No newline at end of file diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py index 187a5b7..51ecb4e 100644 --- a/predictops/learn/preprocessing.py +++ b/predictops/learn/preprocessing.py @@ -182,15 +182,11 @@ class Preprocessing: ''' logger.info("Integrating previous nb of interventions as features") nb_lines = self._config['HISTORY_KNOWLEDGE'].getint('nb_lines') - print(len(self._dataframe)) - print(self._dataframe.head(4)) for k in range(1,nb_lines+1): name = 'history_'+str(nb_lines-k+1) self._dataframe[name] = [np.NaN]*k + list(self._dict_target.values())[:-k] self._numerical_columns.append(name) self._dataframe = self._dataframe[nb_lines:] - print(self._dataframe.head(4)) - print(len(self._dataframe)) diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py index 6bd23ed..b26c6bf 100644 --- a/predictops/source/meteofrance.py +++ b/predictops/source/meteofrance.py @@ -58,6 +58,9 @@ class MeteoFrance(Source): self._config = ConfigParser() self._config.read(config_file) + self._latitude = self._config['POSITION'].getfloat('latitude') + self._longitude = self._config['POSITION'].getfloat('longitude') + self._data_directory = (Path.cwd() / 'data') / 'features' / 'meteo_france' self._dated_features = None diff --git a/requirements.txt b/requirements.txt index 6249a16..3e40e81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,4 +17,5 @@ scikit-learn==0.22.1 scipy==1.4.1 Shapely==1.7.0 six==1.14.0 +xgboost==0.90 xlrd==1.2.0