From: Christophe Guyeux Date: Thu, 20 Feb 2020 11:03:33 +0000 (+0100) Subject: Reducing the computation time and adding holidays features X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/predictops.git/commitdiff_plain/a2faba3f0797b7be72d0c8fa9cb9db67456136d6?hp=964c1b87a6996c828c150a2b06a827350a4c2b10 Reducing the computation time and adding holidays features --- diff --git a/config/features/ephemeris_features.csv b/config/features/ephemeris_features.csv deleted file mode 100644 index 4b75f8a..0000000 --- a/config/features/ephemeris_features.csv +++ /dev/null @@ -1,8 +0,0 @@ -name,type -hour,3 -dayInWeek,3 -dayInMonth,2 -dayInYear,3 -weekInYear,3 -month,3 -year,3 diff --git a/config/features/feature_ephemeris.cfg b/config/features/feature_ephemeris.cfg index ddd9f8b..decc787 100644 --- a/config/features/feature_ephemeris.cfg +++ b/config/features/feature_ephemeris.cfg @@ -1,26 +1,27 @@ -[FEATURES] -hour = True -dayInWeek = True -dayInMonth = True -dayInYear = True -weekInYear = True -month = True -year = True - [hour] -numerical = False +categorical = True +numerical = False [dayInWeek] -numerical = False +categorical = True +numerical = False + +[dayInMonth] +categorical = True +numerical = False [dayInYear] -numerical = False +categorical = True +numerical = False [weekInYear] -numerical = False +categorical = True +numerical = False [month] -numerical = True +categorical = False +numerical = True [year] -numerical = True \ No newline at end of file +categorical = False +numerical = True \ No newline at end of file diff --git a/config/features/feature_holidays.cfg b/config/features/feature_holidays.cfg new file mode 100644 index 0000000..c3b3063 --- /dev/null +++ b/config/features/feature_holidays.cfg @@ -0,0 +1,18 @@ +[ZONE] +name = Besançon + +[bankHolidays] +categorical = True +numerical = False + +[bankHolidaysEve] +categorical = True +numerical = False + +[holidays] +categorical = True +numerical = False + +[holidaysEve] +categorical = True +numerical = False diff --git a/config/features/feature_meteo.cfg b/config/features/feature_meteo.cfg index 02bdab7..04f6c62 100644 --- a/config/features/feature_meteo.cfg +++ b/config/features/feature_meteo.cfg @@ -9,17 +9,67 @@ longitude = 6.0333 [STATIONS] nb_stations = 3 -[FEATURES] -temperature = True -pressure = True -pressureVariation = False -barometricTrend = False -humidity = False -dewPoint = False -lastHourRainfall = False -last3hHourRainfall = False -meanWindSpeed10min = False -meanWindDirection10min = False -gustsOverAPeriod = False -horizontalVisibility = False -currentWeather = False \ No newline at end of file +[temperature] +abbreviation = t +categorical = False +numerical = True + +[pressure] +abbreviation = pres +categorical = False +numerical = True + +[pressureVariation] +abbreviation = tend +categorical = False +numerical = True + +[barometricTrend] +abbreviation = cod_tend +categorical = True +numerical = False + +[humidity] +abbreviation = u +categorical = False +numerical = True + +[dewPoint] +abbreviation = td +categorical = False +numerical = True + +[lastHourRainfall] +abbreviation = rr1 +categorical = False +numerical = True + +[last3hHourRainfall] +abbreviation = rr3 +categorical = False +numerical = True + +[meanWindSpeed10min] +abbreviation = ff +categorical = False +numerical = True + +[meanWindDirection10min] +abbreviation = dd +categorical = False +numerical = True + +[gustsOverAPeriod] +abbreviation = rafper +categorical = False +numerical = True + +[horizontalVisibility] +abbreviation = vv +categorical = False +numerical = True + +[currentWeather] +abbreviation = ww +categorical = True +numerical = False \ No newline at end of file diff --git a/config/features/meteofrance_features.csv b/config/features/meteofrance_features.csv deleted file mode 100644 index 8623303..0000000 --- a/config/features/meteofrance_features.csv +++ /dev/null @@ -1,14 +0,0 @@ -abbreviation,name,unit,format,type -t,temperature,K,real,1 -pres,pressure,Pa,integer,1 -tend,pressureVariation,Pa,integer,1 -cod_tend,barometricTrend,code,integer,2 -u,humidity,%,integer,1 -td,dewPoint,K,real,1 -rr1,lastHourRainfall,mm,real,1 -rr3,last3hHourRainfall,mm,real,1 -ff,meanWindSpeed10min,m/s,real,1 -dd,meanWindDirection10min,degré,integer,1 -rafper,gustsOverAPeriod,m/s,real,1 -vv,horizontalVisibility,m,real,1 -ww,currentWeather,code,integer,2 diff --git a/config/learn.cfg b/config/learn.cfg index 29bd628..82c67dd 100644 --- a/config/learn.cfg +++ b/config/learn.cfg @@ -1,17 +1,19 @@ [DATETIME] -start = 01/01/2010 01:00:00 -end = 12/31/2017 23:00:00 -hourStep = 5 +start = 01/01/2016 00:00:00 +end = 12/31/2018 23:00:00 +hourStep = 3 [FEATURES] meteofrance = True ephemeris = True +holidays = True [FEATURE_CONFIG] meteofrance = (Path.cwd() / 'config') / 'features' / 'feature_meteo.cfg' ephemeris = (Path.cwd() / 'config') / 'features' / 'feature_ephemeris.cfg' +holidays = (Path.cwd() / 'config') / 'features' / 'feature_holidays.cfg' [PREPROCESSING] diff --git a/config/learners/xgboost.cfg b/config/learners/xgboost.cfg index 0dd78d0..d099aa4 100644 --- a/config/learners/xgboost.cfg +++ b/config/learners/xgboost.cfg @@ -1,10 +1,11 @@ [MODEL] method = xgboost + [HYPERPARAMETERS] -learning_rate = 0.01, -max_depth = 10, -random_state=42, -n_estimators = 173, -n_jobs=-1, -objective = 'count:poisson' \ No newline at end of file +learning_rate = 0.01 +max_depth = 7 +random_state = 42 +n_estimators = 1000 +n_jobs = -1 +objective = 'count:poisson' \ No newline at end of file diff --git a/predictops/engine.py b/predictops/engine.py index f87e82e..e7bbf1c 100644 --- a/predictops/engine.py +++ b/predictops/engine.py @@ -6,6 +6,7 @@ from pathlib import Path from shutil import rmtree from .source.ephemeris import Ephemeris +from .source.holidays import Holidays from .source.meteofrance import MeteoFrance from .learn.learning import Learning from .learn.preprocessing import Preprocessing @@ -66,6 +67,17 @@ class Engine: for date in dated_features: self._X.setdefault(date,{}).update(dated_features[date]) + if self._config['FEATURES'].getboolean('holidays'): + holidays = Holidays(config_file = + eval(self._config['FEATURE_CONFIG']['holidays'])) + + holidays.start = self._start + holidays.end = self._end + + dated_features = holidays.dated_features + for date in dated_features: + self._X.setdefault(date,{}).update(dated_features[date]) + def add_target(self): self._target = Target(config_file = eval(self._config['TARGET']['config']), diff --git a/predictops/learn/learning.py b/predictops/learn/learning.py index 4164500..9a5860a 100644 --- a/predictops/learn/learning.py +++ b/predictops/learn/learning.py @@ -15,8 +15,6 @@ class Learning: df = X df['cible'] = y - print(df.head()) - train_val_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42) train_set, val_set = train_test_split(train_val_set, test_size = 0.2, random_state = 42) @@ -30,12 +28,13 @@ class Learning: if self._config['MODEL']['method'] == 'xgboost': - xgb_reg = xgboost.XGBRegressor(learning_rate = 0.01, - max_depth = 10, - random_state=42, - n_estimators = 173, - n_jobs=-1, - objective = 'count:poisson') + + xgb_reg = xgboost.XGBRegressor(learning_rate = self._config['HYPERPARAMETERS'].getfloat('learning_rate'), + max_depth = self._config['HYPERPARAMETERS'].getint('max_depth'), + random_state = self._config['HYPERPARAMETERS'].getint('random_state'), + n_estimators = self._config['HYPERPARAMETERS'].getint('n_estimators'), + n_jobs = self._config['HYPERPARAMETERS'].getint('n_jobs'), + objective = 'count:poisson') xgb_reg.fit(X_train, y_train, eval_set=[(X_val, y_val)], diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py index 106a626..885aad3 100644 --- a/predictops/learn/preprocessing.py +++ b/predictops/learn/preprocessing.py @@ -48,28 +48,22 @@ class Preprocessing: self._features = set(chain.from_iterable([tuple(u.keys()) for u in [*dict_features.values()]])) - feature_files = Path.cwd() / 'config' / 'features' - self._features = {feat : {'numerical': False} for feat in self._features} - for feature_file in listdir(feature_files): - if feature_file.endswith('csv'): - with open(feature_files / feature_file , "r") as f: - reader = DictReader(f, delimiter=',') - typed_names = {row['name']: row['type'] for row in reader} - for feature in self._features: - if feature.split('_')[0] in typed_names: - self._features[feature]['type'] = int(typed_names[feature.split('_')[0]]) - elif feature_file.endswith('cfg'): + #feature_files = Path.cwd() / 'config' / 'features' + self._features = {feat : {'numerical': False, 'categorical': False} + for feat in self._features} + + for feature in self._config['FEATURES']: + if self._config['FEATURES'][feature]: + feature_file = self._config['FEATURE_CONFIG'][feature] config = ConfigParser() - config.read(feature_files / feature_file) + config.read(feature_file) for section in config: if config.has_option(section, 'numerical'): self._features[section]['numerical'] = config[section].getboolean('numerical') + self._features[section]['categorical'] = config[section].getboolean('categorical') - self._numerical_columns = [k for k in self._features if self._features[k]['type'] == 1 - or (self._features[k]['type'] == 3 and self._features[k]['numerical'])] - - self._categorical_columns = [k for k in self._features if self._features[k]['type'] == 2 - or (self._features[k]['type'] == 3 and not self._features[k]['numerical'])] + self._numerical_columns = [k for k in self._features if self._features[k]['numerical']] + self._categorical_columns = [k for k in self._features if self._features[k]['categorical']] @@ -172,12 +166,11 @@ class Preprocessing: # Dropping rows that are not related to our datetime window (start/ # step / end) logger.info("Dropping rows that are not related to our datetime window") - self._dataframe['datetime'] =\ - self._dataframe.apply(lambda x: datetime(int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)), axis=1) + dates = tuple((x.year, x.month, x.day, x.hour) for x in self._datetimes) self._dataframe['row_ok'] =\ - self._dataframe.apply(lambda x:x.datetime in self._datetimes, axis=1) + self._dataframe.apply(lambda x: (int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)) in dates, axis=1) self._dataframe = self._dataframe[self._dataframe['row_ok']] - self._dataframe = self._dataframe.drop(['datetime', 'row_ok'], axis=1) + self._dataframe = self._dataframe.drop(['row_ok'], axis=1) logger.info("Rows dropped") diff --git a/predictops/source/ephemeris.py b/predictops/source/ephemeris.py index 2a34364..e1c07ec 100644 --- a/predictops/source/ephemeris.py +++ b/predictops/source/ephemeris.py @@ -1,14 +1,14 @@ -from .source import Source - from configparser import ConfigParser -from csv import DictReader from datetime import datetime, timedelta +from logging import getLogger +from logging.config import fileConfig from pathlib import Path import time import calendar -CSV_FILE = Path.cwd() / 'config' / 'features' / 'ephemeris_features.csv' +fileConfig((Path.cwd() / 'config') / 'logging.cfg') +logger = getLogger() class Ephemeris: @@ -17,17 +17,13 @@ class Ephemeris: def __init__(self, config_file): - # Check for the integrity of feature names - Source.__init__(self) - self._config = ConfigParser() self._config.read(config_file) # Collecting ephemeris features - with open(CSV_FILE, "r") as f: - reader = DictReader(f, delimiter=',') - self._features = [row['name'] for row in reader - if self._config['FEATURES'].getboolean(row['name'])] + self._features = [section for section in self._config + if self._config[section].getboolean('numerical') + or self._config[section].getboolean('categorical')] self._dated_features = {} @@ -50,10 +46,10 @@ class Ephemeris: self._end = x - @property def dated_features(self): if self._dated_features == {}: + logger.info("Adding ephemeris features") date = self._start while date <= self._end: dict_hour = {} diff --git a/predictops/source/holidays.py b/predictops/source/holidays.py new file mode 100644 index 0000000..1a536fe --- /dev/null +++ b/predictops/source/holidays.py @@ -0,0 +1,106 @@ +from configparser import ConfigParser +from datetime import datetime, timedelta +from jours_feries_france.compute import JoursFeries +from logging import getLogger +from logging.config import fileConfig +from vacances_scolaires_france import SchoolHolidayDates + +import itertools + +fileConfig((Path.cwd() / 'config') / 'logging.cfg') +logger = getLogger() + +class Holidays: + + _start = None + _end = None + + def __init__(self, config_file): + + self._config = ConfigParser() + self._config.read(config_file) + + # Collecting holidays features + self._features = [section for section in self._config + if self._config[section].getboolean('numerical') + or self._config[section].getboolean('categorical')] + + self._dated_features = {} + + + @property + def start(self): + return self._start + + @start.setter + def start(self, x): + self._start = x + + + @property + def end(self): + return self._end + + @end.setter + def end(self, x): + self._end = x + + + + def _get_academic_zone(self, name, date): + dict_zones = { + 'Caen' : ('A', 'B'), + 'Clermont-Ferrand' : ('A', 'A'), + 'Grenoble' : ('A', 'A'), + 'Lyon' : ('A', 'A'), + 'Montpellier' : ('A', 'C'), + 'Nancy-Metz' : ('A', 'B'), + 'Nantes' : ('A', 'B'), + 'Rennes' : ('A', 'B'), + 'Toulouse' : ('A', 'C'), + 'Aix-Marseille' : ('B', 'B'), + 'Amiens' : ('B', 'B'), + 'Besançon' : ('B', 'A'), + 'Dijon' : ('B', 'A'), + 'Lille' : ('B', 'B'), + 'Limoges' : ('B', 'A'), + 'Nice' : ('B', 'B'), + 'Orléans-Tours' : ('B', 'B'), + 'Poitiers' : ('B', 'A'), + 'Reims' : ('B', 'B'), + 'Rouen ' : ('B', 'B'), + 'Strasbourg' : ('B', 'B'), + 'Bordeaux' : ('C', 'A'), + 'Créteil' : ('C', 'C'), + 'Paris' : ('C', 'C'), + 'Versailles' : ('C', 'C') + } + if date < datetime(2016, 1, 1): + return dict_zones[name][0] + else: + return dict_zones[name][1] + + + @property + def dated_features(self): + if self._dated_features == {}: + logger.info("Adding holidays features") + bankHolidays = tuple(itertools.chain.from_iterable(list(JoursFeries.for_year(k).values()) + for k in range(self.start.year, self.end.year+1))) + bankHolidaysEve = tuple(u-timedelta(days=1) for u in bankHolidays) + name = self._config['ZONE']['name'] + date = self._start + d = SchoolHolidayDates() + while date <= self._end: + Date = datetime.date(date) + tomorrow = date + timedelta(days=1) + Tomorrow = datetime.date(tomorrow) + dict_hour = { + 'bankHolidays' : Date in bankHolidays, + 'bankHolidaysEve': Date in bankHolidaysEve, + 'holidays': d.is_holiday_for_zone(Date, self._get_academic_zone(name, date)), + 'holidaysEve': d.is_holiday_for_zone(Tomorrow, self._get_academic_zone(name, tomorrow)) + } + self._dated_features[date] = dict_hour + date += timedelta(hours=1) + return self._dated_features \ No newline at end of file diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py index b26c6bf..ff6a238 100644 --- a/predictops/source/meteofrance.py +++ b/predictops/source/meteofrance.py @@ -1,5 +1,3 @@ -from .source import Source - from configparser import ConfigParser from csv import DictReader from datetime import datetime @@ -18,10 +16,8 @@ import gzip fileConfig((Path.cwd() / 'config') / 'logging.cfg') logger = getLogger() -CSV_FILE = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv' - -class MeteoFrance(Source): +class MeteoFrance: _latitude = None _longitude = None @@ -33,28 +29,7 @@ class MeteoFrance(Source): def __init__(self, config_file): ''' Constructor of the MeteoFrance source of feature. - - - It will reinitiate the data directory, if asked in the config - features.cfg file. - - It searches for the nb_stations meteo stations closest to the provided - point (longitude and latitude) - - For more information about this source of feature, see: - https://donneespubliques.meteofrance.fr/?fond=produit&id_produit=90&id_rubrique=32 - - Parameters: - - in config file: - latitude (float): The latitude from which we want the meteo features. - longitude (float): The longitude from which we want the meteo features. - nb_stations (int): Number of closest stations to consider. - - provided to the constructor - features (list): Weather features that have to be integrated, according - to their names in meteofrance_features.csv (cf. config directory) - ''' - # Check for the integrity of feature names - Source.__init__(self) - self._config = ConfigParser() self._config.read(config_file) @@ -74,10 +49,11 @@ class MeteoFrance(Source): self._stations = self._get_stations() # Collecting meteofrance features - with open(CSV_FILE, "r") as f: - reader = DictReader(f, delimiter=',') - self._features = [row['name'] for row in reader - if self._config['FEATURES'].getboolean(row['name'])] + self._features = [section for section in self._config + if self._config.has_option(section, 'numerical') + and (self._config[section]['numerical'] or + self._config[section]['categorical'])] + @property @@ -255,21 +231,18 @@ class MeteoFrance(Source): dict: the dictionary of features per datestamp ''' if self._dated_features == None: - logger.info(f'Collecting meteo feature information from {CSV_FILE}') + logger.info('Collecting meteofrance feature information') # A dictionary for the features - with open(CSV_FILE, "r") as f: - reader = DictReader(f, delimiter=',') - dico_features = {row["abbreviation"]: - { - 'name': row['name'], # feature name - 'type': row['type'] # qualitative (2) or quantitative (1) - } - for row in reader if row['name'] in self._features} - #print([row for row in reader]) - #print([row for row in reader if row['name'] in self._features]) + dico_features = {self._config[section]["abbreviation"]: + { + 'name': section, # feature name + 'numerical': self._config[section]['numerical'], + 'categorical': self._config[section]['categorical'] + } + for section in self._features} dir_data = Path.cwd() / 'data' / 'features' / 'meteo_france' / 'historical' self._dated_features = {} - for csv_meteo in listdir(dir_data): + for csv_meteo in sorted(listdir(dir_data)): date = datetime.strptime(csv_meteo.split('.')[1], '%Y%m') if (date >= self._start and date <= self._end)\ or (date.year == self._start.year and date.month == self._start.month)\ diff --git a/predictops/source/source.py b/predictops/source/source.py deleted file mode 100644 index 70f24da..0000000 --- a/predictops/source/source.py +++ /dev/null @@ -1,51 +0,0 @@ -from configparser import ConfigParser -from csv import DictReader -from logging import getLogger -from logging.config import fileConfig -from os import listdir -from pathlib import Path - -fileConfig((Path.cwd() / 'config') / 'logging.cfg') -logger = getLogger() - - -class Source: - def __init__(self): - ''' - Check if the same feature name is used in two different feature sources, - and if the sources of type 3 (being both categorical and numerical) have - a specified type in the feature_...cfg file - ''' - logger.info('Check for redondant feature names') - feature_files = Path.cwd() / 'config' / 'features' - list_of_names = [] - for file_name in listdir(feature_files ): - if file_name.endswith('csv'): - with open(feature_files / file_name, "r") as f: - reader = DictReader(f, delimiter=',') - list_of_names.extend([row['name'] for row in reader]) - - if len(list_of_names) != len(set(list_of_names)): - raise ValueError("At least two features have the same name") - - logger.info('Check for specified feature types') - names_of_mixed_types = [] - for file_name in listdir(feature_files): - if file_name.endswith('csv'): - with open(feature_files / file_name, "r") as f: - reader = DictReader(f, delimiter=',') - names_of_mixed_types.extend([row['name'] for row in reader - if row['type'] == '3']) - - cfg_names_of_mixed_types = [] - for file_name in listdir(feature_files): - if file_name.endswith('cfg'): - config = ConfigParser() - config.read(feature_files / file_name) - for section in config: - if config.has_option(section, 'numerical'): - cfg_names_of_mixed_types.append(section) - - if sorted(names_of_mixed_types) != sorted(cfg_names_of_mixed_types): - raise ValueError(f"Problem with features of mixed types: " - f"{set(names_of_mixed_types).symmetric_difference(cfg_names_of_mixed_types)}")