AND Private Git Repository - predictops.git/commitdiff
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
Reducing the computation time and adding holidays features
author Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Thu, 20 Feb 2020 11:03:33 +0000 (12:03 +0100)
committer Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Thu, 20 Feb 2020 11:03:33 +0000 (12:03 +0100)
14 files changed:
config/features/ephemeris_features.csv [deleted file]
config/features/feature_ephemeris.cfg
config/features/feature_holidays.cfg [new file with mode: 0644]
config/features/feature_meteo.cfg
config/features/meteofrance_features.csv [deleted file]
config/learn.cfg
config/learners/xgboost.cfg
predictops/engine.py
predictops/learn/learning.py
predictops/learn/preprocessing.py
predictops/source/ephemeris.py
predictops/source/holidays.py [new file with mode: 0644]
predictops/source/meteofrance.py
predictops/source/source.py [deleted file]

diff --git a/config/features/ephemeris_features.csv b/config/features/ephemeris_features.csv
deleted file mode 100644 (file)
index 4b75f8a..0000000
--- a/config/features/ephemeris_features.csv
+++ /dev/null
@@ -1,8 +0,0 @@
-name,type
-hour,3
-dayInWeek,3
-dayInMonth,2
-dayInYear,3
-weekInYear,3
-month,3
-year,3
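diff --git a/config/features/feature_ephemeris.cfg b/config/features/feature_ephemeris.cfg
index ddd9f8b506775c9f7ebd631085e35cf8ffe7865d..decc7871efc82357e6b9469d787f07c25fbd5474 100644 (file)
--- a/config/features/feature_ephemeris.cfg
+++ b/config/features/feature_ephemeris.cfg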
@@ -1,26 +1,27 @@
-[FEATURES]
-hour       = True
-dayInWeek  = True
-dayInMonth = True
-dayInYear  = True
-weekInYear = True
-month      = True
-year       = True
-
 [hour]
-numerical  = False
+categorical = True
+numerical   = False
 
 [dayInWeek]
-numerical  = False
+categorical = True
+numerical   = False
+
+[dayInMonth]
+categorical = True
+numerical   = False
 
 [dayInYear]
-numerical  = False
+categorical = True
+numerical   = False
 
 [weekInYear]
-numerical  = False
+categorical = True
+numerical   = False
 
 [month]
-numerical  = True
+categorical = False
+numerical   = True
 
 [year]
-numerical  = True
\ No newline at end of file
+categorical = False
+numerical   = True
\ No newline at end of file
diff --git a/config/features/feature_holidays.cfg b/config/features/feature_holidays.cfg
new file mode 100644 (file)
index 0000000..c3b3063
--- /dev/null
+++ b/config/features/feature_holidays.cfg
@@ -0,0 +1,18 @@
+[ZONE]
+name = Besançon
+
+[bankHolidays]
+categorical = True
+numerical   = False
+
+[bankHolidaysEve]
+categorical = True
+numerical   = False
+
+[holidays]
+categorical = True
+numerical   = False
+
+[holidaysEve]
+categorical = True
+numerical   = False
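diff --git a/config/features/feature_meteo.cfg b/config/features/feature_meteo.cfg
index 02bdab784da6cd73f56e5894f1383f534be1ab58..04f6c620eb65d7f621c5fd22f90588d6be4beee4 100644 (file)
--- a/config/features/feature_meteo.cfg
+++ b/config/features/feature_meteo.cfg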
@@ -9,17 +9,67 @@ longitude = 6.0333
 [STATIONS]
 nb_stations = 3
 
-[FEATURES]
-temperature            = True
-pressure               = True
-pressureVariation      = False
-barometricTrend        = False
-humidity               = False
-dewPoint               = False
-lastHourRainfall       = False
-last3hHourRainfall     = False
-meanWindSpeed10min     = False
-meanWindDirection10min = False
-gustsOverAPeriod       = False
-horizontalVisibility   = False
-currentWeather         = False
\ No newline at end of file
+[temperature]
+abbreviation = t
+categorical  = False
+numerical    = True
+
+[pressure]
+abbreviation = pres
+categorical  = False
+numerical    = True
+
+[pressureVariation]
+abbreviation = tend
+categorical  = False
+numerical    = True
+
+[barometricTrend]
+abbreviation = cod_tend
+categorical  = True
+numerical    = False
+
+[humidity]
+abbreviation = u
+categorical  = False
+numerical    = True
+
+[dewPoint]
+abbreviation = td
+categorical  = False
+numerical    = True
+
+[lastHourRainfall]
+abbreviation = rr1
+categorical  = False
+numerical    = True
+
+[last3hHourRainfall]
+abbreviation = rr3
+categorical  = False
+numerical    = True
+
+[meanWindSpeed10min]
+abbreviation = ff
+categorical  = False
+numerical    = True
+
+[meanWindDirection10min]
+abbreviation = dd
+categorical  = False
+numerical    = True
+
+[gustsOverAPeriod]
+abbreviation = rafper
+categorical  = False
+numerical    = True
+
+[horizontalVisibility]
+abbreviation = vv
+categorical  = False
+numerical    = True
+
+[currentWeather]
+abbreviation = ww
+categorical  = True
+numerical    = False
\ No newline at end of file
diff --git a/config/features/meteofrance_features.csv b/config/features/meteofrance_features.csv
deleted file mode 100644 (file)
index 8623303..0000000
--- a/config/features/meteofrance_features.csv
+++ /dev/null
@@ -1,14 +0,0 @@
-abbreviation,name,unit,format,type
-t,temperature,K,real,1
-pres,pressure,Pa,integer,1
-tend,pressureVariation,Pa,integer,1
-cod_tend,barometricTrend,code,integer,2
-u,humidity,%,integer,1
-td,dewPoint,K,real,1
-rr1,lastHourRainfall,mm,real,1
-rr3,last3hHourRainfall,mm,real,1
-ff,meanWindSpeed10min,m/s,real,1
-dd,meanWindDirection10min,degré,integer,1
-rafper,gustsOverAPeriod,m/s,real,1
-vv,horizontalVisibility,m,real,1
-ww,currentWeather,code,integer,2
diff --git a/config/learn.cfg b/config/learn.cfg
index 29bd628119eb2aad019dab9847538df1a8e30fef..82c67ddf5cc81797dc8edcb5ffe1178966565387 100644 (file)
--- a/config/learn.cfg
+++ b/config/learn.cfg
@@ -1,17 +1,19 @@
 [DATETIME]
-start    = 01/01/2010 01:00:00
-end      = 12/31/2017 23:00:00
-hourStep = 5
+start    = 01/01/2016 00:00:00
+end      = 12/31/2018 23:00:00
+hourStep = 3
 
 
 [FEATURES]
 meteofrance = True
 ephemeris   = True
+holidays    = True
 
 
 [FEATURE_CONFIG]
 meteofrance = (Path.cwd() / 'config') / 'features' / 'feature_meteo.cfg'
 ephemeris   = (Path.cwd() / 'config') / 'features' / 'feature_ephemeris.cfg'
+holidays    = (Path.cwd() / 'config') / 'features' / 'feature_holidays.cfg'
 
 
 [PREPROCESSING]
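diff --git a/config/learners/xgboost.cfg b/config/learners/xgboost.cfg
index 0dd78d0194d1f6585b2c342557e6a8ed035f54ee..d099aa417039d0fa98fb53e9591e582fa6bae86a 100644 (file)
--- a/config/learners/xgboost.cfg
+++ b/config/learners/xgboost.cfg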
@@ -1,10 +1,11 @@
 [MODEL]
 method = xgboost
 
+
 [HYPERPARAMETERS]
-learning_rate = 0.01,
-max_depth = 10,
-random_state=42,
-n_estimators = 173,
-n_jobs=-1,
-objective = 'count:poisson'
\ No newline at end of file
+learning_rate = 0.01
+max_depth     = 7
+random_state  = 42
+n_estimators  = 1000
+n_jobs        = -1
+objective     = 'count:poisson'
\ No newline at end of file
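diff --git a/predictops/engine.py b/predictops/engine.py
index f87e82e833fbd87bcc091f4dc568108a9bf86a21..e7bbf1c5aa58221da7a8aaa71788cf0339258cbc 100644 (file)
--- a/predictops/engine.py
+++ b/predictops/engine.py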
@@ -6,6 +6,7 @@ from pathlib import Path
 from shutil import rmtree
 
 from .source.ephemeris import Ephemeris
+from .source.holidays import Holidays
 from .source.meteofrance import MeteoFrance
 from .learn.learning import Learning
 from .learn.preprocessing import Preprocessing
@@ -66,6 +67,17 @@ class Engine:
             for date in dated_features:
                 self._X.setdefault(date,{}).update(dated_features[date])
 
+        if self._config['FEATURES'].getboolean('holidays'):
+            holidays = Holidays(config_file =
+                                eval(self._config['FEATURE_CONFIG']['holidays']))
+
+            holidays.start = self._start
+            holidays.end = self._end
+
+            dated_features = holidays.dated_features
+            for date in dated_features:
+                self._X.setdefault(date,{}).update(dated_features[date])
+
 
     def add_target(self):
         self._target = Target(config_file = eval(self._config['TARGET']['config']),
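diff --git a/predictops/learn/learning.py b/predictops/learn/learning.py
index 416450047e8cddda860da6be9296e11c17e0004c..9a5860afaed8657890140c6dbffa79073bbe6787 100644 (file)
--- a/predictops/learn/learning.py
+++ b/predictops/learn/learning.py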
@@ -15,8 +15,6 @@ class Learning:
         df = X
         df['cible'] = y
 
-        print(df.head())
-
         train_val_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)
         train_set, val_set = train_test_split(train_val_set, test_size = 0.2, random_state = 42)
 
@@ -30,12 +28,13 @@ class Learning:
 
 
         if self._config['MODEL']['method'] == 'xgboost':
-            xgb_reg = xgboost.XGBRegressor(learning_rate = 0.01,
-                                                   max_depth = 10,
-                                                   random_state=42,
-                                                   n_estimators = 173,
-                                                   n_jobs=-1,
-                                                   objective = 'count:poisson')
+
+            xgb_reg = xgboost.XGBRegressor(learning_rate = self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
+                                           max_depth     = self._config['HYPERPARAMETERS'].getint('max_depth'),
+                                           random_state  = self._config['HYPERPARAMETERS'].getint('random_state'),
+                                           n_estimators  = self._config['HYPERPARAMETERS'].getint('n_estimators'),
+                                           n_jobs        = self._config['HYPERPARAMETERS'].getint('n_jobs'),
+                                           objective     = 'count:poisson')
 
             xgb_reg.fit(X_train, y_train,
                         eval_set=[(X_val, y_val)],
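diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py
index 106a6267c3aa804aca024c471e5c7b6e29805799..885aad3393979b897e3e0d8c40f3378dbba08e5a 100644 (file)
--- a/predictops/learn/preprocessing.py
+++ b/predictops/learn/preprocessing.py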
@@ -48,28 +48,22 @@ class Preprocessing:
         self._features = set(chain.from_iterable([tuple(u.keys())
                                                       for u in [*dict_features.values()]]))
 
-        feature_files = Path.cwd() / 'config' / 'features'
-        self._features = {feat : {'numerical': False} for feat in self._features}
-        for feature_file in listdir(feature_files):
-            if feature_file.endswith('csv'):
-                with open(feature_files / feature_file , "r") as f:
-                    reader = DictReader(f, delimiter=',')
-                    typed_names = {row['name']: row['type'] for row in reader}
-                for feature in self._features:
-                    if feature.split('_')[0] in typed_names:
-                        self._features[feature]['type'] = int(typed_names[feature.split('_')[0]])
-            elif feature_file.endswith('cfg'):
+        #feature_files = Path.cwd() / 'config' / 'features'
+        self._features = {feat : {'numerical': False, 'categorical': False}
+                          for feat in self._features}
+
+        for feature in self._config['FEATURES']:
+            if self._config['FEATURES'][feature]:
+                feature_file = self._config['FEATURE_CONFIG'][feature]
                 config = ConfigParser()
-                config.read(feature_files / feature_file)
+                config.read(feature_file)
                 for section in config:
                     if config.has_option(section, 'numerical'):
                         self._features[section]['numerical'] = config[section].getboolean('numerical')
+                        self._features[section]['categorical'] = config[section].getboolean('categorical')
 
-        self._numerical_columns = [k for k in self._features if self._features[k]['type'] == 1
-                   or (self._features[k]['type'] == 3 and self._features[k]['numerical'])]
-
-        self._categorical_columns = [k for k in self._features if self._features[k]['type'] == 2
-                   or (self._features[k]['type'] == 3 and not self._features[k]['numerical'])]
+        self._numerical_columns = [k for k in self._features if self._features[k]['numerical']]
+        self._categorical_columns = [k for k in self._features if self._features[k]['categorical']]
 
 
 
@@ -172,12 +166,11 @@ class Preprocessing:
         # Dropping rows that are not related to our datetime window (start/
         # step / end)
         logger.info("Dropping rows that are not related to our datetime window")
-        self._dataframe['datetime'] =\
-            self._dataframe.apply(lambda x: datetime(int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)), axis=1)
+        dates = tuple((x.year, x.month, x.day, x.hour) for x in self._datetimes)
         self._dataframe['row_ok'] =\
-            self._dataframe.apply(lambda x:x.datetime in self._datetimes, axis=1)
+            self._dataframe.apply(lambda x: (int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)) in dates, axis=1)
         self._dataframe = self._dataframe[self._dataframe['row_ok']]
-        self._dataframe = self._dataframe.drop(['datetime', 'row_ok'], axis=1)
+        self._dataframe = self._dataframe.drop(['row_ok'], axis=1)
         logger.info("Rows dropped")
 
 
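diff --git a/predictops/source/ephemeris.py b/predictops/source/ephemeris.py
index 2a343642bdb3f8959365c71f4b4692c33955e527..e1c07ecfe604de8012715c25a773c2a66fed7bb7 100644 (file)
--- a/predictops/source/ephemeris.py
+++ b/predictops/source/ephemeris.py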
@@ -1,14 +1,14 @@
-from .source import Source
-
 from configparser import ConfigParser
-from csv import DictReader
 from datetime import datetime, timedelta
+from logging import getLogger
+from logging.config import fileConfig
 from pathlib import Path
 
 import time
 import calendar
 
-CSV_FILE = Path.cwd() / 'config' / 'features' / 'ephemeris_features.csv'
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
 
 class Ephemeris:
 
@@ -17,17 +17,13 @@ class Ephemeris:
 
     def __init__(self, config_file):
 
-        # Check for the integrity of feature names
-        Source.__init__(self)
-
         self._config = ConfigParser()
         self._config.read(config_file)
 
         # Collecting ephemeris features
-        with open(CSV_FILE, "r") as f:
-            reader = DictReader(f, delimiter=',')
-            self._features = [row['name'] for row in reader
-                              if self._config['FEATURES'].getboolean(row['name'])]
+        self._features = [section for section in self._config
+                              if self._config[section].getboolean('numerical')
+                              or self._config[section].getboolean('categorical')]
 
         self._dated_features = {}
 
@@ -50,10 +46,10 @@ class Ephemeris:
         self._end = x
 
 
-
     @property
     def dated_features(self):
         if self._dated_features == {}:
+            logger.info("Adding ephemeris features")
             date = self._start
             while date <= self._end:
                 dict_hour = {}
diff --git a/predictops/source/holidays.py b/predictops/source/holidays.py
new file mode 100644 (file)
index 0000000..1a536fe
--- /dev/null
+++ b/predictops/source/holidays.py
@@ -0,0 +1,106 @@
+from configparser import ConfigParser
+from datetime import datetime, timedelta
+from jours_feries_france.compute import JoursFeries
+from logging import getLogger
+from logging.config import fileConfig
+from vacances_scolaires_france import SchoolHolidayDates
+
+import itertools
+
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+class Holidays:
+
+    _start = None
+    _end   = None
+
+    def __init__(self, config_file):
+
+        self._config = ConfigParser()
+        self._config.read(config_file)
+
+        # Collecting holidays features
+        self._features = [section for section in self._config
+                              if self._config[section].getboolean('numerical')
+                              or self._config[section].getboolean('categorical')]
+
+        self._dated_features = {}
+
+
+    @property
+    def start(self):
+        return self._start
+
+    @start.setter
+    def start(self, x):
+        self._start = x
+
+
+    @property
+    def end(self):
+        return self._end
+
+    @end.setter
+    def end(self, x):
+        self._end = x
+
+
+
+    def _get_academic_zone(self, name, date):
+        dict_zones = {
+        'Caen' : ('A', 'B'),
+        'Clermont-Ferrand' : ('A', 'A'),
+        'Grenoble' : ('A', 'A'),
+        'Lyon' : ('A', 'A'),
+        'Montpellier' : ('A', 'C'),
+        'Nancy-Metz' : ('A', 'B'),
+        'Nantes' : ('A', 'B'),
+        'Rennes' : ('A', 'B'),
+        'Toulouse' : ('A', 'C'),
+        'Aix-Marseille' : ('B', 'B'),
+        'Amiens' : ('B', 'B'),
+        'Besançon' : ('B', 'A'),
+        'Dijon' : ('B', 'A'),
+        'Lille' : ('B', 'B'),
+        'Limoges' : ('B', 'A'),
+        'Nice' : ('B', 'B'),
+        'Orléans-Tours' : ('B', 'B'),
+        'Poitiers' : ('B', 'A'),
+        'Reims' : ('B', 'B'),
+        'Rouen ' : ('B', 'B'),
+        'Strasbourg' : ('B', 'B'),
+        'Bordeaux' : ('C', 'A'),
+        'Créteil' : ('C', 'C'),
+        'Paris' : ('C', 'C'),
+        'Versailles' : ('C', 'C')
+        }
+        if date < datetime(2016, 1, 1):
+            return dict_zones[name][0]
+        else:
+            return dict_zones[name][1]
+
+
+    @property
+    def dated_features(self):
+        if self._dated_features == {}:
+            logger.info("Adding holidays features")
+            bankHolidays = tuple(itertools.chain.from_iterable(list(JoursFeries.for_year(k).values())
+                                           for k in range(self.start.year, self.end.year+1)))
+            bankHolidaysEve = tuple(u-timedelta(days=1) for u in bankHolidays)
+            name = self._config['ZONE']['name']
+            date = self._start
+            d = SchoolHolidayDates()
+            while date <= self._end:
+                Date = datetime.date(date)
+                tomorrow = date + timedelta(days=1)
+                Tomorrow = datetime.date(tomorrow)
+                dict_hour = {
+                    'bankHolidays' : Date in bankHolidays,
+                    'bankHolidaysEve': Date in bankHolidaysEve,
+                    'holidays': d.is_holiday_for_zone(Date, self._get_academic_zone(name, date)),
+                    'holidaysEve': d.is_holiday_for_zone(Tomorrow, self._get_academic_zone(name, tomorrow))
+                }
+                self._dated_features[date] = dict_hour
+                date += timedelta(hours=1)
+        return self._dated_features
\ No newline at end of file
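diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py
index b26c6bf6525f0a87d1cba73d251e7937c89beac6..ff6a238c534a1d4aa5ab807e0b85a43554722c86 100644 (file)
--- a/predictops/source/meteofrance.py
+++ b/predictops/source/meteofrance.py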
@@ -1,5 +1,3 @@
-from .source import Source
-
 from configparser import ConfigParser
 from csv import DictReader
 from datetime import datetime
@@ -18,10 +16,8 @@ import gzip
 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
 logger = getLogger()
 
-CSV_FILE = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv'
-
 
-class MeteoFrance(Source):
+class MeteoFrance:
 
     _latitude    = None
     _longitude   = None
@@ -33,28 +29,7 @@ class MeteoFrance(Source):
     def __init__(self, config_file):
         '''
         Constructor of the MeteoFrance source of feature.
-
-        - It will reinitiate the data directory, if asked in the config
-          features.cfg file.
-        - It searches for the nb_stations meteo stations closest to the provided
-          point (longitude and latitude)
-
-        For more information about this source of feature, see:
-    https://donneespubliques.meteofrance.fr/?fond=produit&id_produit=90&id_rubrique=32
-
-        Parameters:
-          - in config file:
-            latitude (float): The latitude from which we want the meteo features.
-            longitude (float): The longitude from which we want the meteo features.
-            nb_stations (int): Number of closest stations to consider.
-          - provided to the constructor
-            features (list): Weather features that have to be integrated, according
-                  to their names in meteofrance_features.csv (cf. config directory)
-
         '''
-        # Check for the integrity of feature names
-        Source.__init__(self)
-
         self._config = ConfigParser()
         self._config.read(config_file)
 
@@ -74,10 +49,11 @@ class MeteoFrance(Source):
         self._stations = self._get_stations()
 
         # Collecting meteofrance features
-        with open(CSV_FILE, "r") as f:
-            reader = DictReader(f, delimiter=',')
-            self._features = [row['name'] for row in reader
-                              if self._config['FEATURES'].getboolean(row['name'])]
+        self._features = [section for section in self._config
+                          if self._config.has_option(section, 'numerical')
+                          and (self._config[section]['numerical'] or
+                               self._config[section]['categorical'])]
+
 
 
     @property
@@ -255,21 +231,18 @@ class MeteoFrance(Source):
             dict: the dictionary of features per datestamp
         '''
         if self._dated_features == None:
-            logger.info(f'Collecting meteo feature information from {CSV_FILE}')
+            logger.info('Collecting meteofrance feature information')
             # A dictionary for the features
-            with open(CSV_FILE, "r") as f:
-                reader = DictReader(f, delimiter=',')
-                dico_features = {row["abbreviation"]:
-                                   {
-                                       'name': row['name'], # feature name
-                                       'type': row['type']  # qualitative (2) or quantitative (1)
-                                    }
-                                for row in reader if row['name'] in self._features}
-                #print([row for row in reader])
-                #print([row for row in reader if row['name'] in self._features])
+            dico_features = {self._config[section]["abbreviation"]:
+                               {
+                                   'name': section, # feature name
+                                   'numerical': self._config[section]['numerical'],
+                                   'categorical': self._config[section]['categorical']
+                                }
+                            for section in self._features}
             dir_data = Path.cwd() / 'data' / 'features' / 'meteo_france' / 'historical'
             self._dated_features = {}
-            for csv_meteo in listdir(dir_data):
+            for csv_meteo in sorted(listdir(dir_data)):
                 date = datetime.strptime(csv_meteo.split('.')[1], '%Y%m')
                 if (date >= self._start and date <= self._end)\
                 or (date.year == self._start.year and date.month == self._start.month)\
diff --git a/predictops/source/source.py b/predictops/source/source.py
deleted file mode 100644 (file)
index 70f24da..0000000
--- a/predictops/source/source.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from configparser import ConfigParser
-from csv import DictReader
-from logging import getLogger
-from logging.config import fileConfig
-from os import listdir
-from pathlib import Path
-
-fileConfig((Path.cwd() / 'config') / 'logging.cfg')
-logger = getLogger()
-
-
-class Source:
-    def __init__(self):
-        '''
-        Check if the same feature name is used in two different feature sources,
-        and if the sources of type 3 (being both categorical and numerical) have
-        a specified type in the feature_...cfg file
-        '''
-        logger.info('Check for redondant feature names')
-        feature_files = Path.cwd() / 'config' / 'features'
-        list_of_names = []
-        for file_name in listdir(feature_files ):
-            if file_name.endswith('csv'):
-                with open(feature_files  / file_name, "r") as f:
-                    reader = DictReader(f, delimiter=',')
-                    list_of_names.extend([row['name'] for row in reader])
-
-        if len(list_of_names) != len(set(list_of_names)):
-            raise ValueError("At least two features have the same name")
-
-        logger.info('Check for specified feature types')
-        names_of_mixed_types = []
-        for file_name in listdir(feature_files):
-            if file_name.endswith('csv'):
-                with open(feature_files  / file_name, "r") as f:
-                    reader = DictReader(f, delimiter=',')
-                    names_of_mixed_types.extend([row['name'] for row in reader
-                                                 if row['type'] == '3'])
-
-        cfg_names_of_mixed_types = []
-        for file_name in listdir(feature_files):
-            if file_name.endswith('cfg'):
-                config = ConfigParser()
-                config.read(feature_files / file_name)
-                for section in config:
-                    if config.has_option(section, 'numerical'):
-                        cfg_names_of_mixed_types.append(section)
-
-        if sorted(names_of_mixed_types) != sorted(cfg_names_of_mixed_types):
-            raise ValueError(f"Problem with features of mixed types: "
-                             f"{set(names_of_mixed_types).symmetric_difference(cfg_names_of_mixed_types)}")