+++ /dev/null
-name,type
-hour,3
-dayInWeek,3
-dayInMonth,2
-dayInYear,3
-weekInYear,3
-month,3
-year,3
-[FEATURES]
-hour = True
-dayInWeek = True
-dayInMonth = True
-dayInYear = True
-weekInYear = True
-month = True
-year = True
-
[hour]
-numerical = False
+categorical = True
+numerical = False
[dayInWeek]
-numerical = False
+categorical = True
+numerical = False
+
+[dayInMonth]
+categorical = True
+numerical = False
[dayInYear]
-numerical = False
+categorical = True
+numerical = False
[weekInYear]
-numerical = False
+categorical = True
+numerical = False
[month]
-numerical = True
+categorical = False
+numerical = True
[year]
-numerical = True
\ No newline at end of file
+categorical = False
+numerical = True
\ No newline at end of file
--- /dev/null
+[ZONE]
+name = Besançon
+
+[bankHolidays]
+categorical = True
+numerical = False
+
+[bankHolidaysEve]
+categorical = True
+numerical = False
+
+[holidays]
+categorical = True
+numerical = False
+
+[holidaysEve]
+categorical = True
+numerical = False
[STATIONS]
nb_stations = 3
-[FEATURES]
-temperature = True
-pressure = True
-pressureVariation = False
-barometricTrend = False
-humidity = False
-dewPoint = False
-lastHourRainfall = False
-last3hHourRainfall = False
-meanWindSpeed10min = False
-meanWindDirection10min = False
-gustsOverAPeriod = False
-horizontalVisibility = False
-currentWeather = False
\ No newline at end of file
+[temperature]
+abbreviation = t
+categorical = False
+numerical = True
+
+[pressure]
+abbreviation = pres
+categorical = False
+numerical = True
+
+[pressureVariation]
+abbreviation = tend
+categorical = False
+numerical = True
+
+[barometricTrend]
+abbreviation = cod_tend
+categorical = True
+numerical = False
+
+[humidity]
+abbreviation = u
+categorical = False
+numerical = True
+
+[dewPoint]
+abbreviation = td
+categorical = False
+numerical = True
+
+[lastHourRainfall]
+abbreviation = rr1
+categorical = False
+numerical = True
+
+[last3hHourRainfall]
+abbreviation = rr3
+categorical = False
+numerical = True
+
+[meanWindSpeed10min]
+abbreviation = ff
+categorical = False
+numerical = True
+
+[meanWindDirection10min]
+abbreviation = dd
+categorical = False
+numerical = True
+
+[gustsOverAPeriod]
+abbreviation = rafper
+categorical = False
+numerical = True
+
+[horizontalVisibility]
+abbreviation = vv
+categorical = False
+numerical = True
+
+[currentWeather]
+abbreviation = ww
+categorical = True
+numerical = False
\ No newline at end of file
+++ /dev/null
-abbreviation,name,unit,format,type
-t,temperature,K,real,1
-pres,pressure,Pa,integer,1
-tend,pressureVariation,Pa,integer,1
-cod_tend,barometricTrend,code,integer,2
-u,humidity,%,integer,1
-td,dewPoint,K,real,1
-rr1,lastHourRainfall,mm,real,1
-rr3,last3hHourRainfall,mm,real,1
-ff,meanWindSpeed10min,m/s,real,1
-dd,meanWindDirection10min,degré,integer,1
-rafper,gustsOverAPeriod,m/s,real,1
-vv,horizontalVisibility,m,real,1
-ww,currentWeather,code,integer,2
[DATETIME]
-start = 01/01/2010 01:00:00
-end = 12/31/2017 23:00:00
-hourStep = 5
+start = 01/01/2016 00:00:00
+end = 12/31/2018 23:00:00
+hourStep = 3
[FEATURES]
meteofrance = True
ephemeris = True
+holidays = True
[FEATURE_CONFIG]
meteofrance = (Path.cwd() / 'config') / 'features' / 'feature_meteo.cfg'
ephemeris = (Path.cwd() / 'config') / 'features' / 'feature_ephemeris.cfg'
+holidays = (Path.cwd() / 'config') / 'features' / 'feature_holidays.cfg'
[PREPROCESSING]
[MODEL]
method = xgboost
+
[HYPERPARAMETERS]
-learning_rate = 0.01,
-max_depth = 10,
-random_state=42,
-n_estimators = 173,
-n_jobs=-1,
-objective = 'count:poisson'
\ No newline at end of file
+learning_rate = 0.01
+max_depth = 7
+random_state = 42
+n_estimators = 1000
+n_jobs = -1
+objective = 'count:poisson'
\ No newline at end of file
from shutil import rmtree
from .source.ephemeris import Ephemeris
+from .source.holidays import Holidays
from .source.meteofrance import MeteoFrance
from .learn.learning import Learning
from .learn.preprocessing import Preprocessing
for date in dated_features:
self._X.setdefault(date,{}).update(dated_features[date])
+ if self._config['FEATURES'].getboolean('holidays'):
+ holidays = Holidays(config_file =
+ eval(self._config['FEATURE_CONFIG']['holidays']))
+
+ holidays.start = self._start
+ holidays.end = self._end
+
+ dated_features = holidays.dated_features
+ for date in dated_features:
+ self._X.setdefault(date,{}).update(dated_features[date])
+
def add_target(self):
self._target = Target(config_file = eval(self._config['TARGET']['config']),
df = X
df['cible'] = y
- print(df.head())
-
train_val_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)
train_set, val_set = train_test_split(train_val_set, test_size = 0.2, random_state = 42)
if self._config['MODEL']['method'] == 'xgboost':
- xgb_reg = xgboost.XGBRegressor(learning_rate = 0.01,
- max_depth = 10,
- random_state=42,
- n_estimators = 173,
- n_jobs=-1,
- objective = 'count:poisson')
+
+ xgb_reg = xgboost.XGBRegressor(learning_rate = self._config['HYPERPARAMETERS'].getfloat('learning_rate'),
+ max_depth = self._config['HYPERPARAMETERS'].getint('max_depth'),
+ random_state = self._config['HYPERPARAMETERS'].getint('random_state'),
+ n_estimators = self._config['HYPERPARAMETERS'].getint('n_estimators'),
+ n_jobs = self._config['HYPERPARAMETERS'].getint('n_jobs'),
+ objective = 'count:poisson')
xgb_reg.fit(X_train, y_train,
eval_set=[(X_val, y_val)],
self._features = set(chain.from_iterable([tuple(u.keys())
for u in [*dict_features.values()]]))
- feature_files = Path.cwd() / 'config' / 'features'
- self._features = {feat : {'numerical': False} for feat in self._features}
- for feature_file in listdir(feature_files):
- if feature_file.endswith('csv'):
- with open(feature_files / feature_file , "r") as f:
- reader = DictReader(f, delimiter=',')
- typed_names = {row['name']: row['type'] for row in reader}
- for feature in self._features:
- if feature.split('_')[0] in typed_names:
- self._features[feature]['type'] = int(typed_names[feature.split('_')[0]])
- elif feature_file.endswith('cfg'):
+ #feature_files = Path.cwd() / 'config' / 'features'
+ self._features = {feat : {'numerical': False, 'categorical': False}
+ for feat in self._features}
+
+ for feature in self._config['FEATURES']:
+ if self._config['FEATURES'][feature]:
+ feature_file = self._config['FEATURE_CONFIG'][feature]
config = ConfigParser()
- config.read(feature_files / feature_file)
+ config.read(feature_file)
for section in config:
if config.has_option(section, 'numerical'):
self._features[section]['numerical'] = config[section].getboolean('numerical')
+ self._features[section]['categorical'] = config[section].getboolean('categorical')
- self._numerical_columns = [k for k in self._features if self._features[k]['type'] == 1
- or (self._features[k]['type'] == 3 and self._features[k]['numerical'])]
-
- self._categorical_columns = [k for k in self._features if self._features[k]['type'] == 2
- or (self._features[k]['type'] == 3 and not self._features[k]['numerical'])]
+ self._numerical_columns = [k for k in self._features if self._features[k]['numerical']]
+ self._categorical_columns = [k for k in self._features if self._features[k]['categorical']]
# Dropping rows that are not related to our datetime window (start/
# step / end)
logger.info("Dropping rows that are not related to our datetime window")
- self._dataframe['datetime'] =\
- self._dataframe.apply(lambda x: datetime(int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)), axis=1)
+ dates = tuple((x.year, x.month, x.day, x.hour) for x in self._datetimes)
self._dataframe['row_ok'] =\
- self._dataframe.apply(lambda x:x.datetime in self._datetimes, axis=1)
+ self._dataframe.apply(lambda x: (int(x.year), int(x.month), int(x.dayInMonth), int(x.hour)) in dates, axis=1)
self._dataframe = self._dataframe[self._dataframe['row_ok']]
- self._dataframe = self._dataframe.drop(['datetime', 'row_ok'], axis=1)
+ self._dataframe = self._dataframe.drop(['row_ok'], axis=1)
logger.info("Rows dropped")
-from .source import Source
-
from configparser import ConfigParser
-from csv import DictReader
from datetime import datetime, timedelta
+from logging import getLogger
+from logging.config import fileConfig
from pathlib import Path
import time
import calendar
-CSV_FILE = Path.cwd() / 'config' / 'features' / 'ephemeris_features.csv'
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
class Ephemeris:
def __init__(self, config_file):
- # Check for the integrity of feature names
- Source.__init__(self)
-
self._config = ConfigParser()
self._config.read(config_file)
# Collecting ephemeris features
- with open(CSV_FILE, "r") as f:
- reader = DictReader(f, delimiter=',')
- self._features = [row['name'] for row in reader
- if self._config['FEATURES'].getboolean(row['name'])]
+ self._features = [section for section in self._config
+ if self._config[section].getboolean('numerical')
+ or self._config[section].getboolean('categorical')]
self._dated_features = {}
self._end = x
-
@property
def dated_features(self):
if self._dated_features == {}:
+ logger.info("Adding ephemeris features")
date = self._start
while date <= self._end:
dict_hour = {}
--- /dev/null
+import itertools
+from configparser import ConfigParser
+from datetime import datetime, timedelta
+from logging import getLogger
+from logging.config import fileConfig
+from pathlib import Path
+
+from jours_feries_france.compute import JoursFeries
+from vacances_scolaires_france import SchoolHolidayDates
+
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+class Holidays:
+    '''
+    Source of holiday features (French bank holidays and school holidays).
+
+    The feature configuration file must provide:
+      - a [ZONE] section whose `name` option is one of the keys of
+        _ACADEMIC_ZONES;
+      - one section per feature, with boolean `numerical` and `categorical`
+        options. A feature is emitted only if at least one of them is true.
+
+    `start` and `end` (datetime objects) must be set before reading
+    `dated_features`.
+    '''
+
+    _start = None
+    _end = None
+
+    # For each name: (zone used for dates before _ZONE_SWITCH,
+    #                 zone used from _ZONE_SWITCH on).
+    # Built once at class level instead of at every lookup.
+    _ACADEMIC_ZONES = {
+        'Caen': ('A', 'B'),
+        'Clermont-Ferrand': ('A', 'A'),
+        'Grenoble': ('A', 'A'),
+        'Lyon': ('A', 'A'),
+        'Montpellier': ('A', 'C'),
+        'Nancy-Metz': ('A', 'B'),
+        'Nantes': ('A', 'B'),
+        'Rennes': ('A', 'B'),
+        'Toulouse': ('A', 'C'),
+        'Aix-Marseille': ('B', 'B'),
+        'Amiens': ('B', 'B'),
+        'Besançon': ('B', 'A'),
+        'Dijon': ('B', 'A'),
+        'Lille': ('B', 'B'),
+        'Limoges': ('B', 'A'),
+        'Nice': ('B', 'B'),
+        'Orléans-Tours': ('B', 'B'),
+        'Poitiers': ('B', 'A'),
+        'Reims': ('B', 'B'),
+        # No trailing space in the key: ConfigParser strips option values,
+        # so a padded key could never be matched from the config file.
+        'Rouen': ('B', 'B'),
+        'Strasbourg': ('B', 'B'),
+        'Bordeaux': ('C', 'A'),
+        'Créteil': ('C', 'C'),
+        'Paris': ('C', 'C'),
+        'Versailles': ('C', 'C'),
+    }
+    _ZONE_SWITCH = datetime(2016, 1, 1)
+
+    def __init__(self, config_file):
+        '''
+        Read the feature configuration and list the enabled features.
+
+        Parameters:
+            config_file: path of the holidays feature configuration file.
+        '''
+        self._config = ConfigParser()
+        self._config.read(config_file)
+
+        # Collecting holidays features: sections flagged as numerical or
+        # categorical (sections without these options are ignored).
+        self._features = [section for section in self._config
+                          if self._config[section].getboolean('numerical')
+                          or self._config[section].getboolean('categorical')]
+
+        self._dated_features = {}
+
+    @property
+    def start(self):
+        return self._start
+
+    @start.setter
+    def start(self, x):
+        self._start = x
+
+    @property
+    def end(self):
+        return self._end
+
+    @end.setter
+    def end(self, x):
+        self._end = x
+
+    def _get_academic_zone(self, name, date):
+        '''
+        Return the school holiday zone letter to use for `name` at `date`.
+
+        Parameters:
+            name (str): a key of _ACADEMIC_ZONES.
+            date (datetime): the moment for which the zone is requested.
+
+        Raises:
+            KeyError: if `name` is not a key of _ACADEMIC_ZONES.
+        '''
+        try:
+            zone_before, zone_after = self._ACADEMIC_ZONES[name]
+        except KeyError:
+            raise KeyError(f"Unknown academic zone name: {name!r}") from None
+        return zone_before if date < self._ZONE_SWITCH else zone_after
+
+    @property
+    def dated_features(self):
+        '''
+        dict: for each hour between start and end (both included), a
+        dictionary of the enabled features among bankHolidays,
+        bankHolidaysEve, holidays and holidaysEve.
+
+        The result is computed on first access and kept afterwards.
+
+        Raises:
+            ValueError: if start or end has not been set.
+        '''
+        if not self._dated_features:
+            if self._start is None or self._end is None:
+                raise ValueError("start and end must be set before "
+                                 "collecting holidays features")
+            logger.info("Adding holidays features")
+
+            # One extra year is loaded so that the eve of the 1st of January
+            # following `end` (i.e. the 31st of December of end.year) is
+            # known. Sets give constant-time membership tests.
+            years = range(self._start.year, self._end.year + 2)
+            bank_holidays = set(itertools.chain.from_iterable(
+                JoursFeries.for_year(year).values() for year in years))
+            bank_holidays_eve = {day - timedelta(days=1)
+                                 for day in bank_holidays}
+
+            name = self._config['ZONE']['name']
+            school = SchoolHolidayDates()
+            enabled = set(self._features)
+
+            # The flags only depend on the day: compute them once per day
+            # and reuse them for each hour of that day.
+            flags_by_day = {}
+            moment = self._start
+            while moment <= self._end:
+                today = moment.date()
+                if today not in flags_by_day:
+                    tomorrow = moment + timedelta(days=1)
+                    flags = {
+                        'bankHolidays': today in bank_holidays,
+                        'bankHolidaysEve': today in bank_holidays_eve,
+                        'holidays': school.is_holiday_for_zone(
+                            today, self._get_academic_zone(name, moment)),
+                        'holidaysEve': school.is_holiday_for_zone(
+                            tomorrow.date(),
+                            self._get_academic_zone(name, tomorrow)),
+                    }
+                    # Keep only the features enabled in the config file.
+                    flags_by_day[today] = {feat: value
+                                           for feat, value in flags.items()
+                                           if feat in enabled}
+                # Each hour gets its own dictionary, as callers may update it.
+                self._dated_features[moment] = dict(flags_by_day[today])
+                moment += timedelta(hours=1)
+        return self._dated_features
\ No newline at end of file
-from .source import Source
-
from configparser import ConfigParser
from csv import DictReader
from datetime import datetime
fileConfig((Path.cwd() / 'config') / 'logging.cfg')
logger = getLogger()
-CSV_FILE = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv'
-
-class MeteoFrance(Source):
+class MeteoFrance:
_latitude = None
_longitude = None
def __init__(self, config_file):
'''
Constructor of the MeteoFrance source of feature.
-
- - It will reinitiate the data directory, if asked in the config
- features.cfg file.
- - It searches for the nb_stations meteo stations closest to the provided
- point (longitude and latitude)
-
- For more information about this source of feature, see:
- https://donneespubliques.meteofrance.fr/?fond=produit&id_produit=90&id_rubrique=32
-
- Parameters:
- - in config file:
- latitude (float): The latitude from which we want the meteo features.
- longitude (float): The longitude from which we want the meteo features.
- nb_stations (int): Number of closest stations to consider.
- - provided to the constructor
- features (list): Weather features that have to be integrated, according
- to their names in meteofrance_features.csv (cf. config directory)
-
'''
- # Check for the integrity of feature names
- Source.__init__(self)
-
self._config = ConfigParser()
self._config.read(config_file)
self._stations = self._get_stations()
# Collecting meteofrance features
- with open(CSV_FILE, "r") as f:
- reader = DictReader(f, delimiter=',')
- self._features = [row['name'] for row in reader
- if self._config['FEATURES'].getboolean(row['name'])]
+        # Option values are raw strings ('True' / 'False'), and any non-empty
+        # string is truthy: getboolean is needed to really filter out the
+        # features that are neither numerical nor categorical. Sections
+        # without a 'numerical' option (e.g. STATIONS) are not features.
+        self._features = [section for section in self._config
+                          if self._config.has_option(section, 'numerical')
+                          and (self._config[section].getboolean('numerical') or
+                               self._config[section].getboolean('categorical'))]
+
@property
dict: the dictionary of features per datestamp
'''
if self._dated_features == None:
- logger.info(f'Collecting meteo feature information from {CSV_FILE}')
+ logger.info('Collecting meteofrance feature information')
# A dictionary for the features
- with open(CSV_FILE, "r") as f:
- reader = DictReader(f, delimiter=',')
- dico_features = {row["abbreviation"]:
- {
- 'name': row['name'], # feature name
- 'type': row['type'] # qualitative (2) or quantitative (1)
- }
- for row in reader if row['name'] in self._features}
- #print([row for row in reader])
- #print([row for row in reader if row['name'] in self._features])
+ dico_features = {self._config[section]["abbreviation"]:
+ {
+ 'name': section, # feature name
+ 'numerical': self._config[section]['numerical'],
+ 'categorical': self._config[section]['categorical']
+ }
+ for section in self._features}
dir_data = Path.cwd() / 'data' / 'features' / 'meteo_france' / 'historical'
self._dated_features = {}
- for csv_meteo in listdir(dir_data):
+ for csv_meteo in sorted(listdir(dir_data)):
date = datetime.strptime(csv_meteo.split('.')[1], '%Y%m')
if (date >= self._start and date <= self._end)\
or (date.year == self._start.year and date.month == self._start.month)\
+++ /dev/null
-from configparser import ConfigParser
-from csv import DictReader
-from logging import getLogger
-from logging.config import fileConfig
-from os import listdir
-from pathlib import Path
-
-fileConfig((Path.cwd() / 'config') / 'logging.cfg')
-logger = getLogger()
-
-
-class Source:
- def __init__(self):
- '''
- Check if the same feature name is used in two different feature sources,
- and if the sources of type 3 (being both categorical and numerical) have
- a specified type in the feature_...cfg file
- '''
- logger.info('Check for redondant feature names')
- feature_files = Path.cwd() / 'config' / 'features'
- list_of_names = []
- for file_name in listdir(feature_files ):
- if file_name.endswith('csv'):
- with open(feature_files / file_name, "r") as f:
- reader = DictReader(f, delimiter=',')
- list_of_names.extend([row['name'] for row in reader])
-
- if len(list_of_names) != len(set(list_of_names)):
- raise ValueError("At least two features have the same name")
-
- logger.info('Check for specified feature types')
- names_of_mixed_types = []
- for file_name in listdir(feature_files):
- if file_name.endswith('csv'):
- with open(feature_files / file_name, "r") as f:
- reader = DictReader(f, delimiter=',')
- names_of_mixed_types.extend([row['name'] for row in reader
- if row['type'] == '3'])
-
- cfg_names_of_mixed_types = []
- for file_name in listdir(feature_files):
- if file_name.endswith('cfg'):
- config = ConfigParser()
- config.read(feature_files / file_name)
- for section in config:
- if config.has_option(section, 'numerical'):
- cfg_names_of_mixed_types.append(section)
-
- if sorted(names_of_mixed_types) != sorted(cfg_names_of_mixed_types):
- raise ValueError(f"Problem with features of mixed types: "
- f"{set(names_of_mixed_types).symmetric_difference(cfg_names_of_mixed_types)}")