From: Christophe Guyeux Date: Sun, 16 Feb 2020 10:45:29 +0000 (+0100) Subject: New version now drived by config files X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/predictops.git/commitdiff_plain/66b4627c14e9f89a2e5ab73bbf48819f8f3a1455 New version now drived by config files --- diff --git a/config/feature_ephemeris.cfg b/config/feature_ephemeris.cfg new file mode 100644 index 0000000..6b37dcf --- /dev/null +++ b/config/feature_ephemeris.cfg @@ -0,0 +1,14 @@ +[FEATURES] +hour = True +dayInWeek = True +dayInMonth = True +dayInYear = True +weekInYear = True +month = True +year = True + +[HOUR] +numerical = True + +[YEAR] +numerical = True \ No newline at end of file diff --git a/config/feature_meteo.cfg b/config/feature_meteo.cfg new file mode 100644 index 0000000..02bdab7 --- /dev/null +++ b/config/feature_meteo.cfg @@ -0,0 +1,25 @@ +[GENERAL] +regenerate = False +reinsert = True + +[POSITION] +latitude = 47.25 +longitude = 6.0333 + +[STATIONS] +nb_stations = 3 + +[FEATURES] +temperature = True +pressure = True +pressureVariation = False +barometricTrend = False +humidity = False +dewPoint = False +lastHourRainfall = False +last3hHourRainfall = False +meanWindSpeed10min = False +meanWindDirection10min = False +gustsOverAPeriod = False +horizontalVisibility = False +currentWeather = False \ No newline at end of file diff --git a/config/features.cfg b/config/features.cfg deleted file mode 100644 index 927fb92..0000000 --- a/config/features.cfg +++ /dev/null @@ -1,3 +0,0 @@ -[meteofrance] -regenerate = False -reinsert = True diff --git a/config/features/meteofrance_features.csv b/config/features/meteofrance_features.csv index f3303ea..0253d1c 100644 --- a/config/features/meteofrance_features.csv +++ b/config/features/meteofrance_features.csv @@ -2,7 +2,7 @@ abbreviation,name,unit,type,type t,temperature,K,real,1 pres,pressure,Pa,integer,1 tend,pressureVariation,Pa,integer,1 -cod_tend,BarometricTrend,code,integer,2 +cod_tend,barometricTrend,code,integer,2 u,humidity,%,integer,1 td,dewPoint,K,real,1 rr1,lastHourRainfall,mm,real,1 diff --git a/config/learn.cfg b/config/learn.cfg new file mode 100644 index 0000000..bbd3557 --- /dev/null +++ b/config/learn.cfg @@ -0,0 +1,19 @@ +[DATETIME] +start = 01/01/2010 01:00:00 +end = 12/31/2010 23:00:00 +hourStep = 6 + + +[FEATURES] +meteofrance = True +ephemeris = True + + +[FEATURE_CONFIG] +meteofrance = (Path.cwd() / 'config') / 'feature_meteo.cfg' +ephemeris = (Path.cwd() / 'config') / 'feature_ephemeris.cfg' + + +[PREPROCESSING] +fill_method = spline +order = 3 \ No newline at end of file diff --git a/config/main.cfg b/config/main.cfg deleted file mode 100644 index 942ef96..0000000 --- a/config/main.cfg +++ /dev/null @@ -1,5 +0,0 @@ -[postgresql] -host = localhost -user = christophe -port = 5432 -dbname = extome diff --git a/main.py b/main.py index a42ce0f..426d3b5 100644 --- a/main.py +++ b/main.py @@ -1,89 +1,35 @@ -from predictops.source.ephemeris import Ephemeris -from predictops.source.meteofrance import MeteoFrance +from predictops.engine import Engine from predictops.learn.preprocessing import Preprocessing from predictops.target.toarea import ToArea -from datetime import datetime, timedelta from logging import getLogger from logging.config import fileConfig from pathlib import Path -from shutil import rmtree import geopandas as gpd fileConfig((Path.cwd() / 'config') / 'logging.cfg') logger = getLogger() +if __name__ == '__main__': -class Engine: - def __init__(self, start = None, end = None, time_step = None): - self._X = {} - self._Y = {} + config = (Path.cwd() / 'config') / 'learn.cfg' + engine = Engine(config_file = config) + engine.add_features() + #print(engine.X) + process = Preprocessing(config_file = config, dict_features = engine.X) - def clean(self): - # Cleaning the data directory - logger.info("Cleaning and restoring data directory") - directory = Path.cwd() / 'data' - if directory.is_dir(): - rmtree(directory) - p = Path(Path.cwd() / 'data') - p.mkdir() + #print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')]) + print(process.dataframe.head(n=20)) + print(process.dataframe.tail(n=20)) + exit() - def add_feature(self, name, **kw): + depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp") + Doubs = depts.loc[depts['nom'] == 'Doubs'].iloc[0] - if name == 'meteofrance': - meteofeature = MeteoFrance(**kw) - meteofeature.update() - dated_features = meteofeature.dated_features - for date in dated_features: - self._X.setdefault(date,{}).update(dated_features[date]) - elif name == 'ephemeris': - ephemerides = Ephemeris(**kw) - dated_features = ephemerides.dated_features - for date in dated_features: - self._X.setdefault(date,{}).update(dated_features[date]) - - - @property - def X(self): - return self._X - - @X.setter - def X(self, x): - self._X = x - - -start = datetime.strptime('01/01/2010 01:00:00', '%m/%d/%Y %H:%M:%S') -end = datetime.strptime('12/31/2010 23:00:00', '%m/%d/%Y %H:%M:%S') - -engine = Engine() -engine.add_feature(name = 'meteofrance', - start = start, end = end, - latitude = 47.25, longitude = 6.0333, nb_stations = 3, - features = ['temperature', 'pressure']) - - -engine.add_feature(name = 'ephemeris', - start = start, end = end, - features = ['hour', 'dayInWeek', 'dayInMonth', 'dayInYear', - 'weekInYear', 'month', 'year']) - - -process = Preprocessing(dict_features = engine.X, - start = start, end = end, timestep = timedelta(hours=6)) - - -df = process.dataframe.head(n=20) -#print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')]) -print(df) -exit() - -depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp") -Doubs = depts.loc[depts['nom'] == 'Doubs'].iloc[0] - -ToArea(area=Doubs.geometry, - start = start, end = end, - csv_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv') + ToArea(area=Doubs.geometry, + start = start, end = end, + csv_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv') diff --git a/predictops/engine.py b/predictops/engine.py new file mode 100644 index 0000000..2ec62df --- /dev/null +++ b/predictops/engine.py @@ -0,0 +1,74 @@ +from configparser import ConfigParser +from datetime import datetime, timedelta +from logging import getLogger +from logging.config import fileConfig +from pathlib import Path +from shutil import rmtree + +from predictops.source.ephemeris import Ephemeris +from predictops.source.meteofrance import MeteoFrance + +fileConfig((Path.cwd() / 'config') / 'logging.cfg') +logger = getLogger() + + +class Engine: + + def __init__(self, config_file = (Path.cwd() / 'config') / 'learn.cfg'): + self._config = ConfigParser() + self._config.read(config_file) + self._start = datetime.strptime(self._config['DATETIME']['start'], + '%m/%d/%Y %H:%M:%S') + self._end = datetime.strptime(self._config['DATETIME']['end'], + '%m/%d/%Y %H:%M:%S') + + self._timestep = timedelta(hours = + self._config['DATETIME'].getfloat('hourStep')) + + self._X = {} + self._Y = {} + + + + def clean(self): + # Cleaning the data directory + logger.info("Cleaning and restoring data directory") + directory = Path.cwd() / 'data' + if directory.is_dir(): + rmtree(directory) + p = Path(Path.cwd() / 'data') + p.mkdir() + + + def add_features(self): + if self._config['FEATURES'].getboolean('meteofrance'): + meteofeature = MeteoFrance(config_file = + eval(self._config['FEATURE_CONFIG']['meteofrance'])) + + meteofeature.start = self._start + meteofeature.end = self._end + + meteofeature.update() + dated_features = meteofeature.dated_features + for date in dated_features: + self._X.setdefault(date,{}).update(dated_features[date]) + + if self._config['FEATURES'].getboolean('ephemeris'): + ephemerides = Ephemeris(config_file = + eval(self._config['FEATURE_CONFIG']['ephemeris'])) + + ephemerides.start = self._start + ephemerides.end = self._end + + dated_features = ephemerides.dated_features + for date in dated_features: + self._X.setdefault(date,{}).update(dated_features[date]) + + + @property + def X(self): + return self._X + + @X.setter + def X(self, x): + self._X = x \ No newline at end of file diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py index 833e483..5400d1d 100644 --- a/predictops/learn/preprocessing.py +++ b/predictops/learn/preprocessing.py @@ -1,3 +1,5 @@ +from configparser import ConfigParser +from datetime import datetime, timedelta from itertools import chain from logging import getLogger from logging.config import fileConfig @@ -18,19 +20,22 @@ class Preprocessing: - Missing datetimes are added first with np.NaN feature values, - The dataframe is then constructed based on the filled feature dictionary, - NaN values are then filled with last known values. - ''' - def __init__(self, dict_features, - start, end, timestep, - features = None): + + def __init__(self, config_file = None, dict_features = None, features = None): ''' Constructor that defines all needed attributes and collects features. ''' - logger.info("Entering NaN values in the feature dataframe") + self._config = ConfigParser() + self._config.read(config_file) + + self._start = datetime.strptime(self._config['DATETIME']['start'], + '%m/%d/%Y %H:%M:%S') + self._end = datetime.strptime(self._config['DATETIME']['end'], + '%m/%d/%Y %H:%M:%S') + self._timestep = timedelta(hours = + self._config['DATETIME'].getfloat('hourStep')) self._dict_features = dict_features - self._start = start - self._end = end - self._timestep = timestep self._full_dict = None self._dataframe = None self._datetimes = [] @@ -43,6 +48,33 @@ class Preprocessing: for u in [*dict_features.values()]])) + @property + def start(self): + return self._start + + @start.setter + def start(self, x): + self._start = x + + + @property + def end(self): + return self._end + + @end.setter + def end(self, x): + self._end = x + + + @property + def timestep(self): + return self._timestep + + @timestep.setter + def timestep(self, x): + self._timestep = x + + def _fill_dict(self): ''' Add datetime keys in the dated feature dictionary that are missing. The @@ -94,14 +126,22 @@ class Preprocessing: self._dataframe = pd.DataFrame.from_dict(self.full_dict, orient='index') logger.info("Filling NaN values in the feature dataframe") - #TODO: add other filling methods like linear interpolation - self._dataframe = self._dataframe.fillna(method='ffill') + + if self._config['PREPROCESSING']['fill_method'] == 'propagate': + self._dataframe = self._dataframe.fillna(method='ffill') + elif self._config['PREPROCESSING']['fill_method'] == 'linear': + self._dataframe = self._dataframe.interpolate() + elif self._config['PREPROCESSING']['fill_method'] == 'spline': + self._dataframe = self._dataframe.interpolate(method='spline', + order=self._config['PREPROCESSING'].getint('order')) self._dataframe = self._dataframe.fillna(method='bfill') + self._dataframe = self._dataframe.drop([k.to_pydatetime() for k in self._dataframe.T if k not in self._datetimes]) return self._dataframe + @dataframe.setter def dataframe(self, df): self._dataframe = df diff --git a/predictops/source/ephemeris.py b/predictops/source/ephemeris.py index 33c0f2d..e46b296 100644 --- a/predictops/source/ephemeris.py +++ b/predictops/source/ephemeris.py @@ -1,21 +1,48 @@ +from configparser import ConfigParser +from csv import DictReader from datetime import datetime, timedelta +from pathlib import Path + import time import calendar +CSV_FILE = Path.cwd() / 'config' / 'features' / 'ephemeris_features.csv' + class Ephemeris: - def __init__(self, start = time.strptime('19960101000000', '%Y%m%d%H%M%S'), - end = datetime.now(), features = []): - self._start = start - self._end = end - self._features = features + _start = None + _end = None + + def __init__(self, config_file): + + self._config = ConfigParser() + self._config.read(config_file) + + # Collecting ephemeris features + with open(CSV_FILE, "r") as f: + reader = DictReader(f, delimiter=',') + self._features = [row['name'] for row in reader + if self._config['FEATURES'].getboolean(row['name'])] self._dated_features = {} + @property + def start(self): + return self._start + + @start.setter + def start(self, x): + self._start = x + + + @property + def end(self): + return self._end - def update(self): - pass + @end.setter + def end(self, x): + self._end = x diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py index 5a885ee..afe18ad 100644 --- a/predictops/source/meteofrance.py +++ b/predictops/source/meteofrance.py @@ -16,12 +16,19 @@ import gzip fileConfig((Path.cwd() / 'config') / 'logging.cfg') logger = getLogger() +CSV_FILE = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv' + + class MeteoFrance: - def __init__(self, latitude = 47.25, longitude = 6.0333, nb_stations = 3, - start = datetime.strptime('19960101000000', '%Y%m%d%H%M%S'), - end = datetime.now(), - features = []): + _latitude = None + _longitude = None + _nb_stations = None + _start = None + _end = None + _features = None + + def __init__(self, config_file): ''' Constructor of the MeteoFrance source of feature. @@ -34,33 +41,80 @@ class MeteoFrance: https://donneespubliques.meteofrance.fr/?fond=produit&id_produit=90&id_rubrique=32 Parameters: + - in config file: latitude (float): The latitude from which we want the meteo features. longitude (float): The longitude from which we want the meteo features. nb_stations (int): Number of closest stations to consider. + - provided to the constructor features (list): Weather features that have to be integrated, according to their names in meteofrance_features.csv (cf. config directory) ''' - self._latitude = latitude - self._longitude = longitude - self._nb_stations = nb_stations - self._start = start - self._end = end - self._features = features + self._config = ConfigParser() + self._config.read(config_file) self._data_directory = (Path.cwd() / 'data') / 'features' / 'meteo_france' self._dated_features = None # Re-creating data directory architecture for MeteoFrance, if asked - config = ConfigParser() - config.read((Path.cwd() / 'config') / 'features.cfg') - if eval(config['meteofrance']['regenerate']): + if self._config['GENERAL'].getboolean('regenerate'): self._regenerate_directory() # Collecting the closest meteo station + self._nb_stations = self._config['STATIONS'].getint('nb_stations') self._stations = self._get_stations() + # Collecting meteofrance features + with open(CSV_FILE, "r") as f: + reader = DictReader(f, delimiter=',') + self._features = [row['name'] for row in reader + if self._config['FEATURES'].getboolean(row['name'])] + + + @property + def start(self): + return self._start + + @start.setter + def start(self, x): + self._start = x + + + @property + def end(self): + return self._end + + @end.setter + def end(self, x): + self._end = x + + + @property + def latitude(self): + return self._latitude + + @latitude.setter + def latitude(self, x): + self._latitude = x + + + @property + def longitude(self): + return self._longitude + + @longitude.setter + def longitude(self, x): + self._longitude = x + + + @property + def nb_stations(self): + return self._nb_stations + + @nb_stations.setter + def nb_stations(self, x): + self._nb_stations = x def _regenerate_directory(self): @@ -193,10 +247,9 @@ class MeteoFrance: dict: the dictionary of features per datestamp ''' if self._dated_features == None: - csv_file = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv' - logger.info(f'Collecting meteo feature information from {csv_file}') + logger.info(f'Collecting meteo feature information from {CSV_FILE}') # A dictionary for the features - with open(csv_file, "r") as f: + with open(CSV_FILE, "r") as f: reader = DictReader(f, delimiter=',') dico_features = {row["abbreviation"]: { @@ -204,6 +257,8 @@ class MeteoFrance: 'type': row['type'] # qualitative (2) or quantitative (1) } for row in reader if row['name'] in self._features} + #print([row for row in reader]) + #print([row for row in reader if row['name'] in self._features]) dir_data = Path.cwd() / 'data' / 'features' / 'meteo_france' / 'historical' self._dated_features = {} for csv_meteo in listdir(dir_data):