]> AND Private Git Repository - predictops.git/commitdiff
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
New version now driven by config files
authorChristophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Sun, 16 Feb 2020 10:45:29 +0000 (11:45 +0100)
committerChristophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Sun, 16 Feb 2020 10:45:29 +0000 (11:45 +0100)
config/feature_ephemeris.cfg [new file with mode: 0644]
config/feature_meteo.cfg [new file with mode: 0644]
config/features.cfg [deleted file]
config/features/meteofrance_features.csv
config/learn.cfg [new file with mode: 0644]
config/main.cfg [deleted file]
main.py
predictops/engine.py [new file with mode: 0644]
predictops/learn/preprocessing.py
predictops/source/ephemeris.py
predictops/source/meteofrance.py

diff --git a/config/feature_ephemeris.cfg b/config/feature_ephemeris.cfg
new file mode 100644 (file)
index 0000000..6b37dcf
--- /dev/null
@@ -0,0 +1,14 @@
+[FEATURES]
+hour       = True
+dayInWeek  = True
+dayInMonth = True
+dayInYear  = True
+weekInYear = True
+month      = True
+year       = True
+
+[HOUR]
+numerical  = True
+
+[YEAR]
+numerical  = True
\ No newline at end of file
diff --git a/config/feature_meteo.cfg b/config/feature_meteo.cfg
new file mode 100644 (file)
index 0000000..02bdab7
--- /dev/null
@@ -0,0 +1,25 @@
+[GENERAL]
+regenerate = False
+reinsert   = True
+
+[POSITION]
+latitude  = 47.25
+longitude = 6.0333
+
+[STATIONS]
+nb_stations = 3
+
+[FEATURES]
+temperature            = True
+pressure               = True
+pressureVariation      = False
+barometricTrend        = False
+humidity               = False
+dewPoint               = False
+lastHourRainfall       = False
+last3hHourRainfall     = False
+meanWindSpeed10min     = False
+meanWindDirection10min = False
+gustsOverAPeriod       = False
+horizontalVisibility   = False
+currentWeather         = False
\ No newline at end of file
diff --git a/config/features.cfg b/config/features.cfg
deleted file mode 100644 (file)
index 927fb92..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-[meteofrance]
-regenerate = False
-reinsert   = True
index f3303ea812b964c1dbfc51821f0937ad9789b9a7..0253d1ccbd6472d8a057709aeef42b7a38436bcf 100644 (file)
@@ -2,7 +2,7 @@ abbreviation,name,unit,type,type
 t,temperature,K,real,1
 pres,pressure,Pa,integer,1
 tend,pressureVariation,Pa,integer,1
 t,temperature,K,real,1
 pres,pressure,Pa,integer,1
 tend,pressureVariation,Pa,integer,1
-cod_tend,BarometricTrend,code,integer,2
+cod_tend,barometricTrend,code,integer,2
 u,humidity,%,integer,1
 td,dewPoint,K,real,1
 rr1,lastHourRainfall,mm,real,1
 u,humidity,%,integer,1
 td,dewPoint,K,real,1
 rr1,lastHourRainfall,mm,real,1
diff --git a/config/learn.cfg b/config/learn.cfg
new file mode 100644 (file)
index 0000000..bbd3557
--- /dev/null
@@ -0,0 +1,19 @@
+[DATETIME]
+start    = 01/01/2010 01:00:00
+end      = 12/31/2010 23:00:00
+hourStep = 6
+
+
+[FEATURES]
+meteofrance = True
+ephemeris   = True
+
+
+[FEATURE_CONFIG]
+meteofrance = (Path.cwd() / 'config') / 'feature_meteo.cfg'
+ephemeris   = (Path.cwd() / 'config') / 'feature_ephemeris.cfg'
+
+
+[PREPROCESSING]
+fill_method = spline
+order       = 3
\ No newline at end of file
diff --git a/config/main.cfg b/config/main.cfg
deleted file mode 100644 (file)
index 942ef96..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-[postgresql]
-host   = localhost
-user   = christophe
-port   = 5432
-dbname = extome
diff --git a/main.py b/main.py
index a42ce0f11cb9b42feae17fa17e1abcdf513b8570..426d3b580fdbed6586b7111ce6ffb17c1172824a 100644 (file)
--- a/main.py
+++ b/main.py
@@ -1,89 +1,35 @@
-from predictops.source.ephemeris import Ephemeris
-from predictops.source.meteofrance import MeteoFrance
+from predictops.engine import Engine
 from predictops.learn.preprocessing import Preprocessing
 from predictops.target.toarea import ToArea
 
 from predictops.learn.preprocessing import Preprocessing
 from predictops.target.toarea import ToArea
 
-from datetime import datetime, timedelta
 from logging import getLogger
 from logging.config import fileConfig
 from pathlib import Path
 from logging import getLogger
 from logging.config import fileConfig
 from pathlib import Path
-from shutil import rmtree
 
 import geopandas as gpd
 
 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
 logger = getLogger()
 
 
 import geopandas as gpd
 
 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
 logger = getLogger()
 
+if __name__ == '__main__':
 
 
-class Engine:
-    def __init__(self, start = None, end = None, time_step = None):
-        self._X = {}
-        self._Y = {}
+    config = (Path.cwd() / 'config') / 'learn.cfg'
+    engine = Engine(config_file = config)
 
 
+    engine.add_features()
+    #print(engine.X)
+    process = Preprocessing(config_file = config, dict_features = engine.X)
 
 
-    def clean(self):
-        # Cleaning the data directory
-        logger.info("Cleaning and restoring data directory")
-        directory  = Path.cwd() / 'data'
-        if directory.is_dir():
-            rmtree(directory)
-        p = Path(Path.cwd() / 'data')
-        p.mkdir()
 
 
+    #print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')])
+    print(process.dataframe.head(n=20))
+    print(process.dataframe.tail(n=20))
+    exit()
 
 
-    def add_feature(self, name, **kw):
+    depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp")
+    Doubs = depts.loc[depts['nom'] == 'Doubs'].iloc[0]
 
 
-        if name == 'meteofrance':
-            meteofeature = MeteoFrance(**kw)
-            meteofeature.update()
-            dated_features = meteofeature.dated_features
-            for date in dated_features:
-                self._X.setdefault(date,{}).update(dated_features[date])
-        elif name == 'ephemeris':
-            ephemerides = Ephemeris(**kw)
-            dated_features = ephemerides.dated_features
-            for date in dated_features:
-                self._X.setdefault(date,{}).update(dated_features[date])
-
-
-    @property
-    def X(self):
-        return self._X
-
-    @X.setter
-    def X(self, x):
-        self._X = x
-
-
-start = datetime.strptime('01/01/2010 01:00:00', '%m/%d/%Y %H:%M:%S')
-end = datetime.strptime('12/31/2010 23:00:00', '%m/%d/%Y %H:%M:%S')
-
-engine = Engine()
-engine.add_feature(name = 'meteofrance',
-                   start = start, end = end,
-                   latitude = 47.25, longitude = 6.0333, nb_stations = 3,
-                   features = ['temperature', 'pressure'])
-
-
-engine.add_feature(name = 'ephemeris',
-                   start = start, end = end,
-                   features = ['hour', 'dayInWeek', 'dayInMonth', 'dayInYear',
-                               'weekInYear', 'month', 'year'])
-
-
-process = Preprocessing(dict_features = engine.X,
-                   start = start, end = end, timestep = timedelta(hours=6))
-
-
-df = process.dataframe.head(n=20)
-#print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')])
-print(df)
-exit()
-
-depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp")
-Doubs = depts.loc[depts['nom'] == 'Doubs'].iloc[0]
-
-ToArea(area=Doubs.geometry,
-       start = start, end = end,
-       csv_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv')
+    ToArea(area=Doubs.geometry,
+           start = start, end = end,
+           csv_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv')
 
 
diff --git a/predictops/engine.py b/predictops/engine.py
new file mode 100644 (file)
index 0000000..2ec62df
--- /dev/null
@@ -0,0 +1,74 @@
+from configparser import ConfigParser
+from datetime import datetime, timedelta
+from logging import getLogger
+from logging.config import fileConfig
+from pathlib import Path
+from shutil import rmtree
+
+from predictops.source.ephemeris import Ephemeris
+from predictops.source.meteofrance import MeteoFrance
+
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+
+class Engine:
+
+    def __init__(self, config_file = (Path.cwd() / 'config') / 'learn.cfg'):
+        self._config = ConfigParser()
+        self._config.read(config_file)
+        self._start = datetime.strptime(self._config['DATETIME']['start'],
+                                        '%m/%d/%Y %H:%M:%S')
+        self._end = datetime.strptime(self._config['DATETIME']['end'],
+                                        '%m/%d/%Y %H:%M:%S')
+
+        self._timestep = timedelta(hours =
+                                   self._config['DATETIME'].getfloat('hourStep'))
+
+        self._X = {}
+        self._Y = {}
+
+
+
+    def clean(self):
+        # Cleaning the data directory
+        logger.info("Cleaning and restoring data directory")
+        directory  = Path.cwd() / 'data'
+        if directory.is_dir():
+            rmtree(directory)
+        p = Path(Path.cwd() / 'data')
+        p.mkdir()
+
+
+    def add_features(self):
+        if self._config['FEATURES'].getboolean('meteofrance'):
+            meteofeature = MeteoFrance(config_file =
+                                       eval(self._config['FEATURE_CONFIG']['meteofrance']))
+
+            meteofeature.start = self._start
+            meteofeature.end = self._end
+
+            meteofeature.update()
+            dated_features = meteofeature.dated_features
+            for date in dated_features:
+                self._X.setdefault(date,{}).update(dated_features[date])
+
+        if self._config['FEATURES'].getboolean('ephemeris'):
+            ephemerides = Ephemeris(config_file =
+                                    eval(self._config['FEATURE_CONFIG']['ephemeris']))
+
+            ephemerides.start = self._start
+            ephemerides.end = self._end
+
+            dated_features = ephemerides.dated_features
+            for date in dated_features:
+                self._X.setdefault(date,{}).update(dated_features[date])
+
+
+    @property
+    def X(self):
+        return self._X
+
+    @X.setter
+    def X(self, x):
+        self._X = x
\ No newline at end of file
index 833e48316bffa1c51affc6885210b71f61b2c1d1..5400d1d39f1135ce5e2abcfec2541201cf5d8ed6 100644 (file)
@@ -1,3 +1,5 @@
+from configparser import ConfigParser
+from datetime import datetime, timedelta
 from itertools import chain
 from logging import getLogger
 from logging.config import fileConfig
 from itertools import chain
 from logging import getLogger
 from logging.config import fileConfig
@@ -18,19 +20,22 @@ class Preprocessing:
      - Missing datetimes are added first with np.NaN feature values,
      - The dataframe is then constructed based on the filled feature dictionary,
      - NaN values are then filled with last known values.
      - Missing datetimes are added first with np.NaN feature values,
      - The dataframe is then constructed based on the filled feature dictionary,
      - NaN values are then filled with last known values.
-
     '''
     '''
-    def __init__(self, dict_features,
-                 start, end, timestep,
-                 features = None):
+
+    def __init__(self, config_file = None, dict_features = None, features = None):
         '''
         Constructor that defines all needed attributes and collects features.
         '''
         '''
         Constructor that defines all needed attributes and collects features.
         '''
-        logger.info("Entering  NaN values in the feature dataframe")
+        self._config = ConfigParser()
+        self._config.read(config_file)
+
+        self._start = datetime.strptime(self._config['DATETIME']['start'],
+                                        '%m/%d/%Y %H:%M:%S')
+        self._end = datetime.strptime(self._config['DATETIME']['end'],
+                                        '%m/%d/%Y %H:%M:%S')
+        self._timestep = timedelta(hours =
+                                   self._config['DATETIME'].getfloat('hourStep'))
         self._dict_features = dict_features
         self._dict_features = dict_features
-        self._start = start
-        self._end = end
-        self._timestep = timestep
         self._full_dict = None
         self._dataframe = None
         self._datetimes = []
         self._full_dict = None
         self._dataframe = None
         self._datetimes = []
@@ -43,6 +48,33 @@ class Preprocessing:
                                                       for u in [*dict_features.values()]]))
 
 
                                                       for u in [*dict_features.values()]]))
 
 
+    @property
+    def start(self):
+        return self._start
+
+    @start.setter
+    def start(self, x):
+        self._start = x
+
+
+    @property
+    def end(self):
+        return self._end
+
+    @end.setter
+    def end(self, x):
+        self._end = x
+
+
+    @property
+    def timestep(self):
+        return self._timestep
+
+    @timestep.setter
+    def timestep(self, x):
+        self._timestep = x
+
+
     def _fill_dict(self):
         '''
         Add datetime keys in the dated feature dictionary that are missing. The
     def _fill_dict(self):
         '''
         Add datetime keys in the dated feature dictionary that are missing. The
@@ -94,14 +126,22 @@ class Preprocessing:
             self._dataframe = pd.DataFrame.from_dict(self.full_dict,
                                                      orient='index')
             logger.info("Filling NaN values in the feature dataframe")
             self._dataframe = pd.DataFrame.from_dict(self.full_dict,
                                                      orient='index')
             logger.info("Filling NaN values in the feature dataframe")
-            #TODO: add other filling methods like linear interpolation
-            self._dataframe = self._dataframe.fillna(method='ffill')
+
+            if self._config['PREPROCESSING']['fill_method'] == 'propagate':
+                self._dataframe = self._dataframe.fillna(method='ffill')
+            elif self._config['PREPROCESSING']['fill_method'] == 'linear':
+                self._dataframe = self._dataframe.interpolate()
+            elif self._config['PREPROCESSING']['fill_method'] == 'spline':
+                self._dataframe = self._dataframe.interpolate(method='spline',
+                                                              order=self._config['PREPROCESSING'].getint('order'))
             self._dataframe = self._dataframe.fillna(method='bfill')
             self._dataframe = self._dataframe.fillna(method='bfill')
+
             self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                    for k in self._dataframe.T
                                                    if k not in self._datetimes])
         return self._dataframe
 
             self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                    for k in self._dataframe.T
                                                    if k not in self._datetimes])
         return self._dataframe
 
+
     @dataframe.setter
     def dataframe(self, df):
         self._dataframe = df
     @dataframe.setter
     def dataframe(self, df):
         self._dataframe = df
index 33c0f2d31f2ff26beb7dcced68cc08033c495ea1..e46b296ad09efd78afdff11163e43f3606ce67fc 100644 (file)
@@ -1,21 +1,48 @@
+from configparser import ConfigParser
+from csv import DictReader
 from datetime import datetime, timedelta
 from datetime import datetime, timedelta
+from pathlib import Path
+
 import time
 import calendar
 
 import time
 import calendar
 
+CSV_FILE = Path.cwd() / 'config' / 'features' / 'ephemeris_features.csv'
+
 class Ephemeris:
 
 class Ephemeris:
 
-    def __init__(self, start = time.strptime('19960101000000', '%Y%m%d%H%M%S'),
-                 end = datetime.now(), features = []):
-        self._start = start
-        self._end = end
-        self._features = features
+    _start = None
+    _end   = None
+
+    def __init__(self, config_file):
+
+        self._config = ConfigParser()
+        self._config.read(config_file)
+
+        # Collecting ephemeris features
+        with open(CSV_FILE, "r") as f:
+            reader = DictReader(f, delimiter=',')
+            self._features = [row['name'] for row in reader
+                              if self._config['FEATURES'].getboolean(row['name'])]
 
         self._dated_features = {}
 
 
 
         self._dated_features = {}
 
 
+    @property
+    def start(self):
+        return self._start
+
+    @start.setter
+    def start(self, x):
+        self._start = x
+
+
+    @property
+    def end(self):
+        return self._end
 
 
-    def update(self):
-        pass
+    @end.setter
+    def end(self, x):
+        self._end = x
 
 
 
 
 
 
index 5a885ee2cac64915bd54359699c35ed7d5365d14..afe18ad82e77efc3e495881fe540ad143f1d1284 100644 (file)
@@ -16,12 +16,19 @@ import gzip
 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
 logger = getLogger()
 
 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
 logger = getLogger()
 
+CSV_FILE = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv'
+
+
 class MeteoFrance:
 
 class MeteoFrance:
 
-    def __init__(self, latitude = 47.25, longitude = 6.0333, nb_stations = 3,
-                 start = datetime.strptime('19960101000000', '%Y%m%d%H%M%S'),
-                 end = datetime.now(),
-                 features = []):
+    _latitude    = None
+    _longitude   = None
+    _nb_stations = None
+    _start       = None
+    _end         = None
+    _features    = None
+
+    def __init__(self, config_file):
         '''
         Constructor of the MeteoFrance source of feature.
 
         '''
         Constructor of the MeteoFrance source of feature.
 
@@ -34,33 +41,80 @@ class MeteoFrance:
     https://donneespubliques.meteofrance.fr/?fond=produit&id_produit=90&id_rubrique=32
 
         Parameters:
     https://donneespubliques.meteofrance.fr/?fond=produit&id_produit=90&id_rubrique=32
 
         Parameters:
+          - in config file:
             latitude (float): The latitude from which we want the meteo features.
             longitude (float): The longitude from which we want the meteo features.
             nb_stations (int): Number of closest stations to consider.
             latitude (float): The latitude from which we want the meteo features.
             longitude (float): The longitude from which we want the meteo features.
             nb_stations (int): Number of closest stations to consider.
+          - provided to the constructor
             features (list): Weather features that have to be integrated, according
                   to their names in meteofrance_features.csv (cf. config directory)
 
         '''
             features (list): Weather features that have to be integrated, according
                   to their names in meteofrance_features.csv (cf. config directory)
 
         '''
-        self._latitude = latitude
-        self._longitude = longitude
-        self._nb_stations = nb_stations
-        self._start = start
-        self._end = end
-        self._features = features
+        self._config = ConfigParser()
+        self._config.read(config_file)
 
         self._data_directory = (Path.cwd() / 'data') / 'features' / 'meteo_france'
 
         self._dated_features = None
 
         # Re-creating data directory architecture for MeteoFrance, if asked
 
         self._data_directory = (Path.cwd() / 'data') / 'features' / 'meteo_france'
 
         self._dated_features = None
 
         # Re-creating data directory architecture for MeteoFrance, if asked
-        config = ConfigParser()
-        config.read((Path.cwd() / 'config') / 'features.cfg')
-        if eval(config['meteofrance']['regenerate']):
+        if self._config['GENERAL'].getboolean('regenerate'):
             self._regenerate_directory()
 
         # Collecting the closest meteo station
             self._regenerate_directory()
 
         # Collecting the closest meteo station
+        self._nb_stations = self._config['STATIONS'].getint('nb_stations')
         self._stations = self._get_stations()
 
         self._stations = self._get_stations()
 
+        # Collecting meteofrance features
+        with open(CSV_FILE, "r") as f:
+            reader = DictReader(f, delimiter=',')
+            self._features = [row['name'] for row in reader
+                              if self._config['FEATURES'].getboolean(row['name'])]
+
+
+    @property
+    def start(self):
+        return self._start
+
+    @start.setter
+    def start(self, x):
+        self._start = x
+
+
+    @property
+    def end(self):
+        return self._end
+
+    @end.setter
+    def end(self, x):
+        self._end = x
+
+
+    @property
+    def latitude(self):
+        return self._latitude
+
+    @latitude.setter
+    def latitude(self, x):
+        self._latitude = x
+
+
+    @property
+    def longitude(self):
+        return self._longitude
+
+    @longitude.setter
+    def longitude(self, x):
+        self._longitude = x
+
+
+    @property
+    def nb_stations(self):
+        return self._nb_stations
+
+    @nb_stations.setter
+    def nb_stations(self, x):
+        self._nb_stations = x
 
 
     def _regenerate_directory(self):
 
 
     def _regenerate_directory(self):
@@ -193,10 +247,9 @@ class MeteoFrance:
             dict: the dictionary of features per datestamp
         '''
         if self._dated_features == None:
             dict: the dictionary of features per datestamp
         '''
         if self._dated_features == None:
-            csv_file = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv'
-            logger.info(f'Collecting meteo feature information from {csv_file}')
+            logger.info(f'Collecting meteo feature information from {CSV_FILE}')
             # A dictionary for the features
             # A dictionary for the features
-            with open(csv_file, "r") as f:
+            with open(CSV_FILE, "r") as f:
                 reader = DictReader(f, delimiter=',')
                 dico_features = {row["abbreviation"]:
                                    {
                 reader = DictReader(f, delimiter=',')
                 dico_features = {row["abbreviation"]:
                                    {
@@ -204,6 +257,8 @@ class MeteoFrance:
                                        'type': row['type']  # qualitative (2) or quantitative (1)
                                     }
                                 for row in reader if row['name'] in self._features}
                                        'type': row['type']  # qualitative (2) or quantitative (1)
                                     }
                                 for row in reader if row['name'] in self._features}
+                #print([row for row in reader])
+                #print([row for row in reader if row['name'] in self._features])
             dir_data = Path.cwd() / 'data' / 'features' / 'meteo_france' / 'historical'
             self._dated_features = {}
             for csv_meteo in listdir(dir_data):
             dir_data = Path.cwd() / 'data' / 'features' / 'meteo_france' / 'historical'
             self._dated_features = {}
             for csv_meteo in listdir(dir_data):