
Private GIT Repository
Adding a source module to check for redundancy in feature names.
author    Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
          Mon, 17 Feb 2020 11:07:41 +0000 (12:07 +0100)
committer Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
          Mon, 17 Feb 2020 11:07:41 +0000 (12:07 +0100)
main.py
predictops/learn/preprocessing.py
predictops/source/ephemeris.py
predictops/source/meteofrance.py
predictops/source/source.py [new file with mode: 0644]
predictops/target/all.py [new file with mode: 0644]
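
The thrust of the commit: each data source (Météo-France, ephemeris, ...) declares its features in a CSV file under config/features/, and the new Source base class refuses to start when two sources declare the same feature name. A minimal standalone sketch of the check, assuming each of those CSV files has a name column:

    from csv import DictReader
    from pathlib import Path

    names = []
    for csv_file in (Path.cwd() / 'config' / 'features').glob('*.csv'):
        with open(csv_file) as f:
            # collect every declared feature name across all sources
            names.extend(row['name'] for row in DictReader(f))

    if len(names) != len(set(names)):
        raise ValueError("At least two features have the same name")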

diff --git a/main.py b/main.py
index 426d3b580fdbed6586b7111ce6ffb17c1172824a..9e35b2da9acf12c39b784454a8c420dae5ce2bb4 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 from predictops.engine import Engine
 from predictops.learn.preprocessing import Preprocessing
+from predictops.target.all import All
 from predictops.target.toarea import ToArea
 
 from logging import getLogger
@@ -24,6 +25,11 @@ if __name__ == '__main__':
     #print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')])
     print(process.dataframe.head(n=20))
     print(process.dataframe.tail(n=20))
+
+
+    target = All(stream_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv')
+
+
     exit()
 
     depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp")
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py
index 5400d1d39f1135ce5e2abcfec2541201cf5d8ed6..a878a8215d83e8cd504ff7f345cbd1c15165a7e7 100644
--- a/predictops/learn/preprocessing.py
+++ b/predictops/learn/preprocessing.py
@@ -1,8 +1,10 @@
 from configparser import ConfigParser
+from csv import DictReader
 from datetime import datetime, timedelta
 from itertools import chain
 from logging import getLogger
 from logging.config import fileConfig
+from os import listdir
 from pathlib import Path
 
 import numpy as np
@@ -46,6 +48,16 @@ class Preprocessing:
         else:
             self._features = set(chain.from_iterable([tuple(u.keys())
                                                       for u in [*dict_features.values()]]))
+        csv_files = Path.cwd() / 'config' / 'features'
+        dico_features = {}
+        for csv_file in listdir(csv_files):
+            with open(csv_files / csv_file, "r") as f:
+                reader = DictReader(f, delimiter=',')
+                dico_features.update({row['name']: row['type']  # qualitative (2) or quantitative (1)
+                                      for row in reader if row['name'] in self._features})
+        self._features = {feat: None for feat in self._features}
+        print(self._features)
+        exit()
 
 
     @property
@@ -134,7 +146,11 @@ class Preprocessing:
             elif self._config['PREPROCESSING']['fill_method'] == 'spline':
                 self._dataframe = self._dataframe.interpolate(method='spline',
                                                               order=self._config['PREPROCESSING'].getint('order'))
-            self._dataframe = self._dataframe.fillna(method='bfill')
+
+            # Uncomment this line to fill NaN values at the beginning of the
+            # dataframe. This may not be a good idea, especially for features
+            # that are available only for recent years, e.g., air quality
+            #self._dataframe = self._dataframe.fillna(method='bfill')
 
             self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                    for k in self._dataframe.T
diff --git a/predictops/source/ephemeris.py b/predictops/source/ephemeris.py
index e46b296ad09efd78afdff11163e43f3606ce67fc..d0e4ca06cb14a5a9ec18e17ce626a83498434647 100644
--- a/predictops/source/ephemeris.py
+++ b/predictops/source/ephemeris.py
@@ -15,6 +15,9 @@ class Ephemeris:
 
     def __init__(self, config_file):
 
+        # Check for the integrity of feature names
+        super().__init__()
+
         self._config = ConfigParser()
         self._config.read(config_file)
 
diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py
index afe18ad82e77efc3e495881fe540ad143f1d1284..3d8ae885157a9d00f49770d252afba275398f4ac 100644
--- a/predictops/source/meteofrance.py
+++ b/predictops/source/meteofrance.py
@@ -1,3 +1,5 @@
+from .source import Source
+
 from configparser import ConfigParser
 from csv import DictReader
 from datetime import datetime
@@ -19,7 +21,7 @@ logger = getLogger()
 CSV_FILE = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv'
 
 
-class MeteoFrance:
+class MeteoFrance(Source):
 
     _latitude    = None
     _longitude   = None
@@ -50,6 +52,9 @@ class MeteoFrance:
                   to their names in meteofrance_features.csv (cf. config directory)
 
         '''
+        # Check for the integrity of feature names
+        super().__init__()
+
         self._config = ConfigParser()
         self._config.read(config_file)
 
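
Both constructors rely on Python 3's zero-argument super(): super().__init__() dispatches to Source.__init__, the next class in the MRO, so the feature-name check runs before the subclass reads its own configuration (for Ephemeris this presumes the class derives from Source, as MeteoFrance now does). A minimal sketch of the pattern, with the check reduced to a stub:

    class Source:
        def __init__(self):
            print('checking feature names')  # stands in for the real check

    class MeteoFrance(Source):
        def __init__(self, config_file):
            super().__init__()               # runs Source.__init__ first
            self._config_file = config_file

    MeteoFrance('meteofrance.cfg')           # prints 'checking feature names'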
diff --git a/predictops/source/source.py b/predictops/source/source.py
new file mode 100644
index 0000000..714ed12
--- /dev/null
+++ b/predictops/source/source.py
@@ -0,0 +1,24 @@
+from csv import DictReader
+from logging import getLogger
+from logging.config import fileConfig
+from os import listdir
+from pathlib import Path
+
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+
+class Source:
+    def __init__(self):
+        '''
+        Check if the same feature name is used in two different feature sources
+        '''
+        logger.info('Check for redundant feature names')
+        csv_files = Path.cwd() / 'config' / 'features'
+        list_of_names = []
+        for csv_file in listdir(csv_files):
+            with open(csv_files / csv_file, "r") as f:
+                reader = DictReader(f, delimiter=',')
+                list_of_names.extend([row['name'] for row in reader])
+        if len(list_of_names) != len(set(list_of_names)):
+            raise ValueError("At least two features have the same name")
\ No newline at end of file
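
As written, the check reports that a collision exists but not which name collides; a collections.Counter would make the culprits explicit. A possible variant of the last two lines (not what the commit does):

    from collections import Counter

    duplicates = [n for n, c in Counter(list_of_names).items() if c > 1]
    if duplicates:
        raise ValueError("Duplicated feature names: " + ', '.join(duplicates))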
diff --git a/predictops/target/all.py b/predictops/target/all.py
new file mode 100644
index 0000000..0d9d72b
--- /dev/null
+++ b/predictops/target/all.py
@@ -0,0 +1,18 @@
+from csv import DictReader
+
+class All:
+
+    _start = None
+    _end   = None
+
+    def __init__(self, stream_file = None):
+        self._stream_file = stream_file
+        self._get_located_interventions()
+
+
+    def _get_located_interventions(self):
+        with open(self._stream_file) as f:
+            reader = DictReader(f, delimiter=',')
+            for row in reader:
+                print(row)
+
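
For reference, main.py above instantiates this target with the SDIS 25 interventions file. A usage sketch; the real column layout of interventions.csv is not shown in the commit, so the header below is hypothetical:

    from pathlib import Path
    from predictops.target.all import All

    # hypothetical CSV contents:
    #   start,end,latitude,longitude
    #   2010-06-30 21:00:00,2010-06-30 22:30:00,47.25,6.03
    target = All(stream_file = Path.cwd() / 'data' / 'targets'
                             / 'sdis25' / 'interventions.csv')
    # __init__ calls _get_located_interventions(), which for now simply
    # prints each row as an OrderedDict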