From: Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Date: Mon, 17 Feb 2020 11:07:41 +0000 (+0100)
Subject: Adding a source module to check for redundancy in feature names.
X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/predictops.git/commitdiff_plain/2c5695839a5064f584ffeaba557020ab3270b7b9

Adding a source module to check for redundancy in feature names.
---

diff --git a/main.py b/main.py
index 426d3b5..9e35b2d 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 from predictops.engine import Engine
 from predictops.learn.preprocessing import Preprocessing
+from predictops.target.all import All
 from predictops.target.toarea import ToArea
 
 from logging import getLogger
@@ -24,6 +25,11 @@ if __name__ == '__main__':
     #print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')])
     print(process.dataframe.head(n=20))
     print(process.dataframe.tail(n=20))
+
+
+    target = All(stream_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv')
+
+
     exit()
 
     depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp")
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py
index 5400d1d..a878a82 100644
--- a/predictops/learn/preprocessing.py
+++ b/predictops/learn/preprocessing.py
@@ -1,8 +1,10 @@
 from configparser import ConfigParser
+from csv import DictReader
 from datetime import datetime, timedelta
 from itertools import chain
 from logging import getLogger
 from logging.config import fileConfig
+from os import listdir
 from pathlib import Path
 
 import numpy as np
@@ -46,6 +48,16 @@ class Preprocessing:
         else:
             self._features = set(chain.from_iterable([tuple(u.keys())
                                                       for u in [*dict_features.values()]]))
+        for csv_file in listdir():
+            with open(csv_file, "r") as f:
+                reader = DictReader(f, delimiter=',')
+                dico_features = {{row['name']: row['type']  # qualitative (2) or quantitative (1)
+                                    }
+                                for row in reader if row['name'] in self._features}
+
+        self._features = {feat : None for feat in self._features}
+        print(self._features)
+        exit()
 
 
     @property
@@ -134,7 +146,11 @@ class Preprocessing:
             elif self._config['PREPROCESSING']['fill_method'] == 'spline':
                 self._dataframe = self._dataframe.interpolate(method='spline',
                                                               order=self._config['PREPROCESSING'].getint('order'))
-            self._dataframe = self._dataframe.fillna(method='bfill')
+
+            # Uncomment this line to fill NaN values at the beginning of the
+            # dataframe. This may not be a good idea, especially for features
+            # that are available only for recent years, e.g., air quality
+            #self._dataframe = self._dataframe.fillna(method='bfill')
 
             self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                    for k in self._dataframe.T
diff --git a/predictops/source/ephemeris.py b/predictops/source/ephemeris.py
index e46b296..d0e4ca0 100644
--- a/predictops/source/ephemeris.py
+++ b/predictops/source/ephemeris.py
@@ -15,6 +15,9 @@ class Ephemeris:
 
     def __init__(self, config_file):
 
+        # Check feature names; Ephemeris is not a Source subclass, so no super()
+        from .source import Source
+        Source()
         self._config = ConfigParser()
         self._config.read(config_file)
 
diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py
index afe18ad..3d8ae88 100644
--- a/predictops/source/meteofrance.py
+++ b/predictops/source/meteofrance.py
@@ -1,3 +1,5 @@
+from .source import Source
+
 from configparser import ConfigParser
 from csv import DictReader
 from datetime import datetime
@@ -19,7 +21,7 @@ logger = getLogger()
 CSV_FILE = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv'
 
 
-class MeteoFrance:
+class MeteoFrance(Source):
 
     _latitude    = None
     _longitude   = None
@@ -50,6 +52,9 @@ class MeteoFrance:
                   to their names in meteofrance_features.csv (cf. config directory)
 
         '''
+        # Run Source.__init__ (super(Source, self) would skip it) to check feature names
+        super().__init__()
+
         self._config = ConfigParser()
         self._config.read(config_file)
 
diff --git a/predictops/source/source.py b/predictops/source/source.py
new file mode 100644
index 0000000..714ed12
--- /dev/null
+++ b/predictops/source/source.py
@@ -0,0 +1,24 @@
+from csv import DictReader
+from logging import getLogger
+from logging.config import fileConfig
+from os import listdir
+from pathlib import Path
+
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+
+class Source:
+    def __init__(self):
+        '''Check if the same feature name is used in two different feature sources.
+        Raises ValueError when two rows of the config/features CSVs share a name.'''
+        logger.info('Check for redondant feature names')
+        csv_files = Path.cwd() / 'config' / 'features'
+        list_of_names = []
+        for csv_file in listdir(csv_files):
+            # listdir() returns bare names: open them relative to csv_files
+            with open(csv_files / csv_file, "r", newline='') as f:
+                reader = DictReader(f, delimiter=',')
+                list_of_names.extend([row['name'] for row in reader])
+        if len(list_of_names) != len(set(list_of_names)):
+            raise ValueError("At least two features have the same name")
\ No newline at end of file
diff --git a/predictops/target/all.py b/predictops/target/all.py
new file mode 100644
index 0000000..0d9d72b
--- /dev/null
+++ b/predictops/target/all.py
@@ -0,0 +1,18 @@
+from csv import DictReader
+
+class All:
+    '''Read every row of a comma-delimited CSV stream file and print it.'''
+    _start = None
+    _end   = None
+
+    def __init__(self, stream_file = None):
+        '''Store the path `stream_file`, then read that file immediately.'''
+        self._stream_file = stream_file
+        self._get_located_interventions()
+
+    def _get_located_interventions(self):
+        '''Print each CSV row as a dict; newline='' keeps quoted line breaks.'''
+        with open(self._stream_file, newline='') as f:
+            reader = DictReader(f, delimiter=',')
+            for row in reader:
+                print(row)