Adding a source module to check for redundancy in feature names.

author Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>

Mon, 17 Feb 2020 11:07:41 +0000 (12:07 +0100)

committer Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>

Mon, 17 Feb 2020 11:07:41 +0000 (12:07 +0100)
author Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Mon, 17 Feb 2020 11:07:41 +0000 (12:07 +0100)
committer Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Mon, 17 Feb 2020 11:07:41 +0000 (12:07 +0100)
diff --git a/main.py b/main.py

index 426d3b580fdbed6586b7111ce6ffb17c1172824a..9e35b2da9acf12c39b784454a8c420dae5ce2bb4 100644 (file)
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
  from predictops.engine import Engine
  from predictops.learn.preprocessing import Preprocessing
+from predictops.target.all import All
  from predictops.target.toarea import ToArea
  
  from logging import getLogger
@@ -24,6 +25,11 @@ if __name__ == '__main__':
      #print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')])
      print(process.dataframe.head(n=20))
      print(process.dataframe.tail(n=20))
+
+
+    target = All(stream_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv')
+
+
      exit()
  
      depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp")
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py

index 5400d1d39f1135ce5e2abcfec2541201cf5d8ed6..a878a8215d83e8cd504ff7f345cbd1c15165a7e7 100644 (file)
--- a/predictops/learn/preprocessing.py
+++ b/predictops/learn/preprocessing.py
@@ -1,8 +1,10 @@
  from configparser import ConfigParser
+from csv import DictReader
  from datetime import datetime, timedelta
  from itertools import chain
  from logging import getLogger
  from logging.config import fileConfig
+from os import listdir
  from pathlib import Path
  
  import numpy as np
@@ -46,6 +48,16 @@ class Preprocessing:
          else:
              self._features = set(chain.from_iterable([tuple(u.keys())
                                                        for u in [*dict_features.values()]]))
+        for csv_file in listdir():
+            with open(csv_file, "r") as f:
+                reader = DictReader(f, delimiter=',')
+                dico_features = {{row['name']: row['type']  # qualitative (2) or quantitative (1)
+                                    }
+                                for row in reader if row['name'] in self._features}
+
+        self._features = {feat : None for feat in self._features}
+        print(self._features)
+        exit()
  
  
      @property
@@ -134,7 +146,11 @@ class Preprocessing:
              elif self._config['PREPROCESSING']['fill_method'] == 'spline':
                  self._dataframe = self._dataframe.interpolate(method='spline',
                                                                order=self._config['PREPROCESSING'].getint('order'))
-            self._dataframe = self._dataframe.fillna(method='bfill')
+
+            # Uncomment this line to fill NaN values at the beginning of the
+            # dataframe. This may not be a good idea, especially for features
+            # that are available only for recent years, e.g., air quality
+            #self._dataframe = self._dataframe.fillna(method='bfill')
  
              self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                     for k in self._dataframe.T
diff --git a/predictops/source/ephemeris.py b/predictops/source/ephemeris.py

index e46b296ad09efd78afdff11163e43f3606ce67fc..d0e4ca06cb14a5a9ec18e17ce626a83498434647 100644 (file)
--- a/predictops/source/ephemeris.py
+++ b/predictops/source/ephemeris.py
@@ -15,6 +15,9 @@ class Ephemeris:
  
      def __init__(self, config_file):
  
+        # Check for the integrity of feature names
+        super(Source, self).__init__()
+
          self._config = ConfigParser()
          self._config.read(config_file)
  
diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py

index afe18ad82e77efc3e495881fe540ad143f1d1284..3d8ae885157a9d00f49770d252afba275398f4ac 100644 (file)
--- a/predictops/source/meteofrance.py
+++ b/predictops/source/meteofrance.py
@@ -1,3 +1,5 @@
+from .source import Source
+
  from configparser import ConfigParser
  from csv import DictReader
  from datetime import datetime
@@ -19,7 +21,7 @@ logger = getLogger()
  CSV_FILE = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv'
  
  
-class MeteoFrance:
+class MeteoFrance(Source):
  
      _latitude    = None
      _longitude   = None
@@ -50,6 +52,9 @@ class MeteoFrance:
                    to their names in meteofrance_features.csv (cf. config directory)
  
          '''
+        # Check for the integrity of feature names
+        super(Source, self).__init__()
+
          self._config = ConfigParser()
          self._config.read(config_file)
  
diff --git a/predictops/source/source.py b/predictops/source/source.py

new file mode 100644 (file)

index 0000000..714ed12
--- /dev/null
+++ b/predictops/source/source.py
@@ -0,0 +1,24 @@
+from csv import DictReader
+from logging import getLogger
+from logging.config import fileConfig
+from os import listdir
+from pathlib import Path
+
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+
+class Source:
+    def __init__(self):
+        '''Raise ValueError if two rows of the feature CSV files share a name.'''
+        logger.info('Check for redondant feature names')
+        csv_files = Path.cwd() / 'config' / 'features'
+        list_of_names = []
+        # listdir() only yields bare file names: join them with csv_files,
+        # otherwise open() would look in the current working directory.
+        for csv_file in listdir(csv_files):
+            with open(csv_files / csv_file, "r") as f:
+                reader = DictReader(f, delimiter=',')
+                list_of_names.extend([row['name'] for row in reader])
+        if len(list_of_names) != len(set(list_of_names)):
+            raise ValueError("At least two features have the same name")
+\ No newline at end of file
diff --git a/predictops/target/all.py b/predictops/target/all.py

new file mode 100644 (file)

index 0000000..0d9d72b
--- /dev/null
+++ b/predictops/target/all.py
@@ -0,0 +1,18 @@
+from csv import DictReader
+
+class All:
+    '''Read a comma-separated stream file and print each of its rows.'''
+    _start = None
+    _end = None
+
+    def __init__(self, stream_file=None):
+        '''Keep the path of the stream file, then read it right away.'''
+        self._stream_file = stream_file
+        self._get_located_interventions()
+
+    def _get_located_interventions(self):
+        '''Print every row of the stream file, as parsed by DictReader.'''
+        with open(self._stream_file) as stream:
+            for record in DictReader(stream, delimiter=','):
+                print(record)
+
author	Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
	Mon, 17 Feb 2020 11:07:41 +0000 (12:07 +0100)
committer	Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
	Mon, 17 Feb 2020 11:07:41 +0000 (12:07 +0100)
main.py		patch \| blob \| history
predictops/learn/preprocessing.py		patch \| blob \| history
predictops/source/ephemeris.py		patch \| blob \| history
predictops/source/meteofrance.py		patch \| blob \| history
predictops/source/source.py	[new file with mode: 0644]	patch \| blob
predictops/target/all.py	[new file with mode: 0644]	patch \| blob