
Private GIT Repository
Adding a source module to check for redundancy in feature names.
author    Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
          Mon, 17 Feb 2020 11:07:41 +0000 (12:07 +0100)
committer Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
          Mon, 17 Feb 2020 11:07:41 +0000 (12:07 +0100)
main.py
predictops/learn/preprocessing.py
predictops/source/ephemeris.py
predictops/source/meteofrance.py
predictops/source/source.py [new file with mode: 0644]
predictops/target/all.py [new file with mode: 0644]
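
The thrust of the commit: each data source (Météo-France, ephemeris, ...) declares its features in a CSV file under config/features/, and the new Source base class refuses to start when two sources declare the same feature name. A minimal standalone sketch of the check, assuming each of those CSV files has a name column:

    from csv import DictReader
    from pathlib import Path

    names = []
    for csv_file in (Path.cwd() / 'config' / 'features').glob('*.csv'):
        with open(csv_file) as f:
            # collect every declared feature name across all sources
            names.extend(row['name'] for row in DictReader(f))

    if len(names) != len(set(names)):
        raise ValueError("At least two features have the same name")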

diff --git a/main.py b/main.py
index 426d3b580fdbed6586b7111ce6ffb17c1172824a..9e35b2da9acf12c39b784454a8c420dae5ce2bb4 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 from predictops.engine import Engine
 from predictops.learn.preprocessing import Preprocessing
+from predictops.target.all import All
 from predictops.target.toarea import ToArea
 
 from logging import getLogger
@@ -24,6 +25,11 @@ if __name__ == '__main__':
     #print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')])
     print(process.dataframe.head(n=20))
     print(process.dataframe.tail(n=20))
+
+
+    target = All(stream_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv')
+
+
     exit()
 
     depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp")
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py
index 5400d1d39f1135ce5e2abcfec2541201cf5d8ed6..a878a8215d83e8cd504ff7f345cbd1c15165a7e7 100644
--- a/predictops/learn/preprocessing.py
+++ b/predictops/learn/preprocessing.py
@@ -1,8 +1,10 @@
 from configparser import ConfigParser
+from csv import DictReader
 from datetime import datetime, timedelta
 from itertools import chain
 from logging import getLogger
 from logging.config import fileConfig
+from os import listdir
 from pathlib import Path
 
 import numpy as np
@@ -46,6 +48,16 @@ class Preprocessing:
         else:
             self._features = set(chain.from_iterable([tuple(u.keys())
                                                       for u in [*dict_features.values()]]))
+        csv_files = Path.cwd() / 'config' / 'features'
+        dico_features = {}
+        for csv_file in listdir(csv_files):
+            with open(csv_files / csv_file, "r") as f:
+                reader = DictReader(f, delimiter=',')
+                dico_features.update({row['name']: row['type']  # qualitative (2) or quantitative (1)
+                                      for row in reader if row['name'] in self._features})
+        self._features = {feat: None for feat in self._features}
+        print(self._features)
+        exit()
 
 
     @property
@@ -134,7 +146,11 @@ class Preprocessing:
             elif self._config['PREPROCESSING']['fill_method'] == 'spline':
                 self._dataframe = self._dataframe.interpolate(method='spline',
                                                               order=self._config['PREPROCESSING'].getint('order'))
-            self._dataframe = self._dataframe.fillna(method='bfill')
+
+            # Uncomment this line to fill NaN values at the beginning of the
+            # dataframe. This may not be a good idea, especially for features
+            # that are available only for recent years, e.g., air quality
+            #self._dataframe = self._dataframe.fillna(method='bfill')
 
             self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                    for k in self._dataframe.T
diff --git a/predictops/source/ephemeris.py b/predictops/source/ephemeris.py
index e46b296ad09efd78afdff11163e43f3606ce67fc..d0e4ca06cb14a5a9ec18e17ce626a83498434647 100644
--- a/predictops/source/ephemeris.py
+++ b/predictops/source/ephemeris.py
@@ -15,6 +15,9 @@ class Ephemeris:
 
     def __init__(self, config_file):
 
+        # Check for the integrity of feature names
+        super().__init__()
+
         self._config = ConfigParser()
         self._config.read(config_file)
 
diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py
index afe18ad82e77efc3e495881fe540ad143f1d1284..3d8ae885157a9d00f49770d252afba275398f4ac 100644
--- a/predictops/source/meteofrance.py
+++ b/predictops/source/meteofrance.py
@@ -1,3 +1,5 @@
+from .source import Source
+
 from configparser import ConfigParser
 from csv import DictReader
 from datetime import datetime
@@ -19,7 +21,7 @@ logger = getLogger()
 CSV_FILE = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv'
 
 
-class MeteoFrance:
+class MeteoFrance(Source):
 
     _latitude    = None
     _longitude   = None
@@ -50,6 +52,9 @@ class MeteoFrance:
                   to their names in meteofrance_features.csv (cf. config directory)
 
         '''
+        # Check for the integrity of feature names
+        super().__init__()
+
         self._config = ConfigParser()
         self._config.read(config_file)
 
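
Both constructors rely on Python 3's zero-argument super(): super().__init__() dispatches to Source.__init__, the next class in the MRO, so the feature-name check runs before the subclass reads its own configuration (for Ephemeris this presumes the class derives from Source, as MeteoFrance now does). A minimal sketch of the pattern, with the check reduced to a stub:

    class Source:
        def __init__(self):
            print('checking feature names')  # stands in for the real check

    class MeteoFrance(Source):
        def __init__(self, config_file):
            super().__init__()               # runs Source.__init__ first
            self._config_file = config_file

    MeteoFrance('meteofrance.cfg')           # prints 'checking feature names'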
diff --git a/predictops/source/source.py b/predictops/source/source.py
new file mode 100644
index 0000000..714ed12
--- /dev/null
+++ b/predictops/source/source.py
@@ -0,0 +1,24 @@
+from csv import DictReader
+from logging import getLogger
+from logging.config import fileConfig
+from os import listdir
+from pathlib import Path
+
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+
+class Source:
+    def __init__(self):
+        '''
+        Check if the same feature name is used in two different feature sources
+        '''
+        logger.info('Check for redundant feature names')
+        csv_files = Path.cwd() / 'config' / 'features'
+        list_of_names = []
+        for csv_file in listdir(csv_files):
+            with open(csv_files / csv_file, "r") as f:
+                reader = DictReader(f, delimiter=',')
+                list_of_names.extend([row['name'] for row in reader])
+        if len(list_of_names) != len(set(list_of_names)):
+            raise ValueError("At least two features have the same name")
\ No newline at end of file
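
As written, the check reports that a collision exists but not which name collides; a collections.Counter would make the culprits explicit. A possible variant of the last two lines (not what the commit does):

    from collections import Counter

    duplicates = [n for n, c in Counter(list_of_names).items() if c > 1]
    if duplicates:
        raise ValueError("Duplicated feature names: " + ', '.join(duplicates))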
diff --git a/predictops/target/all.py b/predictops/target/all.py
new file mode 100644
index 0000000..0d9d72b
--- /dev/null
+++ b/predictops/target/all.py
@@ -0,0 +1,18 @@
+from csv import DictReader
+
+class All:
+
+    _start = None
+    _end   = None
+
+    def __init__(self, stream_file = None):
+        self._stream_file = stream_file
+        self._get_located_interventions()
+
+
+    def _get_located_interventions(self):
+        with open(self._stream_file) as f:
+            reader = DictReader(f, delimiter=',')
+            for row in reader:
+                print(row)
+
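
For reference, main.py above instantiates this target with the SDIS 25 interventions file. A usage sketch; the real column layout of interventions.csv is not shown in the commit, so the header below is hypothetical:

    from pathlib import Path
    from predictops.target.all import All

    # hypothetical CSV contents:
    #   start,end,latitude,longitude
    #   2010-06-30 21:00:00,2010-06-30 22:30:00,47.25,6.03
    target = All(stream_file = Path.cwd() / 'data' / 'targets'
                             / 'sdis25' / 'interventions.csv')
    # __init__ calls _get_located_interventions(), which for now simply
    # prints each row as an OrderedDict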