From: Christophe Guyeux Date: Mon, 17 Feb 2020 11:07:41 +0000 (+0100) Subject: Adding a source module to check for redundancy in feature names. X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/predictops.git/commitdiff_plain/2c5695839a5064f584ffeaba557020ab3270b7b9?ds=inline Adding a source module to check for redundancy in feature names. --- diff --git a/main.py b/main.py index 426d3b5..9e35b2d 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,6 @@ from predictops.engine import Engine from predictops.learn.preprocessing import Preprocessing +from predictops.target.all import All from predictops.target.toarea import ToArea from logging import getLogger @@ -24,6 +25,11 @@ if __name__ == '__main__': #print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')]) print(process.dataframe.head(n=20)) print(process.dataframe.tail(n=20)) + + + target = All(stream_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv') + + exit() depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp") diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py index 5400d1d..a878a82 100644 --- a/predictops/learn/preprocessing.py +++ b/predictops/learn/preprocessing.py @@ -1,8 +1,10 @@ from configparser import ConfigParser +from csv import DictReader from datetime import datetime, timedelta from itertools import chain from logging import getLogger from logging.config import fileConfig +from os import listdir from pathlib import Path import numpy as np @@ -46,6 +48,16 @@ class Preprocessing: else: self._features = set(chain.from_iterable([tuple(u.keys()) for u in [*dict_features.values()]])) + for csv_file in listdir(): + with open(csv_file, "r") as f: + reader = DictReader(f, delimiter=',') + dico_features = {{row['name']: row['type'] # qualitative (2) or quantitative (1) + } + for row in reader if row['name'] in self._features} + + self._features = {feat : None for feat in self._features} + print(self._features) + exit() @property @@ -134,7 +146,11 @@ class Preprocessing: elif self._config['PREPROCESSING']['fill_method'] == 'spline': self._dataframe = self._dataframe.interpolate(method='spline', order=self._config['PREPROCESSING'].getint('order')) - self._dataframe = self._dataframe.fillna(method='bfill') + + # Uncomment this line to fill NaN values at the beginning of the + # dataframe. This may not be a good idea, especially for features + # that are available only for recent years, e.g., air quality + #self._dataframe = self._dataframe.fillna(method='bfill') self._dataframe = self._dataframe.drop([k.to_pydatetime() for k in self._dataframe.T diff --git a/predictops/source/ephemeris.py b/predictops/source/ephemeris.py index e46b296..d0e4ca0 100644 --- a/predictops/source/ephemeris.py +++ b/predictops/source/ephemeris.py @@ -15,6 +15,9 @@ class Ephemeris: def __init__(self, config_file): + # Check for the integrity of feature names + super(Source, self).__init__() + self._config = ConfigParser() self._config.read(config_file) diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py index afe18ad..3d8ae88 100644 --- a/predictops/source/meteofrance.py +++ b/predictops/source/meteofrance.py @@ -1,3 +1,5 @@ +from .source import Source + from configparser import ConfigParser from csv import DictReader from datetime import datetime @@ -19,7 +21,7 @@ logger = getLogger() CSV_FILE = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv' -class MeteoFrance: +class MeteoFrance(Source): _latitude = None _longitude = None @@ -50,6 +52,9 @@ class MeteoFrance: to their names in meteofrance_features.csv (cf. config directory) ''' + # Check for the integrity of feature names + super(Source, self).__init__() + self._config = ConfigParser() self._config.read(config_file) diff --git a/predictops/source/source.py b/predictops/source/source.py new file mode 100644 index 0000000..714ed12 --- /dev/null +++ b/predictops/source/source.py @@ -0,0 +1,24 @@ +from csv import DictReader +from logging import getLogger +from logging.config import fileConfig +from os import listdir +from pathlib import Path + +fileConfig((Path.cwd() / 'config') / 'logging.cfg') +logger = getLogger() + + +class Source: + def __init__(self): + ''' + Check if the same feature name is used in two different feature sources + ''' + logger.info('Check for redondant feature names') + csv_files = Path.cwd() / 'config' / 'features' + list_of_names = [] + for csv_file in listdir(csv_files): + with open(csv_file, "r") as f: + reader = DictReader(f, delimiter=',') + list_of_names.extend([row['name'] for row in reader]) + if len(list_of_names) != len(set(list_of_names)): + raise ValueError("At least two features have the same name") \ No newline at end of file diff --git a/predictops/target/all.py b/predictops/target/all.py new file mode 100644 index 0000000..0d9d72b --- /dev/null +++ b/predictops/target/all.py @@ -0,0 +1,18 @@ +from csv import DictReader + +class All: + + _start = None + _end = None + + def __init__(self, stream_file = None): + self._stream_file = stream_file + self._get_located_interventions() + + + def _get_located_interventions(self): + with open(self._stream_file) as f: + reader = DictReader(f, delimiter=',') + for row in reader: + print(row) +