AND Private Git Repository - predictops.git/blobdiff - predictops/learn/preprocessing.py
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
Starting to investigate the fact that qualitative features with NaN
[predictops.git] / predictops / learn / preprocessing.py
index 5400d1d39f1135ce5e2abcfec2541201cf5d8ed6..49d7ef89bc2b2644f34f2022c3ee53d9827db98f 100644 (file)
@@ -1,8 +1,10 @@
 from configparser import ConfigParser
 from configparser import ConfigParser
+from csv import DictReader
 from datetime import datetime, timedelta
 from itertools import chain
 from logging import getLogger
 from logging.config import fileConfig
 from datetime import datetime, timedelta
 from itertools import chain
 from logging import getLogger
 from logging.config import fileConfig
+from os import listdir
 from pathlib import Path
 
 import numpy as np
 from pathlib import Path
 
 import numpy as np
@@ -46,6 +48,16 @@ class Preprocessing:
         else:
             self._features = set(chain.from_iterable([tuple(u.keys())
                                                       for u in [*dict_features.values()]]))
         else:
             self._features = set(chain.from_iterable([tuple(u.keys())
                                                       for u in [*dict_features.values()]]))
+        csv_files = Path.cwd() / 'config' / 'features'
+        self._features = {feat : None for feat in self._features}
+        for csv_file in listdir(csv_files):
+            with open(csv_files / csv_file, "r") as f:
+                reader = DictReader(f, delimiter=',')
+                for row in reader:
+                    if row['name'] in self._features:
+                        self._features[row['name']] = row['type']
+        print(self._features)
+        exit()
 
 
     @property
 
 
     @property
@@ -134,7 +146,11 @@ class Preprocessing:
             elif self._config['PREPROCESSING']['fill_method'] == 'spline':
                 self._dataframe = self._dataframe.interpolate(method='spline',
                                                               order=self._config['PREPROCESSING'].getint('order'))
             elif self._config['PREPROCESSING']['fill_method'] == 'spline':
                 self._dataframe = self._dataframe.interpolate(method='spline',
                                                               order=self._config['PREPROCESSING'].getint('order'))
-            self._dataframe = self._dataframe.fillna(method='bfill')
+
+            # Uncomment this line to fill NaN values at the beginning of the
+            # dataframe. This may not be a good idea, especially for features
+            # that are available only for recent years, e.g., air quality
+            #self._dataframe = self._dataframe.fillna(method='bfill')
 
             self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                    for k in self._dataframe.T
 
             self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                    for k in self._dataframe.T