AND Private Git Repository - predictops.git/blobdiff - predictops/learn/preprocessing.py
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
Starting to investigate the fact that qualitative features with NaN
[predictops.git] / predictops / learn / preprocessing.py
index 5400d1d39f1135ce5e2abcfec2541201cf5d8ed6..49d7ef89bc2b2644f34f2022c3ee53d9827db98f 100644 (file)
@@ -1,8 +1,10 @@
 from configparser import ConfigParser
+from csv import DictReader
 from datetime import datetime, timedelta
 from itertools import chain
 from logging import getLogger
 from logging.config import fileConfig
+from os import listdir
 from pathlib import Path
 
 import numpy as np
@@ -46,6 +48,16 @@ class Preprocessing:
         else:
             self._features = set(chain.from_iterable([tuple(u.keys())
                                                       for u in [*dict_features.values()]]))
+        csv_files = Path.cwd() / 'config' / 'features'
+        self._features = {feat : None for feat in self._features}
+        for csv_file in listdir(csv_files):
+            with open(csv_files / csv_file, "r") as f:
+                reader = DictReader(f, delimiter=',')
+                for row in reader:
+                    if row['name'] in self._features:
+                        self._features[row['name']] = row['type']
+        print(self._features)
+        exit()
 
 
     @property
@@ -134,7 +146,11 @@ class Preprocessing:
             elif self._config['PREPROCESSING']['fill_method'] == 'spline':
                 self._dataframe = self._dataframe.interpolate(method='spline',
                                                               order=self._config['PREPROCESSING'].getint('order'))
-            self._dataframe = self._dataframe.fillna(method='bfill')
+
+            # Uncomment the line below to back-fill NaN values at the beginning
+            # of the dataframe. This may not be a good idea, especially for
+            # features that are available only for recent years (e.g., air quality).
+            #self._dataframe = self._dataframe.fillna(method='bfill')
 
             self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                    for k in self._dataframe.T