Starting to investigate the handling of qualitative features with NaN values

[predictops.git] / predictops / learn / preprocessing.py
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py

index 5400d1d39f1135ce5e2abcfec2541201cf5d8ed6..49d7ef89bc2b2644f34f2022c3ee53d9827db98f 100644 (file)
--- a/predictops/learn/preprocessing.py
+++ b/predictops/learn/preprocessing.py
@@ -1,8 +1,10 @@
  from configparser import ConfigParser
+from csv import DictReader
  from datetime import datetime, timedelta
  from itertools import chain
  from logging import getLogger
  from logging.config import fileConfig
+from os import listdir
  from pathlib import Path
  
  import numpy as np
@@ -46,6 +48,16 @@ class Preprocessing:
          else:
              self._features = set(chain.from_iterable([tuple(u.keys())
                                                        for u in [*dict_features.values()]]))
+        csv_files = Path.cwd() / 'config' / 'features'
+        self._features = {feat : None for feat in self._features}
+        for csv_file in listdir(csv_files):
+            with open(csv_files / csv_file, "r") as f:
+                reader = DictReader(f, delimiter=',')
+                for row in reader:
+                    if row['name'] in self._features:
+                        self._features[row['name']] = row['type']
+        print(self._features)
+        exit()
  
  
      @property
@@ -134,7 +146,11 @@ class Preprocessing:
              elif self._config['PREPROCESSING']['fill_method'] == 'spline':
                  self._dataframe = self._dataframe.interpolate(method='spline',
                                                                order=self._config['PREPROCESSING'].getint('order'))
-            self._dataframe = self._dataframe.fillna(method='bfill')
+
+            # Uncomment this line to fill NaN values at the beginning of the
+            # dataframe. This may not be a good idea, especially for features
+            # that are available only for recent years, e.g., air quality
+            #self._dataframe = self._dataframe.fillna(method='bfill')
  
              self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                     for k in self._dataframe.T