Starting to investigate the fact that qualitative features with NaN

[predictops.git] / predictops / learn / preprocessing.py
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py

index 5400d1d39f1135ce5e2abcfec2541201cf5d8ed6..49d7ef89bc2b2644f34f2022c3ee53d9827db98f 100644 (file)
--- a/predictops/learn/preprocessing.py
+++ b/predictops/learn/preprocessing.py
@@ -1,8 +1,10 @@
  from configparser import ConfigParser
  from configparser import ConfigParser
+from csv import DictReader
  from datetime import datetime, timedelta
  from itertools import chain
  from logging import getLogger
  from logging.config import fileConfig
  from datetime import datetime, timedelta
  from itertools import chain
  from logging import getLogger
  from logging.config import fileConfig
+from os import listdir
  from pathlib import Path
  
  import numpy as np
  from pathlib import Path
  
  import numpy as np
@@ -46,6 +48,16 @@ class Preprocessing:
          else:
              self._features = set(chain.from_iterable([tuple(u.keys())
                                                        for u in [*dict_features.values()]]))
          else:
              self._features = set(chain.from_iterable([tuple(u.keys())
                                                        for u in [*dict_features.values()]]))
+        csv_files = Path.cwd() / 'config' / 'features'
+        self._features = {feat : None for feat in self._features}
+        for csv_file in listdir(csv_files):
+            with open(csv_files / csv_file, "r") as f:
+                reader = DictReader(f, delimiter=',')
+                for row in reader:
+                    if row['name'] in self._features:
+                        self._features[row['name']] = row['type']
+        print(self._features)
+        exit()
  
  
      @property
  
  
      @property
@@ -134,7 +146,11 @@ class Preprocessing:
              elif self._config['PREPROCESSING']['fill_method'] == 'spline':
                  self._dataframe = self._dataframe.interpolate(method='spline',
                                                                order=self._config['PREPROCESSING'].getint('order'))
              elif self._config['PREPROCESSING']['fill_method'] == 'spline':
                  self._dataframe = self._dataframe.interpolate(method='spline',
                                                                order=self._config['PREPROCESSING'].getint('order'))
-            self._dataframe = self._dataframe.fillna(method='bfill')
+
+            # Uncomment the line below to back-fill NaN values at the start of
+            # the dataframe. This may not be a good idea, especially for features
+            # that are only available for recent years, e.g., air quality.
+            #self._dataframe = self._dataframe.fillna(method='bfill')
  
              self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                     for k in self._dataframe.T
  
              self._dataframe = self._dataframe.drop([k.to_pydatetime()
                                                     for k in self._dataframe.T