Standardization and one hot encoding

[predictops.git] / predictops / source / source.py
diff --git a/predictops/source/source.py b/predictops/source/source.py

index 8e68716e3b85f74767c6067f6b9ea866ecd0cfc9..70f24da645e932495787804418af825fb86f7b9a 100644 (file)
--- a/predictops/source/source.py
+++ b/predictops/source/source.py
@@ -1,3 +1,4 @@
+from configparser import ConfigParser
  from csv import DictReader
  from logging import getLogger
  from logging.config import fileConfig
@@ -11,14 +12,40 @@ logger = getLogger()
  class Source:
      def __init__(self):
          '''
-        Check if the same feature name is used in two different feature sources
+        Check if the same feature name is used in two different feature sources,
+        and if the sources of type 3 (being both categorical and numerical) have
+        a specified type in the feature_...cfg file
          '''
          logger.info('Check for redondant feature names')
-        csv_files = Path.cwd() / 'config' / 'features'
+        feature_files = Path.cwd() / 'config' / 'features'
          list_of_names = []
-        for csv_file in listdir(csv_files):
-            with open(csv_files / csv_file, "r") as f:
-                reader = DictReader(f, delimiter=',')
-                list_of_names.extend([row['name'] for row in reader])
+        for file_name in listdir(feature_files ):
+            if file_name.endswith('csv'):
+                with open(feature_files  / file_name, "r") as f:
+                    reader = DictReader(f, delimiter=',')
+                    list_of_names.extend([row['name'] for row in reader])
+
          if len(list_of_names) != len(set(list_of_names)):
-            raise ValueError("At least two features have the same name")
-\ No newline at end of file
+            raise ValueError("At least two features have the same name")
+
+        logger.info('Check for specified feature types')
+        names_of_mixed_types = []
+        for file_name in listdir(feature_files):
+            if file_name.endswith('csv'):
+                with open(feature_files  / file_name, "r") as f:
+                    reader = DictReader(f, delimiter=',')
+                    names_of_mixed_types.extend([row['name'] for row in reader
+                                                 if row['type'] == '3'])
+
+        cfg_names_of_mixed_types = []
+        for file_name in listdir(feature_files):
+            if file_name.endswith('cfg'):
+                config = ConfigParser()
+                config.read(feature_files / file_name)
+                for section in config:
+                    if config.has_option(section, 'numerical'):
+                        cfg_names_of_mixed_types.append(section)
+
+        if sorted(names_of_mixed_types) != sorted(cfg_names_of_mixed_types):
+            raise ValueError(f"Problem with features of mixed types: "
+                             f"{set(names_of_mixed_types).symmetric_difference(cfg_names_of_mixed_types)}")