]> AND Private Git Repository - predictops.git/blob - predictops/source/source.py
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
Standardization and one hot encoding
[predictops.git] / predictops / source / source.py
1 from configparser import ConfigParser
2 from csv import DictReader
3 from logging import getLogger
4 from logging.config import fileConfig
5 from os import listdir
6 from pathlib import Path
7
# Configure logging at import time from the 'config/logging.cfg' file found
# under the current working directory, then use the root logger module-wide.
fileConfig(Path.cwd().joinpath('config', 'logging.cfg'))
logger = getLogger()
10
11
class Source:
    '''
    Consistency checker for the feature description files stored in the
    'config/features' directory of the current working directory.

    The constructor performs the checks and raises on failure.
    '''

    def __init__(self):
        '''
        Check if the same feature name is used in two different feature sources,
        and if the sources of type 3 (being both categorical and numerical) have
        a specified type in the feature_...cfg file.

        Every '.csv' file of the directory is read as a comma-separated table
        providing a 'name' and a 'type' column. Every '.cfg' file is read with
        ConfigParser; each of its sections having a 'numerical' option is
        taken as the declaration of a mixed-type feature.

        Raises:
            ValueError: if a feature name appears more than once across the
                csv files, or if the names of the features of type '3' do
                not match the cfg sections having a 'numerical' option.
        '''
        feature_files = Path.cwd() / 'config' / 'features'

        logger.info('Check for redondant feature names')
        # The csv files are parsed once; both checks work on these rows.
        rows = self._read_csv_rows(
            self._files_with_suffix(feature_files, '.csv'))
        list_of_names = [row['name'] for row in rows]
        if len(list_of_names) != len(set(list_of_names)):
            raise ValueError("At least two features have the same name")

        logger.info('Check for specified feature types')
        names_of_mixed_types = [row['name'] for row in rows
                                if row['type'] == '3']
        cfg_names_of_mixed_types = self._read_mixed_type_sections(
            self._files_with_suffix(feature_files, '.cfg'))

        if sorted(names_of_mixed_types) != sorted(cfg_names_of_mixed_types):
            raise ValueError(f"Problem with features of mixed types: "
                             f"{set(names_of_mixed_types).symmetric_difference(cfg_names_of_mixed_types)}")

    @staticmethod
    def _files_with_suffix(directory, suffix):
        '''Return the sorted regular files of directory whose extension is suffix.'''
        # Comparing the real extension (instead of the end of the name) skips
        # entries such as 'archive_csv'; is_file() skips sub-directories.
        return sorted(path for path in directory.iterdir()
                      if path.is_file() and path.suffix == suffix)

    @staticmethod
    def _read_csv_rows(csv_paths):
        '''Return the rows (one dict per row) of all the given csv files.'''
        rows = []
        for path in csv_paths:
            # newline='' leaves line-ending handling to the csv module, as
            # its documentation requires for file objects.
            with open(path, 'r', newline='') as f:
                rows.extend(DictReader(f, delimiter=','))
        return rows

    @staticmethod
    def _read_mixed_type_sections(cfg_paths):
        '''Return the names of the cfg sections having a 'numerical' option.'''
        names = []
        for path in cfg_paths:
            config = ConfigParser()
            config.read(path)
            # sections() leaves out the DEFAULT pseudo-section, which plain
            # iteration over the parser would yield as if it were a feature.
            names.extend(section for section in config.sections()
                         if config.has_option(section, 'numerical'))
        return names