AND Private Git Repository - predictops.git/blob - predictops/learn/preprocessing.py
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
Adding a source module to check for redundancy in feature names.
[predictops.git] / predictops / learn / preprocessing.py
1 from configparser import ConfigParser
2 from csv import DictReader
3 from datetime import datetime, timedelta
4 from itertools import chain
5 from logging import getLogger
6 from logging.config import fileConfig
7 from os import listdir
8 from pathlib import Path
9
10 import numpy as np
11 import pandas as pd
12
# Configure logging from the 'config/logging.cfg' file located under the
# current working directory, then use the root logger in this module.
fileConfig(Path.cwd().joinpath('config', 'logging.cfg'))
logger = getLogger()
15
class Preprocessing:
    '''
    Generate a pandas dataframe from a dictionary of features per datetime, which
    respects the starting and ending dates of the study, and its precision (the
    time step) as read from the configuration file. Missing feature values are
    completed.

     - Missing datetimes are added first with NaN feature values,
     - The dataframe is then constructed based on the filled feature dictionary,
     - NaN values are then filled according to the configured 'fill_method',
     - Rows whose datetime is not on the [start, end] time grid are dropped.
    '''

    def __init__(self, config_file=None, dict_features=None, features=None):
        '''
        Constructor that defines all needed attributes and collects features.

        Parameters:
         - config_file: path of an INI file read with ConfigParser. It must
           provide a [DATETIME] section with 'start' and 'end' (formatted as
           '%m/%d/%Y %H:%M:%S') and 'hourStep' (a float number of hours). The
           [PREPROCESSING] section is only read when the dataframe is built.
         - dict_features: dictionary {datetime: {feature name: value}}. It is
           never modified by this class. None is treated as an empty dictionary.
         - features: iterable of the feature names to consider. When None, every
           feature name found in dict_features is used, in first-seen order.

        Raises:
         - KeyError if the [DATETIME] section or one of its options is missing,
         - ValueError if 'start' or 'end' does not match the expected format.
        '''
        self._config = ConfigParser()
        self._config.read(config_file)

        datetime_cfg = self._config['DATETIME']
        date_format = '%m/%d/%Y %H:%M:%S'
        self._start = datetime.strptime(datetime_cfg['start'], date_format)
        self._end = datetime.strptime(datetime_cfg['end'], date_format)
        self._timestep = timedelta(hours=datetime_cfg.getfloat('hourStep'))

        self._dict_features = {} if dict_features is None else dict_features
        self._full_dict = None
        self._dataframe = None
        self._datetimes = []

        # If features are not provided to the constructor, then we collect
        # any existing feature in the dictionary. Iterating a row dictionary
        # yields its keys, and dict.fromkeys keeps the first-seen order, so the
        # resulting column order is the same from one run to the next.
        if features is None:
            features = chain.from_iterable(self._dict_features.values())
        self._features = dict.fromkeys(features)

        # NOTE(review): the feature description files are looked for in the
        # current working directory, as the previous code did with listdir();
        # this location should probably come from the configuration file.
        self._feature_types = self._read_feature_types(Path.cwd(),
                                                       self._features)


    @staticmethod
    def _read_feature_types(directory, wanted):
        '''
        Return a dictionary {feature name: type} built from the '*.csv' files of
        the given directory, restricted to the feature names in 'wanted'.

        Only regular files having both a 'name' and a 'type' column are
        considered; anything else is skipped. According to the original code
        comment, the type is qualitative (2) or quantitative (1); it is kept
        here as the raw string read from the file.
        '''
        types = {}
        for csv_path in sorted(Path(directory).glob('*.csv')):
            if not csv_path.is_file():
                continue
            with open(csv_path, "r", newline='') as f:
                reader = DictReader(f, delimiter=',')
                columns = reader.fieldnames or ()
                if 'name' not in columns or 'type' not in columns:
                    continue
                for row in reader:
                    if row['name'] in wanted:
                        types[row['name']] = row['type']
        return types


    @property
    def start(self):
        '''First datetime of the study.'''
        return self._start

    @start.setter
    def start(self, x):
        self._start = x


    @property
    def end(self):
        '''Last datetime (inclusive) of the study.'''
        return self._end

    @end.setter
    def end(self, x):
        self._end = x


    @property
    def timestep(self):
        '''Time step (a timedelta) between two consecutive datetimes.'''
        return self._timestep

    @timestep.setter
    def timestep(self, x):
        self._timestep = x


    def _fill_dict(self):
        '''
        Build the fully filled dated feature dictionary, ordered by datetimes.

        Every datetime of the grid start, start + timestep, ..., up to end
        (inclusive) is present in the result, as well as every datetime of the
        provided feature dictionary. Each entry holds all considered features,
        set to NaN when no value is known. The provided dictionary itself is
        left untouched.

        Raises:
         - ValueError if the time step is not strictly positive (the grid
           could otherwise never reach the ending date).
        '''
        if self._timestep <= timedelta(0):
            raise ValueError("The time step must be strictly positive")

        logger.info("Adding missing dates and filling missing features with NaN values")
        # Rebuilt from scratch so that a second call does not duplicate entries
        self._datetimes = []
        current = self._start
        while current <= self._end:
            self._datetimes.append(current)
            current += self._timestep

        blank = dict.fromkeys(self._features, np.nan)
        every_datetime = sorted(set(self._datetimes).union(self._dict_features))
        # Known values override the NaN placeholders; features that are present
        # in a row but not in the considered ones are kept as they are.
        self._full_dict = {moment: {**blank,
                                    **self._dict_features.get(moment, {})}
                           for moment in every_datetime}


    @property
    def full_dict(self):
        '''
        Returns the fully filled dated feature dictionary, ordered by datetimes
        '''
        if self._full_dict is None:
            self._fill_dict()
        return self._full_dict


    @property
    def dataframe(self):
        '''
        Returns the feature dataframe, after creating it if needed.

        The NaN values are filled according to the 'fill_method' option of the
        [PREPROCESSING] section: 'propagate' (last known value), 'linear'
        (linear interpolation) or 'spline' (spline interpolation of the
        configured 'order'). Any other value leaves the NaN values as is.
        Rows that are not on the time grid are dropped after the filling, so
        they still contribute to the filled values.
        '''
        if self._dataframe is None:
            logger.info("Creating feature dataframe from feature dictionary")
            frame = pd.DataFrame.from_dict(self.full_dict, orient='index')

            logger.info("Filling NaN values in the feature dataframe")
            preprocessing = self._config['PREPROCESSING']
            fill_method = preprocessing['fill_method']
            if fill_method == 'propagate':
                frame = frame.ffill()
            elif fill_method == 'linear':
                frame = frame.interpolate()
            elif fill_method == 'spline':
                frame = frame.interpolate(method='spline',
                                          order=preprocessing.getint('order'))

            # NaN values at the beginning of the dataframe are deliberately not
            # back-filled. This may not be a good idea, especially for features
            # that are available only for recent years, e.g., air quality

            # Only keep the rows located on the time grid
            self._dataframe = frame.loc[frame.index.isin(self._datetimes)]
        return self._dataframe


    @dataframe.setter
    def dataframe(self, df):
        self._dataframe = df
164
165