From: Christophe Guyeux Date: Sat, 15 Feb 2020 10:02:10 +0000 (+0100) Subject: Improving csv -> dataframe module X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/predictops.git/commitdiff_plain/d6469a787c80df2c938f21d4ae107b84213e238f?ds=sidebyside;hp=910a056eaa0181df00d21fa836f3c68504051717 Improving csv -> dataframe module --- diff --git a/main.py b/main.py index fe8ed20..a42ce0f 100644 --- a/main.py +++ b/main.py @@ -55,7 +55,7 @@ class Engine: self._X = x -start = datetime.strptime('01/01/2010 00:00:00', '%m/%d/%Y %H:%M:%S') +start = datetime.strptime('01/01/2010 01:00:00', '%m/%d/%Y %H:%M:%S') end = datetime.strptime('12/31/2010 23:00:00', '%m/%d/%Y %H:%M:%S') engine = Engine() @@ -64,18 +64,20 @@ engine.add_feature(name = 'meteofrance', latitude = 47.25, longitude = 6.0333, nb_stations = 3, features = ['temperature', 'pressure']) + engine.add_feature(name = 'ephemeris', start = start, end = end, features = ['hour', 'dayInWeek', 'dayInMonth', 'dayInYear', 'weekInYear', 'month', 'year']) + process = Preprocessing(dict_features = engine.X, - start = start, end = end, timestep = timedelta(hours=1)) + start = start, end = end, timestep = timedelta(hours=6)) -process.fill_na() -print(process.dataframe.head(n=20)) -#print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')]) +df = process.dataframe.head(n=20) +#print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')]) +print(df) exit() depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp") diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py index b58ffac..833e483 100644 --- a/predictops/learn/preprocessing.py +++ b/predictops/learn/preprocessing.py @@ -10,15 +10,32 @@ fileConfig((Path.cwd() / 'config') / 'logging.cfg') logger = getLogger() class Preprocessing: + ''' + Generate a pandas dataframe from a dictionary of features per datetime, which + respects the starting and ending dates of the study, and its precision (the + time step) as passed to the constructor. Missing feature values are completed. + + - Missing datetimes are added first with np.NaN feature values, + - The dataframe is then constructed based on the filled feature dictionary, + - NaN values are then filled with last known values. + + ''' def __init__(self, dict_features, start, end, timestep, features = None): + ''' + Constructor that defines all needed attributes and collects features. + ''' + logger.info("Entering NaN values in the feature dataframe") self._dict_features = dict_features self._start = start self._end = end self._timestep = timestep + self._full_dict = None self._dataframe = None - + self._datetimes = [] + # If features are not provided to the constructor, then we collect + # any existing feature in the dictionary if features != None: self._features = features else: @@ -27,27 +44,62 @@ class Preprocessing: def _fill_dict(self): + ''' + Add datetime keys in the dated feature dictionary that are missing. The + features are then set to np.NaN. Add missing features in existing datetimes + too. + ''' + logger.info("Adding missing dates and filling missing features with NaN values") current = self._start while current <= self._end: + self._datetimes.append(current) if current not in self._dict_features: - self._dict_features[current] = {feature:np.NaN for feature in self._features} + self._dict_features[current] = {feature:np.NaN + for feature in self._features} else: - null_dict = {feature:np.NaN for feature in self._features} + null_dict = {feature:np.NaN + for feature in self._features} null_dict.update(self._dict_features[current]) self._dict_features[current] = null_dict current += self._timestep + for k in self._dict_features: + null_dict = {feature:np.NaN + for feature in self._features} + null_dict.update(self._dict_features[k]) + self._dict_features[k] = null_dict + + self._full_dict = {k: self._dict_features[k] + for k in sorted(self._dict_features.keys())} + @property def full_dict(self): - self._fill_dict() - return {k: self._dict_features[k] for k in sorted(self._dict_features.keys())} + ''' + Returns the fully filled dated feature dictionary, ordered by datetimes + ''' + if self._full_dict is None: + self._fill_dict() + return self._full_dict + @property def dataframe(self): + ''' + Returns the feature dataframe, after creating it if needed. + ''' if self._dataframe is None: - self._dataframe = pd.DataFrame.from_dict(self.full_dict, orient='index') + logger.info("Creating feature dataframe from feature dictionary") + self._dataframe = pd.DataFrame.from_dict(self.full_dict, + orient='index') + logger.info("Filling NaN values in the feature dataframe") + #TODO: add other filling methods like linear interpolation + self._dataframe = self._dataframe.fillna(method='ffill') + self._dataframe = self._dataframe.fillna(method='bfill') + self._dataframe = self._dataframe.drop([k.to_pydatetime() + for k in self._dataframe.T + if k not in self._datetimes]) return self._dataframe @dataframe.setter @@ -55,5 +107,3 @@ class Preprocessing: self._dataframe = df - def fill_na(self): - self.dataframe = self.dataframe.fillna(method='ffill') \ No newline at end of file diff --git a/predictops/source/meteofrance.py b/predictops/source/meteofrance.py index 2326e16..5a885ee 100644 --- a/predictops/source/meteofrance.py +++ b/predictops/source/meteofrance.py @@ -208,13 +208,16 @@ class MeteoFrance: self._dated_features = {} for csv_meteo in listdir(dir_data): date = datetime.strptime(csv_meteo.split('.')[1], '%Y%m') - if date >= self._start and date <= self._end: + if (date >= self._start and date <= self._end)\ + or (date.year == self._start.year and date.month == self._start.month)\ + or (date.year == self._end.year and date.month == self._end.month): logger.info(f'Inserting {csv_meteo} in intervention dictionary') with open(dir_data / csv_meteo, "r") as f: reader = DictReader(f, delimiter=';') for row in reader: if row['numer_sta'] in self._stations: date = datetime.strptime(row['date'], '%Y%m%d%H%M%S') - self._dated_features.setdefault(date,{}).update({dico_features[feat]['name']+'_'+str(self._stations.index(row['numer_sta'])): eval(row[feat].replace('mq','None')) for feat in dico_features}) + if date >= self._start and date <= self._end: + self._dated_features.setdefault(date,{}).update({dico_features[feat]['name']+'_'+str(self._stations.index(row['numer_sta'])): eval(row[feat].replace('mq','None')) for feat in dico_features}) return self._dated_features