self._X = x
-start = datetime.strptime('01/01/2010 00:00:00', '%m/%d/%Y %H:%M:%S')
+start = datetime.strptime('01/01/2010 01:00:00', '%m/%d/%Y %H:%M:%S')
end = datetime.strptime('12/31/2010 23:00:00', '%m/%d/%Y %H:%M:%S')
engine = Engine()
latitude = 47.25, longitude = 6.0333, nb_stations = 3,
features = ['temperature', 'pressure'])
+
engine.add_feature(name = 'ephemeris',
start = start, end = end,
features = ['hour', 'dayInWeek', 'dayInMonth', 'dayInYear',
'weekInYear', 'month', 'year'])
+
process = Preprocessing(dict_features = engine.X,
- start = start, end = end, timestep = timedelta(hours=1))
+ start = start, end = end, timestep = timedelta(hours=6))
-process.fill_na()
-print(process.dataframe.head(n=20))
-#print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')])
+df = process.dataframe.head(n=20)
+#print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')])
+print(df)
exit()
depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp")
logger = getLogger()
class Preprocessing:
+ '''
+ Generate a pandas dataframe from a dictionary of features per datetime, which
+ respects the starting and ending dates of the study, and its precision (the
+ time step) as passed to the constructor. Missing feature values are completed.
+
+ - Missing datetimes are added first with np.NaN feature values,
+ - The dataframe is then constructed based on the filled feature dictionary,
+ - NaN values are then filled with last known values.
+
+ '''
def __init__(self, dict_features,
start, end, timestep,
features = None):
+ '''
+ Constructor that defines all needed attributes and collects features.
+ '''
+ logger.info("Entering NaN values in the feature dataframe")
self._dict_features = dict_features
self._start = start
self._end = end
self._timestep = timestep
+ self._full_dict = None
self._dataframe = None
-
+ self._datetimes = []
+ # If features are not provided to the constructor, then we collect
+ # any existing feature in the dictionary
if features != None:
self._features = features
else:
def _fill_dict(self):
    '''
    Complete the dated feature dictionary so that it describes a regular
    table.

    - Every datetime from self._start to self._end (both included), walked
      in steps of self._timestep, is recorded in self._datetimes and added
      to self._dict_features when it is missing.
    - Every entry of self._dict_features, including datetimes that are not
      on that grid, receives a NaN value for each feature of self._features
      it does not define yet.
    - self._full_dict is set to the completed dictionary, ordered by
      datetime.

    self._dict_features is updated in place.

    Raises ValueError if adding the time step does not move forward in
    time, since the grid could then never reach self._end.
    '''
    logger.info("Adding missing dates and filling missing features with NaN values")

    # A zero or negative step would make the loop below run forever.
    if self._start + self._timestep <= self._start:
        raise ValueError("timestep must be strictly positive")

    # np.nan is the spelling available in every NumPy release (the np.NaN
    # alias no longer exists in NumPy 2.0).
    null_dict = {feature: np.nan for feature in self._features}

    # Rebuild the grid from scratch so that calling this method again does
    # not record every datetime twice.
    self._datetimes = []
    current = self._start
    while current <= self._end:
        self._datetimes.append(current)
        self._dict_features.setdefault(current, {})
        current += self._timestep

    # One pass over all the keys completes both the datetimes just added
    # and the pre-existing ones (on the grid or not). Known values take
    # precedence over the NaN placeholders.
    for k in self._dict_features:
        self._dict_features[k] = {**null_dict, **self._dict_features[k]}

    self._full_dict = {k: self._dict_features[k]
                       for k in sorted(self._dict_features)}
+
@property
def full_dict(self):
    '''
    Dated feature dictionary, completed and ordered by datetime.

    The dictionary is produced by _fill_dict() the first time it is
    requested; later accesses return the stored result.
    '''
    already_filled = self._full_dict is not None
    if not already_filled:
        self._fill_dict()
    return self._full_dict
+
@property
def dataframe(self):
    '''
    Returns the feature dataframe, after creating it if needed.

    On first access the dataframe is built from self.full_dict (one row per
    datetime). NaN values are replaced by the last known value, then by the
    next known value for NaNs that have no predecessor. Rows whose datetime
    is not in self._datetimes are finally discarded. The result is stored in
    self._dataframe and reused by later accesses.
    '''
    if self._dataframe is None:
        logger.info("Creating feature dataframe from feature dictionary")
        # self.full_dict is read before self._datetimes on purpose: the
        # full_dict property runs _fill_dict(), which populates the grid.
        frame = pd.DataFrame.from_dict(self.full_dict, orient='index')
        logger.info("Filling NaN values in the feature dataframe")
        #TODO: add other filling methods like linear interpolation
        # ffill()/bfill() replace fillna(method=...), which pandas has
        # deprecated.
        frame = frame.ffill().bfill()
        # Filtering comes after filling so that values observed at
        # datetimes outside the grid still propagate to the kept rows.
        frame = frame.loc[frame.index.isin(self._datetimes)]
        # Store only the finished frame: a failure above must not leave a
        # half-processed dataframe cached.
        self._dataframe = frame
    return self._dataframe
@dataframe.setter
self._dataframe = df
- def fill_na(self):
- self.dataframe = self.dataframe.fillna(method='ffill')
\ No newline at end of file
self._dated_features = {}
for csv_meteo in listdir(dir_data):
date = datetime.strptime(csv_meteo.split('.')[1], '%Y%m')
- if date >= self._start and date <= self._end:
+ if (date >= self._start and date <= self._end)\
+ or (date.year == self._start.year and date.month == self._start.month)\
+ or (date.year == self._end.year and date.month == self._end.month):
logger.info(f'Inserting {csv_meteo} in intervention dictionary')
with open(dir_data / csv_meteo, "r") as f:
reader = DictReader(f, delimiter=';')
for row in reader:
if row['numer_sta'] in self._stations:
date = datetime.strptime(row['date'], '%Y%m%d%H%M%S')
- self._dated_features.setdefault(date,{}).update({dico_features[feat]['name']+'_'+str(self._stations.index(row['numer_sta'])): eval(row[feat].replace('mq','None')) for feat in dico_features})
+ if date >= self._start and date <= self._end:
+ self._dated_features.setdefault(date,{}).update({dico_features[feat]['name']+'_'+str(self._stations.index(row['numer_sta'])): eval(row[feat].replace('mq','None')) for feat in dico_features})
return self._dated_features