From: Christophe Guyeux Date: Sat, 15 Feb 2020 08:33:35 +0000 (+0100) Subject: From dict to dataframe: done X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/predictops.git/commitdiff_plain/910a056eaa0181df00d21fa836f3c68504051717?ds=sidebyside;hp=3549f1fd0518f1c52df39247cc472b6158485ede From dict to dataframe: done --- diff --git a/.gitignore b/.gitignore index 82e566c..69770e3 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,6 @@ __pycache__/ **.py[cod] **$py.class -data/ archives/ bonnes_pratiques.txt diff --git a/main.py b/main.py index b43c188..fe8ed20 100644 --- a/main.py +++ b/main.py @@ -1,8 +1,9 @@ from predictops.source.ephemeris import Ephemeris from predictops.source.meteofrance import MeteoFrance +from predictops.learn.preprocessing import Preprocessing from predictops.target.toarea import ToArea -from datetime import datetime +from datetime import datetime, timedelta from logging import getLogger from logging.config import fileConfig from pathlib import Path @@ -16,7 +17,6 @@ logger = getLogger() class Engine: def __init__(self, start = None, end = None, time_step = None): - logger.info("Predictops engine launched") self._X = {} self._Y = {} @@ -69,11 +69,19 @@ engine.add_feature(name = 'ephemeris', features = ['hour', 'dayInWeek', 'dayInMonth', 'dayInYear', 'weekInYear', 'month', 'year']) -print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')]) +process = Preprocessing(dict_features = engine.X, + start = start, end = end, timestep = timedelta(hours=1)) + +process.fill_na() +print(process.dataframe.head(n=20)) +#print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')]) + +exit() depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp") Doubs = depts.loc[depts['nom'] == 'Doubs'].iloc[0] ToArea(area=Doubs.geometry, - start = start, end = end) + start = start, end = end, + csv_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv') diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py new file mode 100644 index 0000000..b58ffac --- /dev/null +++ b/predictops/learn/preprocessing.py @@ -0,0 +1,59 @@ +from itertools import chain +from logging import getLogger +from logging.config import fileConfig +from pathlib import Path + +import numpy as np +import pandas as pd + +fileConfig((Path.cwd() / 'config') / 'logging.cfg') +logger = getLogger() + +class Preprocessing: + def __init__(self, dict_features, + start, end, timestep, + features = None): + self._dict_features = dict_features + self._start = start + self._end = end + self._timestep = timestep + self._dataframe = None + + if features != None: + self._features = features + else: + self._features = set(chain.from_iterable([tuple(u.keys()) + for u in [*dict_features.values()]])) + + + def _fill_dict(self): + current = self._start + while current <= self._end: + if current not in self._dict_features: + self._dict_features[current] = {feature:np.NaN for feature in self._features} + else: + null_dict = {feature:np.NaN for feature in self._features} + null_dict.update(self._dict_features[current]) + self._dict_features[current] = null_dict + current += self._timestep + + + @property + def full_dict(self): + self._fill_dict() + return {k: self._dict_features[k] for k in sorted(self._dict_features.keys())} + + + @property + def dataframe(self): + if self._dataframe is None: + self._dataframe = pd.DataFrame.from_dict(self.full_dict, orient='index') + return self._dataframe + + @dataframe.setter + def dataframe(self, df): + self._dataframe = df + + + def fill_na(self): + self.dataframe = self.dataframe.fillna(method='ffill') \ No newline at end of file diff --git a/predictops/target/toarea.py b/predictops/target/toarea.py index 72a8ad0..1454e9d 100644 --- a/predictops/target/toarea.py +++ b/predictops/target/toarea.py @@ -1,46 +1,24 @@ from csv import DictReader from datetime import datetime -from os import listdir -from pathlib import Path class ToArea: def __init__(self, area = None, start = datetime.strptime('19960101000000', '%Y%m%d%H%M%S'), - end = datetime.now()): - self._get_located_interventions() + end = datetime.now(), + csv_file = None): + self._area = area + self._csv_file = csv_file + self._get_located_interventions() def _get_located_interventions(self): - self._data_directory = Path.cwd() / 'data' / 'targets' / 'sdis25' - self._dict_interv = {} - for year in range(2006,2018): - if year < 2012: - file_place = self._data_directory / 'interventions' / (str(year)+'.csv') - else: - file_place = self._data_directory / 'victims' / ('Liste_des_victimes_'+str(year)+'.csv') - with open(file_place, "r") as f: - reader = DictReader(f, delimiter='£') - for row in reader: - self._dict_interv.update({ - row['N° intervention']: { - 'X' : row['Coord X'], - 'Y' : row['Coord Y'] - } for row in reader - }) - for csv_file in listdir(self._data_directory / 'interventions'): - with open(self._data_directory / 'interventions' / csv_file, "r") as f: - reader = DictReader(f, delimiter='£') - for row in reader: - if row['N° intervention'] in self._dict_interv: - self._dict_interv[row['N° intervention']].update( - { - 'start': row['Début'], - 'end' : row['Fin'] - }) - else: - print(row['N° intervention']) + with open(self._csv_file) as f: + reader = DictReader(f, delimiter=',') + for row in reader: + print(row) +