AND Private Git Repository - predictops.git/commitdiff
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
From dict to dataframe: done
author Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Sat, 15 Feb 2020 08:33:35 +0000 (09:33 +0100)
committer Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Sat, 15 Feb 2020 08:33:35 +0000 (09:33 +0100)
.gitignore
main.py
predictops/learn/preprocessing.py [new file with mode: 0644]
predictops/target/toarea.py

diff --git a/.gitignore b/.gitignore
index 82e566ca3c6e178d6249cd4b72928d7d0fe166be..69770e32cad6c4e19bdce4c5f558620c59598911 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,6 @@ __pycache__/
 
 **.py[cod]
 **$py.class
-data/
 archives/
 
 bonnes_pratiques.txt
diff --git a/main.py b/main.py
index b43c188627d52c218cb527d5dce3ee16dde1575d..fe8ed203fe9fd78a2f61b98fed7a0474a4b3d3bd 100644 (file)
--- a/main.py
+++ b/main.py
@@ -1,8 +1,9 @@
 from predictops.source.ephemeris import Ephemeris
 from predictops.source.meteofrance import MeteoFrance
+from predictops.learn.preprocessing import Preprocessing
 from predictops.target.toarea import ToArea
 
-from datetime import datetime
+from datetime import datetime, timedelta
 from logging import getLogger
 from logging.config import fileConfig
 from pathlib import Path
@@ -16,7 +17,6 @@ logger = getLogger()
 
 class Engine:
     def __init__(self, start = None, end = None, time_step = None):
-        logger.info("Predictops engine launched")
         self._X = {}
         self._Y = {}
 
@@ -69,11 +69,19 @@ engine.add_feature(name = 'ephemeris',
                    features = ['hour', 'dayInWeek', 'dayInMonth', 'dayInYear',
                                'weekInYear', 'month', 'year'])
 
-print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')])
+process = Preprocessing(dict_features = engine.X,
+                   start = start, end = end, timestep = timedelta(hours=1))
+
+process.fill_na()
+print(process.dataframe.head(n=20))
+#print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')])
+
+exit()
 
 depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp")
 Doubs = depts.loc[depts['nom'] == 'Doubs'].iloc[0]
 
 ToArea(area=Doubs.geometry,
-       start = start, end = end)
+       start = start, end = end,
+       csv_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv')
 
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py
new file mode 100644 (file)
index 0000000..b58ffac
--- /dev/null
+++ b/predictops/learn/preprocessing.py
@@ -0,0 +1,59 @@
+from itertools import chain
+from logging import getLogger
+from logging.config import fileConfig
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+class Preprocessing:
+    def __init__(self, dict_features,
+                 start, end, timestep,
+                 features = None):
+        self._dict_features = dict_features
+        self._start = start
+        self._end = end
+        self._timestep = timestep
+        self._dataframe = None
+
+        if features != None:
+            self._features = features
+        else:
+            self._features = set(chain.from_iterable([tuple(u.keys())
+                                                      for u in [*dict_features.values()]]))
+
+
+    def _fill_dict(self):
+        current = self._start
+        while current <= self._end:
+            if current not in self._dict_features:
+                self._dict_features[current] = {feature:np.NaN for feature in self._features}
+            else:
+                null_dict = {feature:np.NaN for feature in self._features}
+                null_dict.update(self._dict_features[current])
+                self._dict_features[current] = null_dict
+            current += self._timestep
+
+
+    @property
+    def full_dict(self):
+        self._fill_dict()
+        return {k: self._dict_features[k] for k in sorted(self._dict_features.keys())}
+
+
+    @property
+    def dataframe(self):
+        if self._dataframe is None:
+            self._dataframe = pd.DataFrame.from_dict(self.full_dict, orient='index')
+        return self._dataframe
+
+    @dataframe.setter
+    def dataframe(self, df):
+        self._dataframe = df
+
+
+    def fill_na(self):
+        self.dataframe = self.dataframe.fillna(method='ffill')
\ No newline at end of file
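diff --git a/predictops/target/toarea.py b/predictops/target/toarea.py
index 72a8ad03139e366e70d66f53f64e4e5ef2d4b3c2..1454e9d862a54d3d03ddf5acbbe7a597439ed6e9 100644 (file)
--- a/predictops/target/toarea.py
+++ b/predictops/target/toarea.py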
@@ -1,46 +1,24 @@
 from csv import DictReader
 from datetime import datetime
-from os import listdir
-from pathlib import Path
 
 class ToArea:
 
     def __init__(self, area = None,
                  start = datetime.strptime('19960101000000', '%Y%m%d%H%M%S'),
-                 end = datetime.now()):
-        self._get_located_interventions()
+                 end = datetime.now(),
+                 csv_file = None):
 
+        self._area = area
+        self._csv_file = csv_file
+        self._get_located_interventions()
 
 
     def _get_located_interventions(self):
-        self._data_directory = Path.cwd() / 'data' / 'targets' / 'sdis25'
-        self._dict_interv = {}
-        for year in range(2006,2018):
-            if year < 2012:
-                file_place = self._data_directory / 'interventions' / (str(year)+'.csv')
-            else:
-                file_place = self._data_directory / 'victims' / ('Liste_des_victimes_'+str(year)+'.csv')
-            with open(file_place, "r") as f:
-                reader = DictReader(f, delimiter='£')
-                for row in reader:
-                    self._dict_interv.update({
-                        row['N° intervention']: {
-                            'X' : row['Coord X'],
-                            'Y' : row['Coord Y']
-                        } for row in reader
-                    })
-        for csv_file in listdir(self._data_directory / 'interventions'):
-            with open(self._data_directory / 'interventions' / csv_file, "r") as f:
-                reader = DictReader(f, delimiter='£')
-                for row in reader:
-                    if row['N° intervention'] in self._dict_interv:
-                        self._dict_interv[row['N° intervention']].update(
-                            {
-                                'start': row['Début'],
-                                'end'  : row['Fin']
-                            })
-                    else:
-                        print(row['N° intervention'])
+        with open(self._csv_file) as f:
+            reader = DictReader(f, delimiter=',')
+            for row in reader:
+                print(row)
+