From dict to dataframe: done

author Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>

Sat, 15 Feb 2020 08:33:35 +0000 (09:33 +0100)

committer Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>

Sat, 15 Feb 2020 08:33:35 +0000 (09:33 +0100)
author Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Sat, 15 Feb 2020 08:33:35 +0000 (09:33 +0100)
committer Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
Sat, 15 Feb 2020 08:33:35 +0000 (09:33 +0100)
diff --git a/.gitignore b/.gitignore

index 82e566ca3c6e178d6249cd4b72928d7d0fe166be..69770e32cad6c4e19bdce4c5f558620c59598911 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,6 @@ __pycache__/
  
  **.py[cod]
  **$py.class
-data/
  archives/
  
  bonnes_pratiques.txt
diff --git a/main.py b/main.py

index b43c188627d52c218cb527d5dce3ee16dde1575d..fe8ed203fe9fd78a2f61b98fed7a0474a4b3d3bd 100644 (file)
--- a/main.py
+++ b/main.py
@@ -1,8 +1,9 @@
  from predictops.source.ephemeris import Ephemeris
  from predictops.source.meteofrance import MeteoFrance
+from predictops.learn.preprocessing import Preprocessing
  from predictops.target.toarea import ToArea
  
-from datetime import datetime
+from datetime import datetime, timedelta
  from logging import getLogger
  from logging.config import fileConfig
  from pathlib import Path
@@ -16,7 +17,6 @@ logger = getLogger()
  
  class Engine:
      def __init__(self, start = None, end = None, time_step = None):
-        logger.info("Predictops engine launched")
          self._X = {}
          self._Y = {}
  
@@ -69,11 +69,19 @@ engine.add_feature(name = 'ephemeris',
                     features = ['hour', 'dayInWeek', 'dayInMonth', 'dayInYear',
                                 'weekInYear', 'month', 'year'])
  
-print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')])
+process = Preprocessing(dict_features = engine.X,
+                   start = start, end = end, timestep = timedelta(hours=1))
+
+process.fill_na()
+print(process.dataframe.head(n=20))
+#print(engine.X[datetime.strptime('06/30/2010 21:00:00', '%m/%d/%Y %H:%M:%S')])
+
+exit()
  
  depts = gpd.read_file( Path.cwd() / 'data' / 'targets' / 'departments' / "departements-20180101.shp")
  Doubs = depts.loc[depts['nom'] == 'Doubs'].iloc[0]
  
  ToArea(area=Doubs.geometry,
-       start = start, end = end)
+       start = start, end = end,
+       csv_file = Path.cwd() / 'data' / 'targets' / 'sdis25' / 'interventions.csv')
  
diff --git a/predictops/learn/preprocessing.py b/predictops/learn/preprocessing.py

new file mode 100644 (file)

index 0000000..b58ffac
--- /dev/null
+++ b/predictops/learn/preprocessing.py
@@ -0,0 +1,59 @@
+from itertools import chain
+from logging import getLogger
+from logging.config import fileConfig
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+fileConfig((Path.cwd() / 'config') / 'logging.cfg')
+logger = getLogger()
+
+class Preprocessing:
+    def __init__(self, dict_features,
+                 start, end, timestep,
+                 features = None):
+        self._dict_features = dict_features
+        self._start = start
+        self._end = end
+        self._timestep = timestep
+        self._dataframe = None
+
+        if features != None:
+            self._features = features
+        else:
+            self._features = set(chain.from_iterable([tuple(u.keys())
+                                                      for u in [*dict_features.values()]]))
+
+
+    def _fill_dict(self):
+        current = self._start
+        while current <= self._end:
+            if current not in self._dict_features:
+                self._dict_features[current] = {feature:np.NaN for feature in self._features}
+            else:
+                null_dict = {feature:np.NaN for feature in self._features}
+                null_dict.update(self._dict_features[current])
+                self._dict_features[current] = null_dict
+            current += self._timestep
+
+
+    @property
+    def full_dict(self):
+        self._fill_dict()
+        return {k: self._dict_features[k] for k in sorted(self._dict_features.keys())}
+
+
+    @property
+    def dataframe(self):
+        if self._dataframe is None:
+            self._dataframe = pd.DataFrame.from_dict(self.full_dict, orient='index')
+        return self._dataframe
+
+    @dataframe.setter
+    def dataframe(self, df):
+        self._dataframe = df
+
+
+    def fill_na(self):
+        self.dataframe = self.dataframe.fillna(method='ffill')
\ No newline at end of file
diff --git a/predictops/target/toarea.py b/predictops/target/toarea.py

index 72a8ad03139e366e70d66f53f64e4e5ef2d4b3c2..1454e9d862a54d3d03ddf5acbbe7a597439ed6e9 100644 (file)
--- a/predictops/target/toarea.py
+++ b/predictops/target/toarea.py
@@ -1,46 +1,24 @@
  from csv import DictReader
  from datetime import datetime
-from os import listdir
-from pathlib import Path
  
  class ToArea:
  
      def __init__(self, area = None,
                   start = datetime.strptime('19960101000000', '%Y%m%d%H%M%S'),
-                 end = datetime.now()):
-        self._get_located_interventions()
+                 end = datetime.now(),
+                 csv_file = None):
  
+        self._area = area
+        self._csv_file = csv_file
+        self._get_located_interventions()
  
  
      def _get_located_interventions(self):
-        self._data_directory = Path.cwd() / 'data' / 'targets' / 'sdis25'
-        self._dict_interv = {}
-        for year in range(2006,2018):
-            if year < 2012:
-                file_place = self._data_directory / 'interventions' / (str(year)+'.csv')
-            else:
-                file_place = self._data_directory / 'victims' / ('Liste_des_victimes_'+str(year)+'.csv')
-            with open(file_place, "r") as f:
-                reader = DictReader(f, delimiter='£')
-                for row in reader:
-                    self._dict_interv.update({
-                        row['N° intervention']: {
-                            'X' : row['Coord X'],
-                            'Y' : row['Coord Y']
-                        } for row in reader
-                    })
-        for csv_file in listdir(self._data_directory / 'interventions'):
-            with open(self._data_directory / 'interventions' / csv_file, "r") as f:
-                reader = DictReader(f, delimiter='£')
-                for row in reader:
-                    if row['N° intervention'] in self._dict_interv:
-                        self._dict_interv[row['N° intervention']].update(
-                            {
-                                'start': row['Début'],
-                                'end'  : row['Fin']
-                            })
-                    else:
-                        print(row['N° intervention'])
+        with open(self._csv_file) as f:
+            reader = DictReader(f, delimiter=',')
+            for row in reader:
+                print(row)
+
author	Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
	Sat, 15 Feb 2020 08:33:35 +0000 (09:33 +0100)
committer	Christophe Guyeux <christophe.guyeux@univ-fcomte.fr>
	Sat, 15 Feb 2020 08:33:35 +0000 (09:33 +0100)
.gitignore		patch | blob | history
main.py		patch | blob | history
predictops/learn/preprocessing.py	[new file with mode: 0644]	patch | blob
predictops/target/toarea.py		patch | blob | history