predictops/source/meteofrance.py

   1 from .source import Source
   2
   3 from configparser import ConfigParser
   4 from csv import DictReader
   5 from datetime import datetime
   6 from geopy.distance import vincenty
   7 from logging import getLogger
   8 from logging.config import fileConfig
   9 from os import listdir, remove
  10 from os.path import isfile, basename
  11 from pathlib import Path
  12 from shutil import rmtree
  13 from urllib.request import urlretrieve
  14
  15 import gzip
  16
  17
  18 fileConfig((Path.cwd() / 'config') / 'logging.cfg')
  19 logger = getLogger()
  20
  21 CSV_FILE = Path.cwd() / 'config' / 'features' / 'meteofrance_features.csv'
  22
  23
  24 class MeteoFrance(Source):
  25
  26     _latitude    = None
  27     _longitude   = None
  28     _nb_stations = None
  29     _start       = None
  30     _end         = None
  31     _features    = None
  32
  33     def __init__(self, config_file):
  34         '''
  35         Constructor of the MeteoFrance source of feature.
  36
  37         - It will reinitiate the data directory, if asked in the config
  38           features.cfg file.
  39         - It searches for the nb_stations meteo stations closest to the provided
  40           point (longitude and latitude)
  41
  42         For more information about this source of feature, see:
  43     https://donneespubliques.meteofrance.fr/?fond=produit&id_produit=90&id_rubrique=32
  44
  45         Parameters:
  46           - in config file:
  47             latitude (float): The latitude from which we want the meteo features.
  48             longitude (float): The longitude from which we want the meteo features.
  49             nb_stations (int): Number of closest stations to consider.
  50           - provided to the constructor
  51             features (list): Weather features that have to be integrated, according
  52                   to their names in meteofrance_features.csv (cf. config directory)
  53
  54         '''
  55         # Check for the integrity of feature names
  56         Source.__init__(self)
  57
  58         self._config = ConfigParser()
  59         self._config.read(config_file)
  60
  61         self._data_directory = (Path.cwd() / 'data') / 'features' / 'meteo_france'
  62
  63         self._dated_features = None
  64
  65         # Re-creating data directory architecture for MeteoFrance, if asked
  66         if self._config['GENERAL'].getboolean('regenerate'):
  67             self._regenerate_directory()
  68
  69         # Collecting the closest meteo station
  70         self._nb_stations = self._config['STATIONS'].getint('nb_stations')
  71         self._stations = self._get_stations()
  72
  73         # Collecting meteofrance features
  74         with open(CSV_FILE, "r") as f:
  75             reader = DictReader(f, delimiter=',')
  76             self._features = [row['name'] for row in reader
  77                               if self._config['FEATURES'].getboolean(row['name'])]
  78
  79
  80     @property
  81     def start(self):
  82         return self._start
  83
  84     @start.setter
  85     def start(self, x):
  86         self._start = x
  87
  88
  89     @property
  90     def end(self):
  91         return self._end
  92
  93     @end.setter
  94     def end(self, x):
  95         self._end = x
  96
  97
  98     @property
  99     def latitude(self):
 100         return self._latitude
 101
 102     @latitude.setter
 103     def latitude(self, x):
 104         self._latitude = x
 105
 106
 107     @property
 108     def longitude(self):
 109         return self._longitude
 110
 111     @longitude.setter
 112     def longitude(self, x):
 113         self._longitude = x
 114
 115
 116     @property
 117     def nb_stations(self):
 118         return self._nb_stations
 119
 120     @nb_stations.setter
 121     def nb_stations(self, x):
 122         self._nb_stations = x
 123
 124
 125     def _regenerate_directory(self):
 126         '''
 127         Re-creating data directory architecture for MeteoFrance
 128         '''
 129         logger.info("Regenerating meteofrance data directory")
 130         try:
 131             rmtree(self._data_directory)
 132         except:
 133             pass
 134         p = Path(self._data_directory / 'historical')
 135         p.mkdir(exist_ok=True, parents=True)
 136         p = Path(self._data_directory / 'config')
 137         p.mkdir(exist_ok=True, parents=True)
 138
 139
 140
 141     def _get_stations(self):
 142         '''
 143         Collect (after downloading them, if needed) the stations and their
 144         locations in a dictionary
 145
 146         Returns:
 147             list: The self._nb_stations closest station IDs, starting by the
 148                   closest one
 149         '''
 150         # The csv file of meteo stations (names, ids and locations) if downloaded,
 151         # if not available in the config directory within data / meteo_france
 152         link = 'https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/postesSynop.csv'
 153         p = Path(self._data_directory / 'config' )
 154         csv_file = p / basename(link)
 155         if not isfile(csv_file):
 156             logger.info('Downloading location stations from MeteoFrance')
 157             urlretrieve(link, csv_file)
 158
 159         # A dictionary for the meteo stations is created
 160         self._dict_stations = {}
 161         logger.info('Collecting information about meteo stations')
 162         with open(csv_file, "r") as f:
 163             reader = DictReader(f, delimiter=';')
 164             for row in reader:
 165                 latitude, longitude = eval(row['Latitude']), eval(row['Longitude'])
 166                 self._dict_stations[row['Nom'].replace("'",'’')] = {
 167                     'id' : row['ID'],
 168                     'longitude' : longitude,
 169                     'latitude' : latitude,
 170                     'distance' : vincenty(
 171                         (self._latitude, self._longitude),
 172                         (latitude, longitude)).km
 173                 }
 174
 175         # Find the closest stations
 176         logger.info('Finding the closest stations')
 177         stations_by_distance = sorted(self._dict_stations.keys(),
 178                                       key = lambda x: self._dict_stations[x]['distance'])
 179         logger.info(f'The {self._nb_stations} closest stations are: '
 180                     f'{", ".join(stations_by_distance[:self._nb_stations])}.')
 181         return [self._dict_stations[sta]['id'] for sta in stations_by_distance][:self._nb_stations]
 182
 183
 184
 185     def _collect_historical_data(self):
 186         '''
 187         We collect all csv files from January 1996 until the month
 188         before now. The argument in the url to download are of the
 189         form 201001 for January 2010. We start by computing all these
 190         patterns, in historical list.
 191         '''
 192         # List of year-months to consider
 193         historical = []
 194         date_end = self._end
 195         for year in range(self._start.year, date_end.year+1):
 196             for month in range(1,13):
 197                 date = datetime(year, month, 1)
 198                 if date >= self._start and date <= date_end:
 199                     historical.append(date.strftime("%Y%m"))
 200
 201         # We download all csv files from meteofrance that are not in
 202         # the data repository
 203         meteo_data = self._data_directory / 'historical'
 204         p = Path(meteo_data)
 205         p.mkdir(exist_ok=True, parents=True)
 206         for date in historical:
 207             if not isfile(meteo_data / ('synop.'+date+'.csv')):
 208                 link = 'https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archive/synop.'
 209                 link += date + '.csv.gz'
 210                 download_path = meteo_data / basename(link)
 211                 urlretrieve(link, download_path)
 212                 with gzip.open(download_path, 'rb') as f:
 213                     csv_file = meteo_data / basename(link[:-3])
 214                     with open(csv_file, 'w') as g:
 215                         g.write(f.read().decode())
 216                         remove(meteo_data / basename(link))
 217
 218
 219
 220     def update(self):
 221         '''
 222         Update the MeteoFrance features with the last available data
 223         '''
 224         # We collect archive files from MeteoFrance, until the current month
 225         # by using the same method than for data generation : this is currently
 226         # based on the presence of a synop.+date+.csv' file in the
 227         # data/meteo_france/historical directory. The file corresponding to the
 228         # current month is deleted first, so that its most recent version will
 229         # be downloaded by calling self._collect_historical_data
 230
 231         logger.info('Update historical csv files from MeteoFrance, if needed')
 232         today = datetime.now()
 233         todel = 'synop.'+today.strftime("%Y%m")+".csv"
 234         try:
 235             remove(self._data_directory / 'historical' / todel)
 236         except:
 237             logger.warning(f"{self._data_directory / 'historical' / todel} not found")
 238         self._collect_historical_data()
 239
 240
 241
 242     @property
 243     def dated_features(self):
 244         '''
 245         If the attribute dated_features is None, then we create it: a dictionary
 246         with datestamps as keys, and {features: values} as values.
 247          - considered features are the ones from meteofrance_features.csv, found
 248            in config/features/meteofrance directory
 249          - only the closest meteo stations are considered
 250
 251         Returns:
 252             dict: the dictionary of features per datestamp
 253         '''
 254         if self._dated_features == None:
 255             logger.info(f'Collecting meteo feature information from {CSV_FILE}')
 256             # A dictionary for the features
 257             with open(CSV_FILE, "r") as f:
 258                 reader = DictReader(f, delimiter=',')
 259                 dico_features = {row["abbreviation"]:
 260                                    {
 261                                        'name': row['name'], # feature name
 262                                        'type': row['type']  # qualitative (2) or quantitative (1)
 263                                     }
 264                                 for row in reader if row['name'] in self._features}
 265                 #print([row for row in reader])
 266                 #print([row for row in reader if row['name'] in self._features])
 267             dir_data = Path.cwd() / 'data' / 'features' / 'meteo_france' / 'historical'
 268             self._dated_features = {}
 269             for csv_meteo in listdir(dir_data):
 270                 date = datetime.strptime(csv_meteo.split('.')[1], '%Y%m')
 271                 if (date >= self._start and date <= self._end)\
 272                 or (date.year == self._start.year and date.month == self._start.month)\
 273                 or (date.year == self._end.year and date.month == self._end.month):
 274                     logger.info(f'Inserting {csv_meteo} in intervention dictionary')
 275                     with open(dir_data / csv_meteo, "r") as f:
 276                         reader = DictReader(f, delimiter=';')
 277                         for row in reader:
 278                             if row['numer_sta'] in self._stations:
 279                                 date = datetime.strptime(row['date'], '%Y%m%d%H%M%S')
 280                                 if date  >= self._start and date <= self._end:
 281                                     self._dated_features.setdefault(date,{}).update({dico_features[feat]['name']+'_'+str(self._stations.index(row['numer_sta'])): eval(row[feat].replace('mq','None')) for feat in dico_features})
 282         return self._dated_features
 283