diff --git a/mongiris/__init__.py b/mongiris/__init__.py deleted file mode 100644 index 70f0388d609953c032c4804fce7d9712de31e35c..0000000000000000000000000000000000000000 --- a/mongiris/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from .mongiris import Mongiris - -# all modules -__all__ = ['Mongiris', 'config', 'integrator'] - - diff --git a/mongiris/config.py b/mongiris/config.py index ec0614d67014aff07c21a072f6fe24674e564b2d..b6ac1d19ca028b9eea765022273538ef60acfc21 100644 --- a/mongiris/config.py +++ b/mongiris/config.py @@ -4,28 +4,38 @@ # Configuration file # ============================================================================= +from os import path + # mongoDB parameters database_iris = "dbhil" # name of the database -collection_iris = "colliris" # collection containing the ~50,000 IRIS -collection_meta = "collmeta" # collection containing metadata information (eg, dict of indicators) +collection_iris = "colliris" # collection containing the ~50,000 IRIS and values for indicators +collection_indic = "collindic" # collection containing metadata information about indicators (short label, long label) +collection_sources = "collsources" # collection containing metadata information about sources (filepath, title, etc.) max_timeout = 3 # delay before connection timeout +# data parameters +insee_dir = path.join('data', 'insee', '2019-01') # the date indicates the last check for updates of INSEE files + # labels for geojson iris files and for the json indicators dictionary -geojson_indicators_label = 'raw_indicators' -geojson_insee_files_label = 'insee_files' -geojson_shortname_label = 'short_fieldname' -geojson_longname_label = 'long_fieldname' +geojson_raw_indicators_label = 'raw_indicators' geojson_grouped_indicators_label = 'grouped_indicators' +geojson_insee_files_label = 'insee_files' +geojson_shortname_label = 'short_label' +geojson_longname_label = 'full_label' +geojson_from_files_label = 'from_insee_files' -indicators_dictionary_init = {'CODE_IRIS': {"long_fieldname": 'Code IRIS'}, 'NOM_IRIS': {"long_fieldname": 'Nom IRIS'}, - 'INSEE_COM': {"long_fieldname": 'Code postal commune'}, - 'NOM_COM': {"long_fieldname": 'Nom commune'}, - 'TYP_IRIS': {"long_fieldname": 'Type IRIS'}, 'REG': {"long_fieldname": 'Code région'}, - 'DEP': {"long_fieldname": 'Code département'}} -# some properties (found in XLS INSEE files) are actually data about IRIS (and not indicators) -metadata_about_iris = ['DEP', 'REG', 'TRIRIS'] +# the following labels are metadata about an IRIS, not indicators +labels_dictionary_init = {'CODE_IRIS': {"long_fieldname": 'Code IRIS'}, 'NOM_IRIS': {"long_fieldname": 'Nom IRIS'}, + 'INSEE_COM': {"long_fieldname": 'Code postal commune'}, + 'NOM_COM': {"long_fieldname": 'Nom commune'}, 'IRIS': {"long_fieldname": 'Code IRIS'}, + 'TYP_IRIS': {"long_fieldname": 'Type IRIS'}, 'REG': {"long_fieldname": 'Code région'}, + 'DEP': {"long_fieldname": 'Code département'}, 'TRIRIS': {"long_fieldname": 'TRIRIS'}, + 'UU2010': {"long_fieldname": 'Unité urbaine'}, 'LIBIRIS': {"long_fieldname": 'Libellé IRIS'}, + 'GRD_QUART': {"long_fieldname": 'Grand quartier'}, 'LAB_IRIS': {"long_fieldname": 'Label qualité IRIS'}, + 'MODIF_IRIS': {"long_fieldname": 'Type de modification de IRIS'}, + } -list_grouped_indicators = ['logement-resident', 'education-superieur-prive', 'animation-culturel', +labels_grouped_indicators = ['logement-resident', 'education-superieur-prive', 'animation-culturel', 'education-secondaire-cycle1-public', 'education-secondaire-cycle2-professionnel-public', 'animation-commerce-nonalimentaire', 'education-primaire-prive', 'espacevert', 'service-sante', 'service-divers-public', 'education-secondaire-cycle2-professionnel-prive', diff --git a/mongiris/data/insee/action-sociale-2016.xls b/mongiris/data/insee/2019-01/action-sociale-2016.xls similarity index 100% rename from mongiris/data/insee/action-sociale-2016.xls rename to mongiris/data/insee/2019-01/action-sociale-2016.xls diff --git a/mongiris/data/insee/activite-residents-2014.xls b/mongiris/data/insee/2019-01/activite-residents-2014.xls similarity index 100% rename from mongiris/data/insee/activite-residents-2014.xls rename to mongiris/data/insee/2019-01/activite-residents-2014.xls diff --git a/mongiris/data/insee/commerces-2016.xls b/mongiris/data/insee/2019-01/commerces-2016.xls similarity index 100% rename from mongiris/data/insee/commerces-2016.xls rename to mongiris/data/insee/2019-01/commerces-2016.xls diff --git a/mongiris/data/insee/diplomes-formation-2014.xls b/mongiris/data/insee/2019-01/diplomes-formation-2014.xls similarity index 100% rename from mongiris/data/insee/diplomes-formation-2014.xls rename to mongiris/data/insee/2019-01/diplomes-formation-2014.xls diff --git a/mongiris/data/insee/education-colleges-lycees-2016.xls b/mongiris/data/insee/2019-01/education-colleges-lycees-2016.xls similarity index 100% rename from mongiris/data/insee/education-colleges-lycees-2016.xls rename to mongiris/data/insee/2019-01/education-colleges-lycees-2016.xls diff --git a/mongiris/data/insee/education-ecoles-2016.xls b/mongiris/data/insee/2019-01/education-ecoles-2016.xls similarity index 100% rename from mongiris/data/insee/education-ecoles-2016.xls rename to mongiris/data/insee/2019-01/education-ecoles-2016.xls diff --git a/mongiris/data/insee/education-univ-2016.xls b/mongiris/data/insee/2019-01/education-univ-2016.xls similarity index 100% rename from mongiris/data/insee/education-univ-2016.xls rename to mongiris/data/insee/2019-01/education-univ-2016.xls diff --git a/mongiris/data/insee/familles-menages-2014.xls b/mongiris/data/insee/2019-01/familles-menages-2014.xls similarity index 100% rename from mongiris/data/insee/familles-menages-2014.xls rename to mongiris/data/insee/2019-01/familles-menages-2014.xls diff --git a/mongiris/data/insee/fr-en-annuaire-education.geojson b/mongiris/data/insee/2019-01/fr-en-annuaire-education.geojson similarity index 100% rename from mongiris/data/insee/fr-en-annuaire-education.geojson rename to mongiris/data/insee/2019-01/fr-en-annuaire-education.geojson diff --git a/mongiris/data/insee/fr-en-reussite-au-baccalaureat-origine-sociale.json b/mongiris/data/insee/2019-01/fr-en-reussite-au-baccalaureat-origine-sociale.json similarity index 100% rename from mongiris/data/insee/fr-en-reussite-au-baccalaureat-origine-sociale.json rename to mongiris/data/insee/2019-01/fr-en-reussite-au-baccalaureat-origine-sociale.json diff --git a/mongiris/data/insee/insee-geo-ontologie.rdf b/mongiris/data/insee/2019-01/insee-geo-ontologie.rdf similarity index 100% rename from mongiris/data/insee/insee-geo-ontologie.rdf rename to mongiris/data/insee/2019-01/insee-geo-ontologie.rdf diff --git a/mongiris/data/insee/insee-geo-ontologie.ttl b/mongiris/data/insee/2019-01/insee-geo-ontologie.ttl similarity index 100% rename from mongiris/data/insee/insee-geo-ontologie.ttl rename to mongiris/data/insee/2019-01/insee-geo-ontologie.ttl diff --git a/mongiris/data/insee/logement-2014.xls b/mongiris/data/insee/2019-01/logement-2014.xls similarity index 100% rename from mongiris/data/insee/logement-2014.xls rename to mongiris/data/insee/2019-01/logement-2014.xls diff --git a/mongiris/data/insee/medical-para-2016.xls b/mongiris/data/insee/2019-01/medical-para-2016.xls similarity index 100% rename from mongiris/data/insee/medical-para-2016.xls rename to mongiris/data/insee/2019-01/medical-para-2016.xls diff --git a/mongiris/data/insee/mobilite-residentielle-2015.xls b/mongiris/data/insee/2019-01/mobilite-residentielle-2015.xls similarity index 100% rename from mongiris/data/insee/mobilite-residentielle-2015.xls rename to mongiris/data/insee/2019-01/mobilite-residentielle-2015.xls diff --git a/mongiris/data/insee/population-2014.xls b/mongiris/data/insee/2019-01/population-2014.xls similarity index 100% rename from mongiris/data/insee/population-2014.xls rename to mongiris/data/insee/2019-01/population-2014.xls diff --git a/mongiris/data/insee/revenus-declares-2014.xls b/mongiris/data/insee/2019-01/revenus-declares-2014.xls similarity index 100% rename from mongiris/data/insee/revenus-declares-2014.xls rename to mongiris/data/insee/2019-01/revenus-declares-2014.xls diff --git a/mongiris/data/insee/services-2016.xls b/mongiris/data/insee/2019-01/services-2016.xls similarity index 100% rename from mongiris/data/insee/services-2016.xls rename to mongiris/data/insee/2019-01/services-2016.xls diff --git a/mongiris/data/insee/sport-loisirs-2016.csv b/mongiris/data/insee/2019-01/sport-loisirs-2016.csv similarity index 100% rename from mongiris/data/insee/sport-loisirs-2016.csv rename to mongiris/data/insee/2019-01/sport-loisirs-2016.csv diff --git a/mongiris/data/insee/sport-loisirs-2016.xls b/mongiris/data/insee/2019-01/sport-loisirs-2016.xls similarity index 100% rename from mongiris/data/insee/sport-loisirs-2016.xls rename to mongiris/data/insee/2019-01/sport-loisirs-2016.xls diff --git a/mongiris/data/insee/tourisme-transports-2016.xls b/mongiris/data/insee/2019-01/tourisme-transports-2016.xls similarity index 100% rename from mongiris/data/insee/tourisme-transports-2016.xls rename to mongiris/data/insee/2019-01/tourisme-transports-2016.xls diff --git a/mongiris/integrator.py b/mongiris/integrator.py index 06a29e951c94e5348f54b500414f2a7657255b40..09abc06c624e288516a5684c65abfac099adfeb8 100644 --- a/mongiris/integrator.py +++ b/mongiris/integrator.py @@ -6,182 +6,104 @@ # ============================================================================= import os -from . import config -import json_utils -import xls_utils -import mongiris import logging -#from vizliris import regroupement_indicateurs +from mongiris import config +from mongiris import xls_utils +from mongiris import main -#TODO delete this but check for index build - an index function should be created in mongiris -''' -def convert_geojson_files_to_mongo(self, ): - """ - This method should not be used (already run once). Implemented for inserting geojson files into MongoDB. - :return: the number of iris inserted in the collection - """ - import os - from os import path - # print(os.getcwd()) # check working directory, and set it to Hil-quartiers in project settings - path_hil = path.join('..', 'HiL-recommender') - web_dir = path.join(path_hil, 'static', 'data') # from HiL-recommender.config - geojson_integrated_output_departement_dir = path.join(web_dir, 'iris_by_departments') # from HiL-recommender.config - self.logger.info("#documents in collection " + str(self.count_documents(self.iris_collection, {}))) - self.iris_collection.delete_many({}) # empty collection - self.logger.info("#documents in collection " + str(self.count_documents(self.iris_collection, {}))) - for file in os.listdir(geojson_integrated_output_departement_dir): - if file.endswith('.geojson'): # read each geojson (department) file and insert each iris in MongoDB - geojson = self._parse_json_to_dict(path.join(geojson_integrated_output_departement_dir, file)) - try: - # all_iris = geojson['features'] # get a list of iris (geojson dict) - # result = iris_collection.insert_many(all_iris) # pb of 592730103 iris, and many 59 iris not inserted - for iris in geojson['features']: # inserting each iris one by one - if iris["properties"]["CODE_IRIS"] != "592730103": # pb of 592730103 iris in Gravelines - result = self.iris_collection.insert_one(iris) # result.inserted_id - logging.info("Documents inserted for file " + file) - except Exception as e: - self.logger.error('Error with MongoDB connection: ' + str(e)) - nb_docs = self.count_documents(self.iris_collection, {}) - self.logger.info("#documents in collection " + str(nb_docs)) - self.logger.info("Creating index on 'geometry' using " + pymongo.GEOSPHERE) - self.iris_collection.create_index([("geometry", pymongo.GEOSPHERE)]) - self.logger.info("Index created") - assert (nb_docs == 49403), 'Error: expecting 49403 IRIS to be stored in MongoDB, but stored %i' % nb_docs - return nb_docs -''' - - -def build_dictionary_indicators(dict_indicators, new_short_fieldnames, new_long_fieldnames, from_insee_file): - """ - Adds new indicators (code, label and source file) in the dictionary dict_variables - :param dict_indicators: a dict containing information about relevant indicators - :param new_short_fieldnames: indicators codes to be added - :param new_long_fieldnames: indicators full labels to be added - :param from_insee_file: filepath of the INSEE file in which new indicators are extracted - :return: res_dict_indicators: an updated version of dict_variables - {ind1: {label: indicator1, insee_files=[file1, file2], ...}, ind2: {...}, ...} - """ - res_dict_indicators = dict(dict_indicators) - for i in range(0, len(new_short_fieldnames)): - shortname = new_short_fieldnames[i] - if shortname not in res_dict_indicators: - res_dict_indicators[shortname] = dict() - # res_dict_indicators[shortname]["short_fieldname"] = shortname - res_dict_indicators[shortname][config.geojson_longname_label] = new_long_fieldnames[i] - if config.geojson_insee_files_label not in res_dict_indicators[shortname]: - res_dict_indicators[shortname][config.geojson_insee_files_label] = list() - if from_insee_file not in res_dict_indicators[shortname][config.geojson_insee_files_label]: - res_dict_indicators[shortname][config.geojson_insee_files_label].append(from_insee_file) - return res_dict_indicators - - -def integrate_xls_file(iris_dict, indicators_xls): - """ - Integrate IRIS data with indicators (about IRIS) and produce a dict of IRIS. - Careful : indicators are not available for all IRIS, and some indicators concern IRIS #69029ZZZZ (ZZZZ meaning the - neighbourhood, not the IRIS). - :param iris_dict: a dictionary with IRIS data (geojson format) - :param indicators_xls: a csv file path containing INSEE indicators - :return: short_fieldnames: a list containing field ID (or abbreviated field names) - :return: long_fieldnames: a list containing the complete field names - :return: res_iris_dict: a geojson merged dict that integrates both IRIS data and INSEE indicators - """ - short_fieldnames, long_fieldnames, indicators = xls_utils.parse_xls_to_dict(indicators_xls) - ''' # only store relevant indicators (those in config.indicators_ids) - sf = list(short_fieldnames) # need to create a temp list - for field in sf: - if field not in config.indicators_ids: # todo : delete if we store all indicators - index_field = short_fieldnames.index(field) - del short_fieldnames[index_field] - del long_fieldnames[index_field] - for key in indicators.keys(): - del indicators[key][field] - ''' - res_iris_dict = dict(iris_dict) - for key, prop_values in indicators.items(): - for feature in res_iris_dict["features"]: - if key == feature["properties"]["CODE_IRIS"]: # indicator record concerns an iris - if config.geojson_indicators_label not in feature["properties"]: - feature["properties"][config.geojson_indicators_label] = dict() - for prop, value in prop_values.items(): - if prop not in feature["properties"] and prop in config.metadata_about_iris: # adding a metadata - feature["properties"][prop] = value - if prop not in feature["properties"][config.geojson_indicators_label] and prop not in feature["properties"]: - feature["properties"][config.geojson_indicators_label][prop] = value - break - return short_fieldnames, long_fieldnames, res_iris_dict - - -def build_store_index(input_iris_dict, index_output_filepath): - """ - Create an index for iris and store it in a JSON file.s - :param input_iris_dict: a dictionary with IRIS data (geojson format) - :param index_output_filepath: a filepath to the json file in which the index is stored - :return: - """ - index = dict() - nb_iris = len(input_iris_dict["features"]) - for i in range(0, nb_iris): - code_iris = input_iris_dict["features"][i]["properties"]["CODE_IRIS"] - index[code_iris] = i - json_utils.save_dict_to_json(index_output_filepath, index) # store index +def get_all_xlsx_files(): + # generate a list of INSEE xlsx files to be integrated + insee_files = list() + for file in os.listdir(config.insee_dir): + filepath = os.path.join(config.insee_dir, file) + if os.path.isfile(filepath) and filepath.endswith(".xls"): + insee_files.append(filepath) + return insee_files -def integrate_from_to(input_iris_dict, dict_indicators, iris_indicators_output_file): +def integrate_xls_file(xls_file): """ - Main integration program, integrates XLS data from a geojson dict and store the result as a geojson with indicators. - All iris are (possibly) enriched with raw and grouped indicators. - :param input_iris_dict: a dictionary with IRIS data (geojson format) - :param dict_indicators: a dictionary with information about indicators (shortname, longname, etc.) - :param iris_indicators_output_file: output filename for integrated geojson file - :return: nothing :( + Integrate data from the xsl file to update an IRIS and its indicators. + :param xls_file: a csv file path containing INSEE indicators + :return: nothing ;) """ + indicator_metadata, indicators, source_metadata = xls_utils.parse_xls_to_dict(xls_file) - config.logger.info("Integrating raw indicators") - for f in config.indicators_files: # integrate each xlsx INSEE file (both indicators in IRIS and indicators dict) - config.logger.info("Integration of xls INSEE file: " + f) - short_fields, long_fields, input_iris_dict = integrate_xls_file(input_iris_dict, f) - dict_indicators = build_dictionary_indicators(dict_indicators, short_fields, long_fields, f) + # update iris (metadata about the iris and values of its indicators) + ''' + for code_iris, prop_values in indicators.items(): + doc = connexion.get_iris_from_code(code_iris) + if doc is None: + print(f"Oops, doc was not found for iris {code_iris}") + # TODO should a new doc be added? + else: # update the doc + doc_id = doc["_id"] + query_clause = {"_id": doc_id} + dict_updates = {} + for prop, value in prop_values.items(): + if prop in config.labels_dictionary_init: # metadata, not an indicator + dict_updates["properties." + prop] = value + if prop not in config.labels_dictionary_init: # indicator + dict_updates["properties." + config.geojson_raw_indicators_label + "." + prop] = value + update_clause = {"$set": dict_updates} + #print(update_clause) + #connexion.update_one_document(connexion.collection_iris, query_clause, update_clause) + ''' - # all xlsx files have been integrated, computing grouped indicators - config.logger.info("Computing grouped indicators") - dict_grouping_indicators = json_utils.parse_json_to_dict(config.grouping_indicators_file) # how to group raw indicators - for iris in input_iris_dict["features"]: - if config.geojson_indicators_label in iris["properties"]: - grouped_indicators = regroupement_indicateurs.compute_grouped_indicators(iris["properties"] - [config.geojson_indicators_label], dict_grouping_indicators) - iris["properties"][config.geojson_grouped_indicators_label] = grouped_indicators + # add the source metadata + doc = connexion.find_one_document(connexion.collection_sources, {"filename": xls_file}) + if doc is None: + connexion.insert_one_document(connexion.collection_sources, source_metadata) - config.logger.info("Storing output file: " + iris_indicators_output_file) - json_utils.save_dict_to_json(iris_indicators_output_file, input_iris_dict) # store enriched IRIS - # the dictionary has the same name, except it ends with "-dictionnaire.json" instead of "geojson" - dict_output_file = os.path.splitext(iris_indicators_output_file)[0] + "-dictionnaire.json" - config.logger.info("Storing output file: " + dict_output_file) - json_utils.save_dict_to_json(dict_output_file, dict_indicators) # store dict indicators - index_output_file = os.path.splitext(iris_indicators_output_file)[0] + "-index.json" - config.logger.info("Building and storing output file: " + index_output_file) - build_store_index(input_iris_dict, index_output_file) # create and store an index file (code_iris to iris) + # add the indicators labels + for ind in indicator_metadata: + short_name = ind[config.geojson_shortname_label] + doc = connexion.find_one_document(connexion.collection_indic, {config.geojson_shortname_label: short_name}) + if doc is not None: # only update field from_insee_files, $addToSet does not add duplicate values + connexion.update_one_document(connexion.collection_indic, {config.geojson_shortname_label: short_name}, + {"$addToSet": {config.geojson_from_files_label: xls_file}}) + else: # add the document + connexion.insert_one_document(connexion.collection_indic, ind) + return True +######################### +# main integration script +######################### +if __name__ == '__main__': + logging.basicConfig(format='[%(levelname)s] - %(name)s - %(asctime)s : %(message)s') + logger = logging.getLogger() + logger.setLevel(logging.INFO) -####################### -# starting integration -####################### + connexion = main.Mongiris() -logging.basicConfig(format='[%(levelname)s] - %(name)s - %(asctime)s : %(message)s') -logger = logging.getLogger() -logger.setLevel(logging.INFO) + logger.info("Searching xlsx files...") + insee_files = get_all_xlsx_files() # get the list of all xlsx files to be integrated + insee_files = [os.path.join(config.insee_dir, 'population-2014.xls'), os.path.join(config.insee_dir, 'logement-2014.xls')] + logger.info(f'Found {len(insee_files)} xlsx files to be integrated.') -connexion_db = mongiris.Mongiris() -logger.info("Reading dictionary files") -dict_indicators = dict(config.indicators_dictionary_init) # the initial dictionary (labels of main indicators) -logger.info("Reading IRIS geojson file") + logger.info("Initializing dictionary files") + dict_labels = dict(config.labels_dictionary_init) # the initial dictionary (labels of main indicators) + logger.info("Integrating sources files (metadata for source and indicators, data for iris)") + for file in insee_files: # integrate each xlsx INSEE file (both indicators in IRIS and indicators dict) + logger.info(f"\t- INSEE xlsx file: {file}") + integrate_xls_file(file) + + # TODO check why update is soooo long + # TODO update the grouped indicators of each document/iris + ''' + config.logger.info("Computing grouped indicators") + dict_grouping_indicators = json_utils.parse_json_to_dict(config.grouping_indicators_file) # how to group raw indicators + for iris in input_iris_dict["features"]: + if config.geojson_indicators_label in iris["properties"]: + grouped_indicators = regroupement_indicateurs.compute_grouped_indicators(iris["properties"] + [config.geojson_indicators_label], + dict_grouping_indicators) + iris["properties"][config.geojson_grouped_indicators_label] = grouped_indicators + ''' -####################### -# end integration -####################### -logger.info("Done !") + logger.info("Done !") diff --git a/mongiris/mongiris.py b/mongiris/main.py similarity index 85% rename from mongiris/mongiris.py rename to mongiris/main.py index 62a77f62edd1e870dc3401fdb8f53fc525d2e76b..adb2615ae92b4de34ed76a003621f65e602a18df 100755 --- a/mongiris/mongiris.py +++ b/mongiris/main.py @@ -3,6 +3,7 @@ # ============================================================================= # Abstraction layer for the MongoDB database # Performs operations such as find, update, convert_geojson_files, intersect, etc. +# Some methods are not static because they require a valid DB connection (performed in __init__) # ============================================================================= # Path to MongoDB tools (under MacOS): /Applications/MongoDB.app/Contents/Resources/Vendor/mongodb/bin/ # Export et import d'une collection MongoDB (plus rapide, inclut index et métadonnes mais binaire, option --gzip) @@ -20,7 +21,7 @@ import pymongo from bson import json_util # used to convert BSON to JSON (especially ObjectId type of "_id") import json import logging -from . import config +from mongiris import config class Mongiris: @@ -32,7 +33,8 @@ class Mongiris: self.connection = self.init_connection() # default MongoDB connection on 'localhost', 27017 self.database = self.connection[config.database_iris] # database for HiL project self.collection_iris = self.database[config.collection_iris] - self.collection_meta = self.database[config.collection_meta] + self.collection_indic = self.database[config.collection_indic] + self.collection_sources = self.database[config.collection_sources] @staticmethod def bson_to_json(doc_bson): @@ -53,16 +55,24 @@ class Mongiris: self.logger.error('Could not connect to the MongoDB database ! Have you launched MongoDB ? ' + str(e)) return connection - def _parse_json_to_dict(self, json_file_path): + @staticmethod + def _parse_json_to_dict(json_file_path): with open(json_file_path) as data_file: data = json.load(data_file) data_file.close() return data - def _save_dict_to_json(self, json_file_path, dict_geo): + @staticmethod + def _save_dict_to_json(json_file_path, dict_geo): with open(json_file_path, 'w') as data_file: json.dump(dict_geo, data_file) + def create_index(self, iris_collection): + # this method is used in case of restoration/import + self.logger.info("Creating index on 'geometry' using " + pymongo.GEOSPHERE) + iris_collection.create_index([("geometry", pymongo.GEOSPHERE)]) + self.logger.info("Index created") + def count_documents(self, collection, json_query): """ Counts the number of documents that satisfy json_query in the given collection @@ -105,6 +115,36 @@ class Mongiris: doc_json = Mongiris.bson_to_json(random_iris) return doc_json + def update_one_document(self, collection, json_query, json_updates): + """ + Updates the first document found by json_query by setting new values from json_updates + :param collection: the collection to update into + :param json_query: the query criteria + :param json_updates: a json document containing values to be updates (using $set operator) + :return: json_result: an UpdateResult json document containing information about the update + """ + json_result = collection.update_one(json_query, json_updates) + return json_result + + def insert_one_document(self, collection, doc): + """ + Insert a new document in the collection + :param collection: the collection to add in + :param doc: the document to be added + :return: json_result: an InsertOneResult json document containing information about the insertion + """ + json_result = collection.insert_one(doc) + return json_result # eg, the new _id is in json_result.inserted_id + + def delete_all(self, collection): + """ + Delete all document in the collection. Careful + :param collection: the collection to empty + :return: + """ + collection.delete_many({}) # empty collection + return True + def geo_within(self, collection, geometry, json_projection=None): """ Find all documents from given collection and which contain totally the given geometry diff --git a/mongiris/tests/__init__.py b/mongiris/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mongiris/tests/mongoiris_tests.py b/mongiris/tests/mongiris_tests.py similarity index 98% rename from mongiris/tests/mongoiris_tests.py rename to mongiris/tests/mongiris_tests.py index 8a877e30a58bd60908a5a8b2a9660f382dafad94..74f609599fa8dbd40b06dd900839d5989e6c4529 100644 --- a/mongiris/tests/mongoiris_tests.py +++ b/mongiris/tests/mongiris_tests.py @@ -4,15 +4,16 @@ # Unit tests for mongiris # ============================================================================= -import mongiris +from mongiris.main import Mongiris import unittest import random import re + class TestCase(unittest.TestCase): def setUp(self): - self.db = mongiris.Mongiris() + self.db = Mongiris() def test_count(self): count = self.db.count_documents(self.db.collection_iris, {}) diff --git a/mongiris/xls_utils.py b/mongiris/xls_utils.py index 59c6c66ddd1e335a13edfa83e70fdb825c40f519..87450f8c4e95950c40d7664a75e170ebdf18bd54 100755 --- a/mongiris/xls_utils.py +++ b/mongiris/xls_utils.py @@ -1,15 +1,14 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# encoding: utf-8 # ============================================================================= # XLS utilities, mainly for INSEE indicators (parsing Excel files, etc.) and mobilipass data from HiL # https://pypi.python.org/pypi/openpyxl/ # https://xlrd.readthedocs.io/en/latest/api.html # ============================================================================= -from vizliris import config -import unittest import sys -import xlrd # for old Excel files (.xls) and new files (.xlsx Excel 2010) +from mongiris import config +import xlrd # for old Excel files (.xls) and new files (.xlsx Excel 2010) #import openpyxl # for new Excel files (.xlsx Excel 2010) @@ -17,25 +16,45 @@ def parse_xls_to_dict(xls_file_path): """ Parse an XLS file (excel/calc) produced by INSEE and containing indicators about IRIS. :param xls_file_path: the path to the XLS file to be parsed - :return: short_fieldnames: a list containing field ID (or abbreviated field names) - :return: long_fieldnames: a list containing the complete field names - :return: indicators: a dictionary such as {id_iris1: {ind1: val1, ind2: val2, ...}, id_iris2: {ind1: val1, ind2: val2, ...}, ...} + :return: indicator_metadata: a list of dict containing information about indicator labels (shortname, fullname) + :return: indicators: a dictionary such as {id_iris1: {ind1: val1, ind2: val2, ...}, id_iris2: {ind1: val1, ...}, ...} + :return: source_metadata: a dictionary containing metadata information about the document (title, filepath, etc.) """ - logger = config.logging.getLogger(__name__) indicators = dict() + source_metadata = dict() # metadata about the document (name, date mise en ligne, infoGeo, etc.) + indicator_metadata = list() # list of dict, each containing metadata about an indicator (short and full labels) try: wb = xlrd.open_workbook(xls_file_path, ragged_rows=True) # ragged_rows to True to avoid empty cells at the end of rows) except Exception as e: - sys.exit('Error while parsing XLS file {}: {}'.format(xls_file_path, e)) - #sheet = wb.sheet_by_name("IRIS") # data is stored in the sheet "IRIS", else wb.sheet_names() + sys.exit(f'Error while parsing xlsx file {xls_file_path}: {e}') + # sheet = wb.sheet_by_name("IRIS") # data is stored in the sheet "IRIS", else wb.sheet_names() sheet = wb.sheet_by_index(0) # sheet are sometimes called IRIS, also IRIS_DEC + + # extracting source metadata + source_metadata['filepath'] = xls_file_path # filepath of the document + source_metadata['title'] = sheet.cell_value(0, 0) # title of the document + source_metadata['infoGeo'] = sheet.cell_value(1, 0) # geographic information (area + level of granularity) + cell_dates = sheet.cell_value(2, 0) + if cell_dates.startswith("Mise en ligne le "): + cell_dates = cell_dates[17:27] + source_metadata['datePublication'] = cell_dates # date of online publication + + # extracting labels/fieldnames long_fieldnames = sheet.row_values(4) # row 4 contains the long labels short_fieldnames = sheet.row_values(5) # row 5 contains the short labels (usually not meaningful) + for i in range(0, len(short_fieldnames)): + shortname = short_fieldnames[i] + longname = long_fieldnames[i] + ind_dict = {config.geojson_shortname_label: shortname, config.geojson_longname_label: longname, + config.geojson_from_files_label: [xls_file_path]} + indicator_metadata.append(ind_dict) + + # extracting indicators values nb_fields = len(short_fieldnames) for i in range(6, sheet.nrows, 1): iris_id = sheet.cell_value(i, 0) # IRIS id is in the first column - #print(sheet.row_values(i)) + # print(sheet.row_values(i)) if sheet.row_len(i) == nb_fields: # some rows may not include all fields if iris_id not in indicators: indicators[iris_id] = dict() @@ -44,12 +63,11 @@ def parse_xls_to_dict(xls_file_path): val = sheet.cell_value(i, j) indicators[iris_id][field] = val else: - logger.warning("Ignored row (missing fields) : " + str(sheet.row_values(i))) - return short_fieldnames, long_fieldnames, indicators + print("\tIgnored row (missing fields) : " + str(sheet.row_values(i))) + return indicator_metadata, indicators, source_metadata -def parse_data_HiL_to_dict(xls_file_path): - logger = config.logging.getLogger(__name__) +def parse_data_mobilipass_to_dict(xls_file_path): data = dict() try: wb = xlrd.open_workbook(xls_file_path, @@ -69,31 +87,6 @@ def parse_data_HiL_to_dict(xls_file_path): val = sheet.cell_value(i, j) data[person_id][field] = val else: - logger.warning("Ignored row: " + str(sheet.row_values(i))) + print("Ignored row: " + str(sheet.row_values(i))) return long_fieldnames, data - -class TestCase(unittest.TestCase): - - def atest_parse_xls_to_dict(self): - f = config.indicators_files[0] # careful, no guarantee that the first file is always the same (new or deleted ones) - _, _, records = parse_xls_to_dict(f) - assert (len(records) == 14089), 'Error: expecting 14089 IRIS, extracted %i' % len(records) - - def atest_parse_all_xls_to_dict(self): - for f in config.indicators_files: - print("Parsing: " + f) - parse_xls_to_dict(f) - print("Done !") - - def test_parse_HiL_mobilipass(self): - f = config.xls_data_HiL_file_path - l, d = parse_data_HiL_to_dict(f) - print(l) - #print(d) - - -if __name__ == "__main__": - unittest.main(verbosity=2) # run all tests with verbose mode - - diff --git a/setup.cfg b/setup.cfg index f28c68b3c4c918bf1daf48e367d65c3a6b3243c8..00bc165c3bf84d5f46d48790a09e91972dc14af9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ [metadata] name = mongiris -version = 0.21 +version = 0.3 description = This package is an interface for querying INSEE IRIS stored as documents in MongoDB. Requires loading the IRIS files into MongoDB prior to using this package. author = Fabien Duchateau author_email = fabien.duchateau@univ-lyon1.fr @@ -15,6 +15,7 @@ include_package_data = True zip_safe = False install_requires = pymongo >= 3.7.2 + xlrd >= 1.2.0 [options.packages.find] exclude =