#!/usr/bin/env python
# encoding: utf-8
# =============================================================================
# Abstraction layer for the MongoDB database.
# Performs operations such as find, update, convert_geojson_files, intersect, etc.
# Some methods are not static because they require a valid DB connection
# (established in __init__).
# =============================================================================

import pymongo
from bson import json_util  # converts BSON to JSON (notably the ObjectId type of "_id")
import json
import logging
from mongiris import config


class Mongiris:
    """API to manipulate data from the MongoDB `dbinsee` database.

    Several methods accept a 'collection' parameter for flexibility (i.e., to be
    able to query collections other than the iris collection).
    Most methods convert resulting documents from the BSON format (MongoDB) to
    JSON; this mainly avoids the issue of the ObjectId type (MongoDB '_id' field).

    The constructor initializes the logger and automatically connects to the
    database. The names of the database and of the three collections are the
    default ones (from the dump); if they are changed in MongoDB, they should
    also be changed in the `config.py` file.

    Examples of usage, including geospatial queries, are available in
    `tests/api_tests.py`. An example of IRIS following the GeoJSON format is
    provided in `data/example-iris.json`.

    Additional resources:

    - [MongoDB documentation](http://www.mongodb.org/)

    - [GeoJSON format specifications](https://geojson.org/)

    - [pymongo API](http://api.mongodb.com/python/current/) between Python and MongoDB

    """

    def __init__(self):
        """Connects to MongoDB (default localhost:27017) and binds the three collections."""
        logging.basicConfig(format='[%(levelname)s] - %(name)s - %(asctime)s : %(message)s')
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.INFO)
        self.connection, self.connection_status = self.init_connection()  # default connection on 'localhost', 27017
        self.database = self.connection[config.database_iris]  # database for HiL project
        self.collection_iris = self.database[config.collection_iris]
        self.collection_indic = self.database[config.collection_indic]
        self.collection_sources = self.database[config.collection_sources]

    @staticmethod
    def bson_to_json(doc_bson):
        """
        Converts BSON data to valid JSON (the ObjectId type becomes {"$oid": <string>}).

        Args:
            doc_bson: the BSON data to be converted (a document, a list, or a cursor)

        Returns:
            doc_json: a JSON-compatible object (dict, list, or None)
        """
        return json.loads(json_util.dumps(doc_bson, json_options=json_util.RELAXED_JSON_OPTIONS))

    def init_connection(self):
        """
        Tries to connect to MongoDB. The returned connection object does not provide a
        reliable connection status; only the boolean connection_status indicates
        whether the connection is working or not.

        Returns:
            connection: a MongoClient object (or None when the client could not be built)
            connection_status: a boolean, True on success and False on failure
        """
        connection_status = True  # by default, connection is ok
        connection = None
        try:
            # serverSelectionTimeoutMS is expressed in milliseconds (see config.max_timeout)
            connection = pymongo.MongoClient(serverSelectionTimeoutMS=config.max_timeout)  # default localhost:27017
            connection.server_info()  # forces a query to MongoDB (for checking the connection)
        except pymongo.errors.ServerSelectionTimeoutError as e:
            self.logger.error('Could not connect to the MongoDB database ! Have you launched MongoDB ? ' + str(e))
            connection_status = False
        return connection, connection_status

    @staticmethod
    def _parse_json_to_dict(json_file_path):
        """Converts a JSON file denoted by json_file_path into a Python dictionary."""
        # the with-statement closes the file; the explicit close() of the original was redundant
        with open(json_file_path) as data_file:
            return json.load(data_file)

    @staticmethod
    def _save_dict_to_json(json_file_path, dict_geo):
        """Converts and saves a Python dictionary into a JSON file denoted by json_file_path."""
        with open(json_file_path, 'w') as data_file:
            json.dump(dict_geo, data_file)

    def create_index(self, iris_collection):
        """Rebuilds a geospatial index on the iris collection. Only used in case of restoration/import."""
        self.logger.info("Creating index on 'geometry' using " + pymongo.GEOSPHERE)
        iris_collection.create_index([("geometry", pymongo.GEOSPHERE)])
        self.logger.info("Index created")

    def count_documents(self, collection, json_query):
        """
        Counts the number of documents that satisfy json_query in the given collection.

        Args:
            collection: an instance of Collection
            json_query: a dict containing the query criteria (https://docs.mongodb.com/manual/tutorial/query-documents/)

        Returns:
            count: an integer representing the number of documents
        """
        return collection.count_documents(json_query)

    def find_one_document(self, collection, json_query):
        """
        Finds the first document in the given collection that satisfies json_query.

        Args:
            collection: an instance of Collection
            json_query: a dict containing the query criteria (https://docs.mongodb.com/manual/tutorial/query-documents/)

        Returns:
            doc_json: a dictionary representing an iris, or None
        """
        doc = collection.find_one(json_query)
        return Mongiris.bson_to_json(doc)

    def get_iris_from_code(self, code_iris):
        """
        Returns the iris identified by the given code_iris.

        Args:
            code_iris: a string containing the code of the searched iris

        Returns:
            iris: a dictionary representing an iris, or None
        """
        return self.find_one_document(self.collection_iris, {"properties.CODE_IRIS": code_iris})

    def find_documents(self, collection, json_query, json_projection=None):
        """
        Finds all documents in the given collection that satisfy json_query.

        Args:
            collection: an instance of Collection
            json_query: a dict containing the query criteria (https://docs.mongodb.com/manual/tutorial/query-documents/)
            json_projection: a json document indicating the fields that appear in the results

        Returns:
            doc_json: a list of json documents (the materialized cursor)
        """
        cursor = collection.find(json_query, json_projection)
        return Mongiris.bson_to_json(cursor)

    def find_all(self, collection):
        """
        Finds all the elements in the given collection.

        Args:
            collection: an instance of Collection

        Returns:
            all_elements: a raw pymongo cursor over all documents (NOT converted to JSON,
                unlike most other methods; callers iterate it lazily)
        """
        return collection.find()

    def get_random_document(self, collection):
        """Returns a random document from the given collection."""
        random_iris = collection.aggregate([{"$sample": {"size": 1}}]).next()
        return Mongiris.bson_to_json(random_iris)

    def update_one_document(self, collection, json_query, json_updates):
        """
        Updates the first document satisfying json_query by setting new values from json_updates.

        Args:
            collection: an instance of Collection
            json_query: a dict containing the query criteria (https://docs.mongodb.com/manual/tutorial/query-documents/)
            json_updates: a json document containing values to be updated (using $set operator)

        Returns:
            json_result: an UpdateResult object containing information about the operation
        """
        return collection.update_one(json_query, json_updates)

    def replace_one_document(self, collection, json_query, json_replace_doc, upsert=False):
        """
        Replaces the first document satisfying json_query by the document json_replace_doc
        (their _id are identical).

        Args:
            collection: an instance of Collection
            json_query: a dict containing the query criteria (https://docs.mongodb.com/manual/tutorial/query-documents/)
            json_replace_doc: the replacement doc (if _id set, should be the same _id as the doc matching json_query)
            upsert: a boolean, whether json_replace_doc should be inserted if no document matches json_query

        Returns:
            json_result: an UpdateResult object containing information about the operation
        """
        return collection.replace_one(json_query, json_replace_doc, upsert)

    def insert_one_document(self, collection, doc):
        """
        Inserts a new document in the collection.

        Args:
            collection: an instance of Collection
            doc: a dict representing the document to be added

        Returns:
            json_result: an InsertOneResult object (e.g., the new _id is in json_result.inserted_id)
        """
        return collection.insert_one(doc)

    def delete_all(self, collection):
        """
        Deletes all documents in the collection.

        Args:
            collection: an instance of Collection

        Returns:
            json_result: a DeleteResult object containing information about the operation
        """
        return collection.delete_many({})  # empty collection

    def geo_within(self, collection, geometry, json_projection=None):
        """
        Finds all documents from the given collection which are totally contained in the
        given geometry. Cannot be used to find the IRIS containing a point (geometry must
        be a polygon).

        Args:
            collection: an instance of Collection
            geometry: a geojson geometry ("Polygon" or "MultiPolygon", NO "Point")
            json_projection: a json document indicating the fields that appear in the results

        Returns:
            doc_json: a list of json documents
        """
        # find_documents already converts to JSON; no second bson_to_json pass needed
        return self.find_documents(collection, {"geometry": {"$geoWithin": {"$geometry": geometry}}},
                                   json_projection)

    def geo_within_sphere(self, collection, sphere, json_projection=None):
        """
        Finds all documents from the given collection which are totally contained in the
        given sphere.

        Args:
            collection: an instance of Collection
            sphere: a geojson geometry defined by a center and a radius in radians
            json_projection: a json document indicating the fields that appear in the results

        Returns:
            doc_json: a list of json documents
        """
        return self.find_documents(collection, {"geometry": {"$geoWithin": sphere}}, json_projection)

    def intersect(self, collection, geometry, json_projection=None):
        """
        Finds all documents from the given collection which intersect the given geometry.

        Args:
            collection: an instance of Collection
            geometry: a geojson geometry
            json_projection: a json document indicating the fields that appear in the results

        Returns:
            doc_json: a list of json documents
        """
        return self.find_documents(collection, {"geometry": {"$geoIntersects": {"$geometry": geometry}}},
                                   json_projection)

    @staticmethod
    def get_geojson_point(coordinates):
        """
        Builds a dictionary with GeoJSON syntax for a point using the given coordinates.

        Args:
            coordinates: the coordinates (long, lat) as a list, e.g. [4.8, 45.7]

        Returns:
            point: a dictionary with GeoJSON syntax for a Point
        """
        return {"type": "Point", "coordinates": coordinates}

    @staticmethod
    def convert_geojson_box_to_polygon(lng1, lat1, lng2, lat2):
        """
        Builds a dictionary with GeoJSON syntax for a polygon using two coordinates
        (points south-west and north-east). This method builds the polygon by adding the
        2 missing points (north-west and south-east) and repeats the starting point
        (south-west) to close the loop.
        The MongoDB $box operator is not supported with 2d-spherical indexes.

        Args:
            lng1: longitude of the first point (south-west) of the box
            lat1: latitude of the first point (south-west) of the box
            lng2: longitude of the second point (north-east) of the box
            lat2: latitude of the second point (north-east) of the box

        Returns:
            box: a dictionary with GeoJSON syntax for the box as a Polygon
        """
        coordinates = [[[lng1, lat1], [lng1, lat2], [lng2, lat2], [lng2, lat1], [lng1, lat1]]]
        return Mongiris.get_geojson_polygon(coordinates)

    @staticmethod
    def get_geojson_polygon(coordinates):
        """
        Builds a dictionary with GeoJSON syntax for a polygon using the given coordinates.
        Careful: polygons must be closed ! (first coordinate must be identical to last coordinate)

        Args:
            coordinates: the coordinates (long, lat) as a list of list, e.g. [[[4.8, 45.7], [4.9, 47.8]]]

        Returns:
            polygon: a dictionary with GeoJSON syntax for a Polygon
        """
        return {"type": "Polygon", "coordinates": coordinates}

    def point_in_which_iris(self, coordinates, json_projection=None):
        """
        Finds the document (IRIS) containing the given coordinates. Uses near() since
        geo_within() requires a Polygon.
        Careful: the near() operator may return several iris (low probability since
        distance = 1 meter) and only the first one is returned.

        Args:
            coordinates: an array of coordinates (long, lat)
            json_projection: a json document indicating the fields that appear in the results

        Returns:
            doc_json: a json document or None
        """
        results = self.near(self.collection_iris, coordinates, json_projection, 1)  # distance = 1 meter
        if not results:
            return None
        return results[0]

    def near(self, collection, coordinates, json_projection=None, distance_max=2000):
        """
        Finds all documents from the given collection which are near the given point
        (according to distance_max).

        Args:
            collection: an instance of Collection
            coordinates: an array of coordinates (long, lat) - $near only accepts a Point
            json_projection: a json document indicating the fields that appear in the results
            distance_max: the maximum distance of resulting iris, in meters

        Returns:
            doc_json: a list of json documents
        """
        geometry = Mongiris.get_geojson_point(coordinates)
        return self.find_documents(collection, {"geometry": {"$near": {
            "$geometry": geometry, "$maxDistance": distance_max}}}, json_projection)

    @staticmethod
    def adjacent(collection, geometry, json_projection=None, distance=20, exclude_geometry_iris=None):
        """
        Finds all adjacent neighbors of an iris represented by geometry.
        There is no adjacency operator in MongoDB, so each coordinate of the iris outline
        is probed with $near (according to distance). Could be done directly with near(),
        but near() on a single point is less accurate and thus incomplete.

        Args:
            collection: an instance of Collection
            geometry: a geojson geometry (Polygon; the first ring of "coordinates" is used)
            json_projection: a json document indicating the fields that appear in the results
            distance: the maximum distance for an adjacent neighbour, in meters (10 to 50 meters are fine)
            exclude_geometry_iris: the document _id of the iris represented by geometry, if it needs to be excluded

        Returns:
            doc_json: a list of json documents
        """
        results = list()
        results_ids = list()
        if exclude_geometry_iris is not None:  # to exclude the iris represented by geometry, add it to the results ids
            results_ids.append(exclude_geometry_iris)
        for coords in geometry["coordinates"][0]:
            geometry_coords = Mongiris.get_geojson_point(coords)
            cursor = collection.find({"geometry": {"$near": {"$geometry": geometry_coords,
                                                             "$maxDistance": distance}}}, json_projection)
            for doc in cursor:
                doc_id = doc["_id"]
                if doc_id not in results_ids:  # add the new adjacent iris if not already in the results
                    results.append(doc)
                    results_ids.append(doc_id)
        return Mongiris.bson_to_json(results)


if __name__ == "__main__":
    print("Run unit tests for testing the Mongiris class.")
#!/usr/bin/env python
# encoding: utf-8
# =============================================================================
# Configuration file
# =============================================================================

from os import path

# MongoDB parameters
database_iris = "dbinsee"  # name of the database
collection_iris = "colliris"  # collection containing the ~50,000 IRIS and values for indicators
collection_indic = "collindic"  # collection containing metadata information about indicators (short label, long label)
collection_sources = "collsources"  # collection containing metadata information about sources (filepath, title, etc.)
# Delay before connection timeout. This value is passed to pymongo as
# serverSelectionTimeoutMS, which is expressed in MILLISECONDS: the previous
# value of 3 meant a 3 ms timeout and made real connection attempts fail.
max_timeout = 3000  # 3 seconds

# data parameters
insee_dir = path.join('data', 'insee', '2019-01')  # the date indicates the last check for updates of INSEE files

# labels for geojson iris files and for the json indicators dictionary
geojson_raw_indicators_label = 'raw_indicators'
geojson_grouped_indicators_label = 'grouped_indicators'
geojson_insee_files_label = 'insee_files'
geojson_shortname_label = 'short_label'
geojson_longname_label = 'full_label'
geojson_from_files_label = 'from_insee_files'

# the following labels are metadata about an IRIS, not indicators
labels_dictionary_init = {'CODE_IRIS': {"long_fieldname": 'Code IRIS'}, 'NOM_IRIS': {"long_fieldname": 'Nom IRIS'},
                          'INSEE_COM': {"long_fieldname": 'Code postal commune'},
                          'NOM_COM': {"long_fieldname": 'Nom commune'}, 'IRIS': {"long_fieldname": 'Code IRIS'},
                          'TYP_IRIS': {"long_fieldname": 'Type IRIS'}, 'REG': {"long_fieldname": 'Code région'},
                          'DEP': {"long_fieldname": 'Code département'}, 'TRIRIS': {"long_fieldname": 'TRIRIS'},
                          'UU2010': {"long_fieldname": 'Unité urbaine'}, 'LIBIRIS': {"long_fieldname": 'Libellé IRIS'},
                          'GRD_QUART': {"long_fieldname": 'Grand quartier'}, 'LAB_IRIS': {"long_fieldname": 'Label qualité IRIS'},
                          'MODIF_IRIS': {"long_fieldname": 'Type de modification de IRIS'},
                          }

# a list of grouped indicators
labels_grouped_indicators = ['logement-resident', 'education-superieur-prive', 'animation-culturel',
                             'education-secondaire-cycle1-public', 'education-secondaire-cycle2-professionnel-public',
                             'animation-commerce-nonalimentaire', 'education-primaire-prive', 'espacevert',
                             'service-sante', 'service-divers-public', 'education-secondaire-cycle2-professionnel-prive',
                             'education-secondaire-cycle2-general-public', 'education-creche', 'animation-divertissement',
                             'animation-commerce-alimentaire-proximite', 'csp', 'service-divers-prive',
                             'animation-commerce-alimentaire-grandesurface', 'logement-type',
                             'education-secondaire-cycle2-general-prive', 'transport-longuedistance',
                             'transport-busmetrotram', 'logement-annee', 'transport-velo', 'logement-residence',
                             'securite', 'education-secondaire-cycle1-prive', 'loisir', 'education-primaire-public',
                             'service-justice', 'service-emploi', 'education-superieur-public', 'service-actionsociale', ]
#!/usr/bin/env python
# encoding: utf-8
# =============================================================================
# Integrator: performs the integration of different data sources to produce MongoDB documents (GeoJSON format)
# containing IRIS information (~640 or ~350 INSEE indicators, depending on the IRIS).
# This file does not need to be run because the MongoDB dump is provided (unless for integrating new data sources).
# =============================================================================
# Path to MongoDB tools (under MacOS): /Applications/MongoDB.app/Contents/Resources/Vendor/mongodb/bin/
# Export and import a local database (metadata/indexes included, option dryRun for testing, several files):
#     mongodump --db dbinsee --verbose -o dump/
#     mongorestore --db dbinsee --dir dump/dbinsee --verbose --dryRun
# Export and import a local database (metadata/indexes included, option dryRun for testing, single binary file):
#     mongodump --db dbinsee --verbose --archive=dbinsee.bin
#     mongorestore --archive=dbinsee.bin --verbose
# Export and import in JSON:
#     ./mongoexport --db dbinsee --out ~/dump-iris.json
#     ./mongoimport --db dbinsee --file ~/dump-iris.json
# =============================================================================

import os
import logging
from mongiris import config
from mongiris import xls_utils
from mongiris import api

# Module-level logger so the functions below also work when this module is
# imported: previously `logger` was only defined under __main__, which raised
# a NameError when integrate() was called from another module.
logger = logging.getLogger(__name__)


def get_all_xlsx_files(dir_sources):
    """
    Returns a list of spreadsheet files to be integrated.

    Args:
        dir_sources: path to a directory containing data sources (spreadsheets from INSEE)

    Returns:
        insee_files: a list of filepaths (to .xls files)
    """
    insee_files = list()
    for file in os.listdir(dir_sources):
        # join with dir_sources (the parameter): the original joined with
        # config.insee_dir, which silently broke for any other directory
        filepath = os.path.join(dir_sources, file)
        if os.path.isfile(filepath) and filepath.endswith(".xls"):
            insee_files.append(filepath)
    return insee_files


def integrate_xls_file(connection, xls_file):
    """
    Integrates data from an xls file: the 3 collections may be updated. First, extracts information about the source
    (e.g., title, date) and inserts a document for that source in 'collsources'. Next extracts information about the
    indicators (e.g., short name, long name) and updates/inserts a document in 'collindic'. Finally for each line
    (IRIS) in the spreadsheet, updates the corresponding document with new indicators.

    Args:
        connection: an object representing the database connection (a Mongiris instance)
        xls_file: an xls file path containing INSEE indicators

    Returns:
        True (kept for symmetry with possible failure handling by callers)
    """
    indicator_metadata, indicators, source_metadata = xls_utils.parse_xls_to_dict(xls_file)

    # add the source metadata into collection_sources (only once per file)
    doc = connection.find_one_document(connection.collection_sources, {"filename": xls_file})
    if doc is None:
        connection.insert_one_document(connection.collection_sources, source_metadata)

    # add the indicators information into collection_indic
    for ind in indicator_metadata:
        short_name = ind[config.geojson_shortname_label]
        doc = connection.find_one_document(connection.collection_indic, {config.geojson_shortname_label: short_name})
        if doc is not None:  # only update field from_insee_files, $addToSet does not add duplicate values
            connection.update_one_document(connection.collection_indic, {config.geojson_shortname_label: short_name},
                                           {"$addToSet": {config.geojson_from_files_label: xls_file}})
        else:  # add the document
            connection.insert_one_document(connection.collection_indic, ind)

    # add the indicators values into collection_iris
    not_found_iris = found_iris = nb_replacements = 0
    for code_iris, indics in indicators.items():
        doc = connection.get_iris_from_code(code_iris)
        if doc is None:
            not_found_iris += 1
        else:
            doc_id = doc['properties']['CODE_IRIS']
            found_iris += 1
            need_replacement = False
            doc_changes = {"$set": dict()}  # an update document containing the new fields
            for ind, value in indics.items():
                if ind in config.labels_dictionary_init:  # main indicator, do not replace it if already there
                    if ind not in doc['properties']:  # general information (code iris, city name, etc.)
                        doc_changes["$set"]["properties." + ind] = value
                        need_replacement = True
                else:  # raw indicator
                    if ind not in doc['properties'][config.geojson_raw_indicators_label]:  # add this value
                        doc_changes["$set"]["properties." + config.geojson_raw_indicators_label + "." + ind] = value
                        need_replacement = True

            if need_replacement:  # update the stored doc with the new fields
                res = connection.update_one_document(connection.collection_iris, {'properties.CODE_IRIS': doc_id},
                                                     doc_changes)
                nb_replacements += res.modified_count

    logger.info(f"\t\t{xls_file}: {found_iris} found iris, {not_found_iris} not found iris, {nb_replacements} updates")
    return True


def integrate(connection, dir_sources):
    """
    Main integration method: parses a directory of data sources, and calls the `integrate_xls_file` function for
    each data source.

    Args:
        connection: an object representing the database connection (a Mongiris instance)
        dir_sources: path to a directory containing data sources (spreadsheets from INSEE)
    """
    logger.info("Searching xlsx files...")
    insee_files = get_all_xlsx_files(dir_sources)  # get the list of all xlsx files to be integrated
    logger.info(f"Found a total of {len(insee_files)} xlsx files to be integrated.")
    # insee_files = []  # roughly 10-20 mins execution time for one data source
    logger.info(f'Selected {len(insee_files)} xlsx files to be integrated.')

    logger.info("Integrating sources files (metadata for source and indicators, data for iris)")
    for file in insee_files:  # integrate each xlsx INSEE file (both indicators in IRIS and indicators dict)
        logger.info(f"\t- INSEE xlsx file: {file}")
        integrate_xls_file(connection, file)
    logger.info("Done !")


def check_properties(connection):
    """
    Computes and prints statistics on the IRIS collection, mainly the number of indicators per document.

    Args:
        connection: an object representing the database connection (a Mongiris instance)
    """
    docs = connection.find_documents(connection.collection_iris, {}, )
    counts = dict()
    for doc in docs:
        nb = len(doc['properties'][config.geojson_raw_indicators_label])
        if nb not in counts:
            counts[nb] = 0
        counts[nb] += 1
    print(counts)


#########################
# main integration script
#########################

if __name__ == '__main__':

    logging.basicConfig(format='[%(levelname)s] - %(name)s - %(asctime)s : %(message)s')
    logging.getLogger().setLevel(logging.INFO)

    connection = api.Mongiris()  # database connection
    integrate(connection, config.insee_dir)  # integrating data sources
    check_properties(connection)  # stats about properties 5min execution {350: 36530, 638: 11738, 615: 1057, 373: 79}
#!/usr/bin/env python
# encoding: utf-8
# =============================================================================
# XLS utilities, mainly for INSEE indicators (parsing Excel files, etc.) and mobilipass data from HiL
# https://pypi.python.org/pypi/openpyxl/
# https://xlrd.readthedocs.io/en/latest/api.html
# =============================================================================

import sys
from mongiris import config
import xlrd  # for old Excel files (.xls) and new files (.xlsx Excel 2010)
#import openpyxl  # for new Excel files (.xlsx Excel 2010)


def parse_xls_to_dict(xls_file_path):
    """
    Parse an XLS file (excel/calc) produced by INSEE and containing indicators about IRIS.
    :param xls_file_path: the path to the XLS file to be parsed
    :return: indicator_metadata: a list of dict containing information about indicator labels (shortname, fullname)
    :return: indicators: a dictionary such as {id_iris1: {ind1: val1, ind2: val2, ...}, id_iris2: {ind1: val1, ...}, ...}
    :return: source_metadata: a dictionary containing metadata information about the document (title, filepath, etc.)
    """
    indicators = {}
    source_metadata = {}  # metadata about the document (name, publication date, infoGeo, etc.)
    indicator_metadata = []  # one dict per indicator (short and full labels, source files)
    try:
        # ragged_rows=True avoids trailing empty cells at the end of rows
        workbook = xlrd.open_workbook(xls_file_path, ragged_rows=True)
    except Exception as e:
        sys.exit(f'Error while parsing xlsx file {xls_file_path}: {e}')
    # data is stored in the first sheet (sometimes named "IRIS", sometimes "IRIS_DEC")
    sheet = workbook.sheet_by_index(0)

    # extracting source metadata from the first three cells of column A
    source_metadata['filepath'] = xls_file_path  # filepath of the document
    source_metadata['title'] = sheet.cell_value(0, 0)  # title of the document
    source_metadata['infoGeo'] = sheet.cell_value(1, 0)  # geographic information (area + level of granularity)
    publication = sheet.cell_value(2, 0)
    if publication.startswith("Mise en ligne le "):
        publication = publication[17:27]  # keep only the 10-character date
    source_metadata['datePublication'] = publication  # date of online publication

    # extracting labels/fieldnames: row 4 holds the long labels, row 5 the short ones
    # (the short labels are usually not meaningful)
    long_fieldnames = sheet.row_values(4)
    short_fieldnames = sheet.row_values(5)
    for position, shortname in enumerate(short_fieldnames):
        indicator_metadata.append({config.geojson_shortname_label: shortname,
                                   config.geojson_longname_label: long_fieldnames[position],
                                   config.geojson_from_files_label: [xls_file_path]})

    # extracting indicator values: one row per IRIS, starting at row 6
    nb_fields = len(short_fieldnames)
    for row in range(6, sheet.nrows):
        iris_id = sheet.cell_value(row, 0)  # the IRIS id is in the first column
        # print(sheet.row_values(row))
        if sheet.row_len(row) == nb_fields:  # some rows may not include all fields
            values = indicators.setdefault(iris_id, {})
            for col in range(nb_fields):
                # key is the short fieldname  # + " - " + long_fieldnames[col]
                values[short_fieldnames[col]] = sheet.cell_value(row, col)
        else:
            print("\tIgnored row (missing fields) : " + str(sheet.row_values(row)))
    return indicator_metadata, indicators, source_metadata


def parse_data_mobilipass_to_dict(xls_file_path):
    """Parse a mobilipass XLS file and return (long_fieldnames, data_by_person_id)."""
    data = {}
    try:
        # ragged_rows=True avoids trailing empty cells at the end of rows
        workbook = xlrd.open_workbook(xls_file_path, ragged_rows=True)
    except Exception as e:
        sys.exit('Error while parsing XLS file {}: {}'.format(xls_file_path, e))
    sheet = workbook.sheet_by_name("Mobilipass - 6")  # ("Mobilipass")
    long_fieldnames = sheet.row_values(1)  # row 2 contains the long labels
    nb_fields = len(long_fieldnames)
    for row in range(2, sheet.nrows):
        person_id = sheet.cell_value(row, 0)  # the person id is in the first column
        if sheet.row_len(row) == nb_fields:  # some rows may not include all fields
            record = data.setdefault(person_id, {})
            for col in range(nb_fields):
                record[long_fieldnames[col]] = sheet.cell_value(row, col)
        else:
            print("Ignored row: " + str(sheet.row_values(row)))
    return long_fieldnames, data
0000000000000000000000000000000000000000..3150f1011b812573d0a222091dcdbd5c267f9c7d --- /dev/null +++ b/mongiris.egg-info/PKG-INFO @@ -0,0 +1,92 @@ +Metadata-Version: 1.1 +Name: mongiris +Version: 0.40 +Summary: This package is an interface for querying INSEE IRIS stored as documents in MongoDB. Requires loading the IRIS files into MongoDB prior to using this package. +Home-page: https://gitlab.liris.cnrs.fr/fduchate/mongiris +Author: Fabien Duchateau +Author-email: fabien.duchateau@univ-lyon1.fr +License: UNKNOWN +Download-URL: https://gitlab.liris.cnrs.fr/fduchate/mongiris +Description: # mongiris package + + This Python package is an interface for querying French administrative areas ([IRIS](https://www.insee.fr/fr/metadonnees/definition/c1523), similar to neighborhoods) stored as documents in MongoDB. + + Each IRIS includes indicators (e.g., average income, types of housings, number of bakeries or schools) that are useful for social sciences studies, for house/neighborhood recommendation, etc. + + In this package, the ~50,000 IRIS and their 350-650 indicators have been integrated and stored in the [GeoJSON format](https://geojson.org/), and an API enables the manipulation of these data. + + ## Prerequisites + + - Python, version >=3 + - [MongoDB](https://www.mongodb.com/), version >=4, in which it is necessary to import the IRIS database (see Installation). + + ## Installation + + To install mongiris (and its dependencies): + + ``` + python3 -m pip install git+https://fduchate@gitlab.liris.cnrs.fr/fduchate/mongiris.git#egg=mongiris + ``` + + + Next, you need to load the IRIS data into MongoDB (using the `mongorestore` tool). + - download the [dump of the database](https://gitlab.liris.cnrs.fr/fduchate/mongiris/raw/master/mongiris/data/dump/dump-dbinsee.bin) (724 MB) + - open a terminal and run: + ``` + mongorestore --archive=/path/to/dump-dbinsee.bin + ``` + + where `/path/to/` indicates the path to the downloaded dump database. 
<!--(provided with the source package mongiris in `mongiris/data/dump/dump-dbinsee.bin`).--> + This restoration may take a few minutes as the geospatial indexes are rebuilt. + + ## Usage + + In MongoDB, the database is named `dbinsee`. It contains three collections: + - `collsources` stores information about original data sources (title, release date, geographical information) + - `collindic` stores information about indicators (short label, full label, data sources in which it appears). + - `colliris` is the main collection, which stores each IRIS with its indicators (according to the [GeoJSON format](https://geojson.org/)) + + To manipulate the database, simply connect to MongoDB by creating an object of the `Mongiris` class. + Using this object, twenty methods are available for querying the data. + + Below is a minimal example of connection and queries (from `tests/dummy.py` file): + + ``` + from mongiris.api import Mongiris + + db = Mongiris() + + # return the number of documents in a collection + counts = db.count_documents(db.collection_indic, {}) + + # get complete information about iris identified with code 593500203 + iris = db.find_one_document(db.collection_iris, {"properties.CODE_IRIS": "593500203"}) + print(iris) + + # get iris which contains coordinates 3.685111, 46.514643 + iris2 = db.point_in_which_iris([3.685111, 46.514643]) + print(iris2) + ``` + + More examples, including testing geospatial queries, are available in the `tests/api_tests.py` file. 
+ + + ## Contributors + + - Fabien Duchateau, Franck Favetta (laboratory [LIRIS](https://liris.cnrs.fr/), Université Lyon 1) + + - Loïc Bonneval (laboratory [CMW](https://www.centre-max-weber.fr/), Université Lyon 2) + + + ## Acknowledgments + + Data source providers: + - [INSEE](https://www.insee.fr/) + - [IGN](http://professionnels.ign.fr/contoursiris) + + Financial support: + - Labex [Intelligence des Mondes Urbains (IMU)](http://imu.universite-lyon.fr/projet/hil) + + +Keywords: MongoDB,INSEE,IRIS,data management +Platform: UNKNOWN diff --git a/mongiris.egg-info/SOURCES.txt b/mongiris.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..b07066294f7ed0f20d99845b8c62c5f9932df559 --- /dev/null +++ b/mongiris.egg-info/SOURCES.txt @@ -0,0 +1,14 @@ +README.md +setup.cfg +setup.py +mongiris/__init__.py +mongiris/api.py +mongiris/config.py +mongiris/integrator.py +mongiris/xls_utils.py +mongiris.egg-info/PKG-INFO +mongiris.egg-info/SOURCES.txt +mongiris.egg-info/dependency_links.txt +mongiris.egg-info/not-zip-safe +mongiris.egg-info/requires.txt +mongiris.egg-info/top_level.txt \ No newline at end of file diff --git a/mongiris.egg-info/dependency_links.txt b/mongiris.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/mongiris.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/mongiris.egg-info/not-zip-safe b/mongiris.egg-info/not-zip-safe new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/mongiris.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/mongiris.egg-info/requires.txt b/mongiris.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..492c62ec2fab729d8f0082ac44cc110d184b02e1 --- /dev/null +++ b/mongiris.egg-info/requires.txt @@ -0,0 +1,2 @@ +pymongo>=3.7.2 +xlrd>=1.2.0 diff --git 
a/mongiris.egg-info/top_level.txt b/mongiris.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe329d182873557ae3588bf40ebd19449554213d --- /dev/null +++ b/mongiris.egg-info/top_level.txt @@ -0,0 +1 @@ +mongiris diff --git a/mongiris/api.py b/mongiris/api.py index 6eb822bfe667e6e33591256d51c24e2bedff5fc8..5ddf4e308af50577ba8e7eb66d05d7d2036ec878 100755 --- a/mongiris/api.py +++ b/mongiris/api.py @@ -161,6 +161,19 @@ class Mongiris: doc_json = Mongiris.bson_to_json(cursor) return doc_json + def find_all(self, collection): + """ + Finds all the elements in the given collection. + + Args: + collection: an instance of Collection + + Returns: + all_elements: all the elements in the given collection + """ + all_elements = collection.find() + return all_elements + def get_random_document(self, collection): """ Returns a random document from the given collection. """ random_iris = collection.aggregate([{"$sample": {"size": 1}}]).next() diff --git a/mongiris/data/dump/dump-dbinsee.bin b/mongiris/data/dump/dump-dbinsee.bin index 47828b69836c352e20c76d09d655dd47d6e89d50..dc0deaaf755ae5df9106ccfce9f898b10922fed0 100644 Binary files a/mongiris/data/dump/dump-dbinsee.bin and b/mongiris/data/dump/dump-dbinsee.bin differ