diff --git a/mongiris/api.py b/mongiris/api.py index 6eb822bfe667e6e33591256d51c24e2bedff5fc8..d2c0ffd40f3625d649578bfcd73ee8f72ece8ada 100755 --- a/mongiris/api.py +++ b/mongiris/api.py @@ -10,8 +10,8 @@ import pymongo from bson import json_util # used to convert BSON to JSON (especially ObjectId type of "_id") import json import logging -from mongiris import config +max_timeout = 3 # delay before connection timeout class Mongiris: """ @@ -31,6 +31,10 @@ class Mongiris: An example of IRIS following the GeoJSON format is provided in `data/example-iris.json`. + Args: + db: the database name to connect to (string) + collection: the collection name containing neighbourhoods (string) + Additional resources: - [MongoDB documentation](http://www.mongodb.org/) @@ -41,15 +45,13 @@ class Mongiris: """ - def __init__(self): + def __init__(self, db, collection): logging.basicConfig(format='[%(levelname)s] - %(name)s - %(asctime)s : %(message)s') self.logger = logging.getLogger() self.logger.setLevel(logging.INFO) self.connection, self.connection_status = self.init_connection() # default connection on 'localhost', 27017 - self.database = self.connection[config.database_iris] # database for HiL project - self.collection_iris = self.database[config.collection_iris] - self.collection_indic = self.database[config.collection_indic] - self.collection_sources = self.database[config.collection_sources] + self.database = self.connection[db] # link to database + self.collection_neighbourhoods = self.database[collection] # link to collection @staticmethod def bson_to_json(doc_bson): @@ -77,7 +79,7 @@ class Mongiris: connection_status = True # by default, connection is ok connection = None try: - connection = pymongo.MongoClient(serverSelectionTimeoutMS=config.max_timeout) # default localhost:27017 + connection = pymongo.MongoClient(serverSelectionTimeoutMS=max_timeout) # default localhost:27017 connection.server_info() # forces a query to MongoDB (for checking the connection) except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.error('Could not connect to the MongoDB database ! Have you launched MongoDB ? ' + str(e)) @@ -132,18 +134,32 @@ class Mongiris: doc_json = Mongiris.bson_to_json(doc) return doc_json - def get_iris_from_code(self, code_iris): + def find_all(self, collection): + """ + Finds all documents in the given collection. + + Args: + collection: a string representing the collection name + + Returns: + doc_json: a dictionnary representing the documents, or None + """ + doc = collection.find() + doc_json = Mongiris.bson_to_json(doc) + return doc_json + + def get_neighbourhood_from_code(self, code_neighbourhood): """ - Returns the iris identified by the given code_iris. + Returns the neighbourhood identified by the given code_neighbourhood. Args: - code_iris: a string containing the code of the searched iris + code_neighbourhood: a string containing the code of the searched neighbourhood Returns: - iris: a dictionary representing an iris, or None + neighbourhood: a dictionary representing a neighbourhood, or None """ - iris = self.find_one_document(self.collection_iris, {"properties.CODE_IRIS": code_iris}) - return iris + neighbourhood = self.find_one_document(self.collection_neighbourhoods, {"properties.CODE_IRIS": code_neighbourhood}) + return neighbourhood def find_documents(self, collection, json_query, json_projection=None): """ @@ -228,7 +244,7 @@ class Mongiris: def geo_within(self, collection, geometry, json_projection=None): """ Finds all documents from given collection and which contain totally the given geometry. - Cannot be used to find the IRIS containing a point (geometry must be a polygon). + Cannot be used to find the neighbourhood containing a point (geometry must be a polygon). Args: collection: a string representing the collection name @@ -245,7 +261,7 @@ class Mongiris: def geo_within_sphere(self, collection, sphere, json_projection=None): """ Finds all documents from given collection and which contain totally the given sphere. - Cannot be used to find the IRIS containing a point (geometry must be a polygon, with min. 3 points). + Cannot be used to find the neighbourhood containing a point (geometry must be a polygon, with min. 3 points). Args: collection: a string representing the collection name @@ -323,11 +339,12 @@ class Mongiris: """ return {"type": "Polygon", "coordinates": coordinates} - def point_in_which_iris(self, coordinates, json_projection=None): + def point_in_which_neighbourhood(self, coordinates, json_projection=None): """ - Finds the document (IRIS) containing the given coordinates. Uses near() since geo_within() requires a Polygon. - Careful: the near() operator may return several iris (low probability since distance = 1 meter) and only the - first one is returned. + Finds the document (neighbourhood) containing the given coordinates. Uses near() since geo_within() requires a + Polygon. + Careful: the near() operator may return several neighbourhoods (low probability since distance = 1 meter) and + only the first one is returned. Args: coordinates: an array of coordinates (long, lat) @@ -336,7 +353,7 @@ class Mongiris: Returns: doc_json: a json document or None """ - results = self.near(self.collection_iris, coordinates, json_projection, 1) # distance = 1 meter + results = self.near(self.collection_neighbourhoods, coordinates, json_projection, 1) # distance = 1 meter if len(results) == 0: return None return results[0] @@ -361,26 +378,26 @@ class Mongiris: return doc_json @staticmethod - def adjacent(collection, geometry, json_projection=None, distance=20, exclude_geometry_iris=None): + def adjacent(collection, geometry, json_projection=None, distance=20, exclude_geometry=None): """ - Finds all adjacent neighbors of an iris represented by geometry. - No adjacent function, so use all coordinates of an iris and find the closest iris (according to distance). - Could be done directly with near(), but near() is less accurate and thus incomplete. + Finds all adjacent neighbors of a neighbourhood represented by geometry. + No adjacent function, so use all coordinates of an iris and find the closest neighbourhood (according to + distance). Could be done directly with near(), but near() is less accurate and thus incomplete. Args: collection: a string representing the collection name geometry: a geojson geometry (Point, Polygon, etc.) json_projection: a json document indicating the fields that appear in the results distance: the maximum distance for an adjacent neighbour, in meters (10 to 50 meters are fine) - exclude_geometry_iris: the document _id of the iris represented by geometry, if it needs to be excluded + exclude_geometry: the document _id of the neighbourhood represented by geometry, if it needs to be excluded Returns: doc_json: a cursor (set of documents) """ results = list() results_ids = list() - if exclude_geometry_iris is not None: # to exclude the iris represented by geometry, add it to the results ids - results_ids.append(exclude_geometry_iris) + if exclude_geometry is not None: # to exclude the neighbourhood represented by geometry, add it to the results ids + results_ids.append(exclude_geometry) for coords in geometry["coordinates"][0]: geometry_coords = Mongiris.get_geojson_point(coords) cursor = collection.find({"geometry": {"$near": {"$geometry": geometry_coords, diff --git a/mongiris/config.py b/mongiris/config.py deleted file mode 100644 index 4fffd75edc8785f2ba606341c87056bc87c56328..0000000000000000000000000000000000000000 --- a/mongiris/config.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 -# ============================================================================= -# Configuration file -# ============================================================================= - -from os import path - -# mongoDB parameters -database_iris = "dbinsee" # name of the database -collection_iris = "colliris" # collection containing the ~50,000 IRIS and values for indicators -collection_indic = "collindic" # collection containing metadata information about indicators (short label, long label) -collection_sources = "collsources" # collection containing metadata information about sources (filepath, title, etc.) -max_timeout = 3 # delay before connection timeout - -# data parameters -insee_dir = path.join('data', 'insee', '2019-01') # the date indicates the last check for updates of INSEE files - -# labels for geojson iris files and for the json indicators dictionary -geojson_raw_indicators_label = 'raw_indicators' -geojson_grouped_indicators_label = 'grouped_indicators' -geojson_insee_files_label = 'insee_files' -geojson_shortname_label = 'short_label' -geojson_longname_label = 'full_label' -geojson_from_files_label = 'from_insee_files' - -# the following labels are metadata about an IRIS, not indicators -labels_dictionary_init = {'CODE_IRIS': {"long_fieldname": 'Code IRIS'}, 'NOM_IRIS': {"long_fieldname": 'Nom IRIS'}, - 'INSEE_COM': {"long_fieldname": 'Code postal commune'}, - 'NOM_COM': {"long_fieldname": 'Nom commune'}, 'IRIS': {"long_fieldname": 'Code IRIS'}, - 'TYP_IRIS': {"long_fieldname": 'Type IRIS'}, 'REG': {"long_fieldname": 'Code région'}, - 'DEP': {"long_fieldname": 'Code département'}, 'TRIRIS': {"long_fieldname": 'TRIRIS'}, - 'UU2010': {"long_fieldname": 'Unité urbaine'}, 'LIBIRIS': {"long_fieldname": 'Libellé IRIS'}, - 'GRD_QUART': {"long_fieldname": 'Grand quartier'}, 'LAB_IRIS': {"long_fieldname": 'Label qualité IRIS'}, - 'MODIF_IRIS': {"long_fieldname": 'Type de modification de IRIS'}, - } - -# a list of grouped indicators -labels_grouped_indicators = ['logement-resident', 'education-superieur-prive', 'animation-culturel', - 'education-secondaire-cycle1-public', 'education-secondaire-cycle2-professionnel-public', - 'animation-commerce-nonalimentaire', 'education-primaire-prive', 'espacevert', - 'service-sante', 'service-divers-public', 'education-secondaire-cycle2-professionnel-prive', - 'education-secondaire-cycle2-general-public', 'education-creche', 'animation-divertissement', - 'animation-commerce-alimentaire-proximite', 'csp', 'service-divers-prive', - 'animation-commerce-alimentaire-grandesurface', 'logement-type', - 'education-secondaire-cycle2-general-prive', 'transport-longuedistance', - 'transport-busmetrotram', 'logement-annee', 'transport-velo', 'logement-residence', - 'securite', 'education-secondaire-cycle1-prive', 'loisir', 'education-primaire-public', - 'service-justice', 'service-emploi', 'education-superieur-public', 'service-actionsociale', ] - - - - - diff --git a/mongiris/integrator.py b/mongiris/integrator.py index 78ed3a7118729ce9ffe296ea61ff0339c972f31c..d6548eb953f2f76fa5aa0ddaccc07c9ff13c08c1 100644 --- a/mongiris/integrator.py +++ b/mongiris/integrator.py @@ -18,11 +18,34 @@ # ============================================================================= import os +from os import path import logging -from mongiris import config from mongiris import xls_utils from mongiris import api +collection_indic = "collindic" +collection_sources = "collsources" +# labels for geojson iris files and for the json indicators dictionary +geojson_raw_indicators_label = 'raw_indicators' +geojson_grouped_indicators_label = 'grouped_indicators' +geojson_insee_files_label = 'insee_files' +geojson_shortname_label = 'short_label' +geojson_longname_label = 'full_label' +geojson_from_files_label = "from_insee_files" +# data parameters +insee_dir = path.join('data', 'insee', '2019-01') # the date indicates the last check for updates of INSEE files +# the following labels are metadata about an IRIS, not indicators +labels_dictionary_init = {'CODE_IRIS': {"long_fieldname": 'Code IRIS'}, 'NOM_IRIS': {"long_fieldname": 'Nom IRIS'}, + 'INSEE_COM': {"long_fieldname": 'Code postal commune'}, + 'NOM_COM': {"long_fieldname": 'Nom commune'}, 'IRIS': {"long_fieldname": 'Code IRIS'}, + 'TYP_IRIS': {"long_fieldname": 'Type IRIS'}, 'REG': {"long_fieldname": 'Code région'}, + 'DEP': {"long_fieldname": 'Code département'}, 'TRIRIS': {"long_fieldname": 'TRIRIS'}, + 'UU2010': {"long_fieldname": 'Unité urbaine'}, 'LIBIRIS': {"long_fieldname": 'Libellé IRIS'}, + 'GRD_QUART': {"long_fieldname": 'Grand quartier'}, + 'LAB_IRIS': {"long_fieldname": 'Label qualité IRIS'}, + 'MODIF_IRIS': {"long_fieldname": 'Type de modification de IRIS'}, + } + def get_all_xlsx_files(dir_sources): """ @@ -37,7 +60,7 @@ def get_all_xlsx_files(dir_sources): """ insee_files = list() for file in os.listdir(dir_sources): - filepath = os.path.join(config.insee_dir, file) + filepath = os.path.join(insee_dir, file) if os.path.isfile(filepath) and filepath.endswith(".xls"): insee_files.append(filepath) return insee_files @@ -57,24 +80,24 @@ def integrate_xls_file(connection, xls_file): indicator_metadata, indicators, source_metadata = xls_utils.parse_xls_to_dict(xls_file) # add the source metadata into collection_sources - doc = connection.find_one_document(connection.collection_sources, {"filename": xls_file}) + doc = connection.find_one_document(collection_sources, {"filename": xls_file}) if doc is None: - connection.insert_one_document(connection.collection_sources, source_metadata) + connection.insert_one_document(collection_sources, source_metadata) # add the indicators information into collection_indic for ind in indicator_metadata: - short_name = ind[config.geojson_shortname_label] - doc = connection.find_one_document(connection.collection_indic, {config.geojson_shortname_label: short_name}) + short_name = ind[geojson_shortname_label] + doc = connection.find_one_document(collection_indic, {geojson_shortname_label: short_name}) if doc is not None: # only update field from_insee_files, $addToSet does not add duplicate values - connection.update_one_document(connection.collection_indic, {config.geojson_shortname_label: short_name}, - {"$addToSet": {config.geojson_from_files_label: xls_file}}) + connection.update_one_document(collection_indic, {geojson_shortname_label: short_name}, + {"$addToSet": {geojson_from_files_label: xls_file}}) else: # add the document - connection.insert_one_document(connection.collection_indic, ind) + connection.insert_one_document(collection_indic, ind) # add the indicators values into collection_iris not_found_iris = found_iris = nb_replacements = 0 for code_iris, indics in indicators.items(): - doc = connection.get_iris_from_code(code_iris) + doc = connection.get_neighbourhood_from_code(code_iris) if doc is None: not_found_iris += 1 else: @@ -83,17 +106,18 @@ def integrate_xls_file(connection, xls_file): need_replacement = False doc_changes = {"$set": dict()} # an update document containing the new fields for ind, value in indics.items(): - if ind in config.labels_dictionary_init: # main indicator, do not replace it if already there + if ind in labels_dictionary_init: # main indicator, do not replace it if already there if ind not in doc['properties']: # general information (code iris, city name, etc.) doc_changes["$set"]["properties." + ind] = value need_replacement = True else: # raw indicator - if ind not in doc['properties'][config.geojson_raw_indicators_label]: # add this value - doc_changes["$set"]["properties." + config.geojson_raw_indicators_label + "." + ind] = value + if ind not in doc['properties'][geojson_raw_indicators_label]: # add this value + doc_changes["$set"]["properties." + geojson_raw_indicators_label + "." + ind] = value need_replacement = True if need_replacement: # replace the old doc by new doc - res = connection.update_one_document(connection.collection_iris, {'properties.CODE_IRIS': doc_id}, doc_changes) + res = connection.update_one_document(connection.collection_neighbourhoods, + {'properties.CODE_IRIS': doc_id}, doc_changes) nb_replacements += res.modified_count logger.info(f"\t\t{xls_file}: {found_iris} found iris, {not_found_iris} not found iris, {nb_replacements} updates") @@ -128,10 +152,10 @@ def check_properties(connection): Args: connection: an object representing the database connection """ - docs = connection.find_documents(connection.collection_iris, {}, ) + docs = connection.find_documents(connection.collection_neighbourhoods, {}, ) counts = dict() for doc in docs: - nb = len(doc['properties'][config.geojson_raw_indicators_label]) + nb = len(doc['properties'][geojson_raw_indicators_label]) if nb not in counts: counts[nb] = 0 counts[nb] += 1 @@ -143,12 +167,10 @@ def check_properties(connection): ######################### if __name__ == '__main__': - logging.basicConfig(format='[%(levelname)s] - %(name)s - %(asctime)s : %(message)s') logger = logging.getLogger() logger.setLevel(logging.INFO) - connection = api.Mongiris() # database connection - integrate(connection, config.insee_dir) # integrating data sources - check_properties(connection) # stats about properties 5min execution {350: 36530, 638: 11738, 615: 1057, 373: 79} - + connection_hil = api.Mongiris("dbinsee", "colliris") # database connection + integrate(connection_hil, insee_dir) # integrating data sources + check_properties(connection_hil) # stats about properties: 5min exec., {350: 36530, 638: 11738, 615: 1057, 373: 79} diff --git a/mongiris/tests/api_tests.py b/mongiris/tests/api_tests.py index 77ddba1d4cb9c611fed94c1dc32533e7910ec166..1adfc85a36d17956b8b6eff643d3f39da2a6124e 100644 --- a/mongiris/tests/api_tests.py +++ b/mongiris/tests/api_tests.py @@ -13,109 +13,109 @@ import re class TestCase(unittest.TestCase): """ A class for Mongiris unit tests. - Some tests select a random iris in the collection,so there is no assert check. + Some tests select a random neighbourhood in the collection,so there is no assert check. """ def setUp(self): # a setup (connection to MongoDB) executed before each test - self.db = Mongiris() + self.db = Mongiris('dbinsee', 'colliris') def test_count(self): # test for counting the number of documents - count = self.db.count_documents(self.db.collection_iris, {}) - assert (count == 49404), 'Error: expecting 49404 IRIS to be stored in MongoDB, but stored %i' % count + count = self.db.count_documents(self.db.collection_neighbourhoods, {}) + assert (count == 49404), f'Error: expecting 49404 neighbourhood to be stored in MongoDB, but stored {count}' def test_find_one(self): # test for finding a specific document using a field CODE_IRIS - iris = self.db.find_one_document(self.db.collection_iris, {"properties.CODE_IRIS": "593500203"}) - assert(iris is not None), 'Function get_iris_from_code("593500203") should return one document, not None' - self.db.logger.info(iris) + neighbourhood = self.db.find_one_document(self.db.collection_neighbourhoods, {"properties.CODE_IRIS": "593500203"}) + assert(neighbourhood is not None), 'Function get_iris_from_code("593500203") should return one document, not None' + self.db.logger.info(neighbourhood) def test_find_documents(self): - # test for finding documents containing a given string (either in teh city name or in the IRIS name) + # test for finding documents containing a given string (either in teh city name or in the neighbourhood name) regx = re.compile("Lyon", re.IGNORECASE) query_clause = {"$or": [{"properties.NOM_IRIS": {"$regex": regx}}, {"properties.NOM_COM": {"$regex": regx}}]} - cursor = self.db.find_documents(self.db.collection_iris, query_clause) + cursor = self.db.find_documents(self.db.collection_neighbourhoods, query_clause) for doc in cursor: self.db.logger.info(str(doc["_id"]) + "\t" + doc["properties"]["NOM_IRIS"] + "\t" + doc["properties"]["CODE_IRIS"] + "\t" + doc["properties"]["NOM_COM"]) - assert (len(cursor) == 203), "Method test_find_documents() for Lyon should find 203 iris, not " + str(len(cursor)) + assert (len(cursor) == 203), "Method test_find_documents() for Lyon should find 203 neighbourhoods, not " + str(len(cursor)) - def test_get_iris_from_code(self): - # test for retrieving an iris from code - iris = self.db.get_iris_from_code("593500203") - assert(iris is not None), 'Function find_one_document(..., "593500203") should return one document, not None' - self.db.logger.info(iris) + def test_get_neighbourhood_from_code(self): + # test for retrieving an neighbourhood from code + neighbourhood = self.db.get_neighbourhood_from_code("593500203") + assert(neighbourhood is not None), 'Function find_one_document(..., "593500203") should return one document, not None' + self.db.logger.info(neighbourhood) def test_op_geo_within(self): - # test for a geospatial query that returns all iris within a given polygon - self.db.logger.info("Finding all IRIS in area [[2.3530807599035124, 50.865983520113346], [2.607984899697411, 50.98885556985259]]") + # test for a geospatial query that returns all neighbourhood within a given polygon + self.db.logger.info("Finding all neighbourhood in area [[2.3530807599035124, 50.865983520113346], [2.607984899697411, 50.98885556985259]]") long1 = 2.3530807599035124 lat1 = 50.865983520113346 long2 = 2.607984899697411 lat2 = 50.98885556985259 polygon = Mongiris.convert_geojson_box_to_polygon(long1, lat1, long2, lat2) - cursor = self.db.geo_within(self.db.collection_iris, polygon) + cursor = self.db.geo_within(self.db.collection_neighbourhoods, polygon) for doc in cursor: self.db.logger.info(str(doc["_id"]) + "\t" + doc["properties"]["NOM_IRIS"] + "\t" + doc["properties"]["CODE_IRIS"]) - assert(len(cursor) == 12), "Method test_op_geo_within() should find 12 iris, not " + str(len(cursor)) + assert(len(cursor) == 12), "Method test_op_geo_within() should find 12 neighbourhoods, not " + str(len(cursor)) def test_op_near(self): - # test for a geospatial query that returns all iris within a distance of a given iris - random_iris = self.db.get_random_document(self.db.collection_iris) + # test for a geospatial query that returns all neighbourhood within a distance of a given neighbourhood + random_iris = self.db.get_random_document(self.db.collection_neighbourhoods) point_random_iris = random.choice(random_iris["geometry"]["coordinates"][0]) # get a random coordinate - self.db.logger.info("Finding all near IRIS for " + random_iris["properties"]["NOM_IRIS"] + " " + + self.db.logger.info("Finding all near neighbourhoods for " + random_iris["properties"]["NOM_IRIS"] + " " + random_iris["properties"]["CODE_IRIS"]) distance_max = 3000 # in meters - cursor = self.db.near(self.db.collection_iris, point_random_iris, {"properties.NOM_IRIS": 1, "properties.CODE_IRIS": 1}, + cursor = self.db.near(self.db.collection_neighbourhoods, point_random_iris, {"properties.NOM_IRIS": 1, "properties.CODE_IRIS": 1}, distance_max) for doc in cursor: self.db.logger.info(str(doc["_id"]) + "\t" + doc["properties"]["NOM_IRIS"] + "\t" + doc["properties"]["CODE_IRIS"]) def test_op_intersect(self): - # test for a geospatial query that returns all iris that intersect a given iris - random_iris = self.db.get_random_document(self.db.collection_iris) + # test for a geospatial query that returns all neighbourhood that intersect a given neighbourhood + random_iris = self.db.get_random_document(self.db.collection_neighbourhoods) geometry_random_iris = random_iris["geometry"] - self.db.logger.info("Finding all intersecting IRIS for " + random_iris["properties"]["NOM_IRIS"] + " " + + self.db.logger.info("Finding all intersecting neighbourhoods for " + random_iris["properties"]["NOM_IRIS"] + " " + random_iris["properties"]["CODE_IRIS"]) - cursor = self.db.intersect(self.db.collection_iris, geometry_random_iris, {"properties.NOM_IRIS": 1, "properties.CODE_IRIS": 1}) + cursor = self.db.intersect(self.db.collection_neighbourhoods, geometry_random_iris, {"properties.NOM_IRIS": 1, "properties.CODE_IRIS": 1}) for doc in cursor: self.db.logger.info(str(doc["_id"]) + "\t" + doc["properties"]["NOM_IRIS"] + "\t" + doc["properties"]["CODE_IRIS"]) def test_op_adjacent(self): - # test for a geospatial query that returns all iris adjacent to a given iris (or very close, < 10 meters) - random_iris = self.db.get_random_document(self.db.collection_iris) + # test for a geospatial query that returns all neighbourhood adjacent to a given neighbourhood (or very close, < 10 meters) + random_iris = self.db.get_random_document(self.db.collection_neighbourhoods) geometry_random_iris = random_iris["geometry"] - self.db.logger.info("Finding all adjacent IRIS for " + random_iris["properties"]["NOM_IRIS"] + " " + + self.db.logger.info("Finding all adjacent neighbourhoods for " + random_iris["properties"]["NOM_IRIS"] + " " + random_iris["properties"]["CODE_IRIS"]) distance_max = 10 # in meters - results = self.db.adjacent(self.db.collection_iris, geometry_random_iris, {"properties.NOM_IRIS": 1, "properties.CODE_IRIS": 1}, + results = self.db.adjacent(self.db.collection_neighbourhoods, geometry_random_iris, {"properties.NOM_IRIS": 1, "properties.CODE_IRIS": 1}, distance_max, random_iris["_id"]) for doc in results: self.db.logger.info(str(doc["_id"]) + "\t" + doc["properties"]["NOM_IRIS"] + "\t" + doc["properties"]["CODE_IRIS"]) - def test_op_point_in_which_iris(self): - # tests for a geospatial query that returns the iris (or None) which contains a given point + def test_op_point_in_which_neighbourhood(self): + # tests for a geospatial query that returns the neighbourhood (or None) which contains a given point # test 1 - self.db.logger.info("Finding IRIS for (3.685111, 46.514643)") - doc = self.db.point_in_which_iris([3.685111, 46.514643]) + self.db.logger.info("Finding neighbourhood for (3.685111, 46.514643)") + doc = self.db.point_in_which_neighbourhood([3.685111, 46.514643]) assert (doc is not None), 'Coordinates (3.685111, 46.514643) should return one document, not None' - assert (doc["properties"]["CODE_IRIS"] == '031020000'), 'Coordinates (3.685111, 46.514643) should correspond to IRIS 031020000' + assert (doc["properties"]["CODE_IRIS"] == '031020000'), 'Coordinates (3.685111, 46.514643) should correspond to neighbourhood 031020000' self.db.logger.info(str(doc["_id"]) + "\t" + doc["properties"]["NOM_IRIS"] + "\t" + doc["properties"]["CODE_IRIS"]) # test 2 - self.db.logger.info("Finding IRIS for (5.685111, 46.514643)") - doc = self.db.point_in_which_iris([5.685111, 46.514643]) + self.db.logger.info("Finding neighbourhood for (5.685111, 46.514643)") + doc = self.db.point_in_which_neighbourhood([5.685111, 46.514643]) assert (doc is not None), 'Coordinates (5.685111, 46.514643) should return one document, not None' - assert (doc["properties"]["CODE_IRIS"] == '391750000'), 'Coordinates (5.685111, 46.514643) should correspond to IRIS 391750000' + assert (doc["properties"]["CODE_IRIS"] == '391750000'), 'Coordinates (5.685111, 46.514643) should correspond to neighbourhood 391750000' self.db.logger.info(str(doc["_id"]) + "\t" + doc["properties"]["NOM_IRIS"] + "\t" + doc["properties"]["CODE_IRIS"]) # test 3 - self.db.logger.info("Finding IRIS for (-49.016200, 79.333879)") - doc = self.db.point_in_which_iris([-49.016200, 79.333879]) - assert (doc is None), 'Coordinates (79.333879, -49.016200) should return no document (no corresponding IRIS)' + self.db.logger.info("Finding neighbourhood for (-49.016200, 79.333879)") + doc = self.db.point_in_which_neighbourhood([-49.016200, 79.333879]) + assert (doc is None), 'Coordinates (79.333879, -49.016200) should return no document (no corresponding neighbourhood)' def test_test(self): # for testing code coord = [4.8300768, 45.720019] - cursor = self.db.near(self.db.collection_iris, coord, {"properties.NOM_IRIS": 1, "properties.CODE_IRIS": 1}, + cursor = self.db.near(self.db.collection_neighbourhoods, coord, {"properties.NOM_IRIS": 1, "properties.CODE_IRIS": 1}, 1000) for doc in cursor: self.db.logger.info(str(doc["_id"]) + "\t" + doc["properties"]["NOM_IRIS"] + "\t" + doc["properties"]["CODE_IRIS"]) diff --git a/paper.bib b/paper.bib deleted file mode 100644 index 9f41b76d322aea54c62856dffde3bdee6a444792..0000000000000000000000000000000000000000 --- a/paper.bib +++ /dev/null @@ -1,91 +0,0 @@ -@INPROCEEDINGS{egc19-demo, - author = {Nelly Barret and Fabien Duchateau and Franck Favetta and Maryvonne Miquel and Aurélien Gentil and Loïc Bonneval}, - year = {2019}, - title = {À la recherche du quartier idéal}, - booktitle = {Extraction et Gestion des Connaissances (EGC)}, - pages = {429–432}, -} - -@book{christen2012data, - title={Data matching: concepts and techniques for record linkage, entity resolution, and duplicate detection}, - author={Christen, Peter}, - year={2012}, - publisher={Springer Science \& Business Media} -} - -@article{RealEstate2013, -title = "Toward a user-oriented recommendation system for real estate websites", -journal = "Information Systems", -volume = "38", -number = "2", -pages = "231-243", -year = "2013", -issn = "0306-4379", -doi = "https://doi.org/10.1016/j.is.2012.08.004", -url = "http://www.sciencedirect.com/science/article/pii/S0306437912001081", -author = "Xiaofang Yuan and Ji-Hyun Lee and Sun-Joong Kim and Yoon-Hyun Kim", -keywords = "Home buyer, Real estate website, Housing search behavior, Case-based recommendation system, Ontology" -} - -@article{le2015soho, - title={{Where Is the Soho of Rome? Measures and Algorithms for Finding Similar Neighborhoods in Cities}}, - author={Le Falher, G{\'e}raud and Gionis, Aristides and Mathioudakis, Michael}, - journal={ICWSM}, - volume={2}, - pages={3--2}, - year={2015} -} - -@misc{datafrance, - title={DataFrance}, - howpublished={https://datafrance.info/}, - year=2018 -} - -@misc{insee-iris, - title={{Definition of IRIS}}, - author={INSEE}, - howpublished={http://www.insee.fr/en/metadonnees/definition/c1523}, - year=2016 -} - -@inproceedings{airbnb2017, - title={{Comment les h{\^o}tes et clients d'Airbnb parlent-ils des lieux ? Une analyse exploratoire {\`a} partir du cas parisien}}, - booktitle={EXCES-EXtraction de Connaissances {\`a} partir de donn{\'e}Es Spatialis{\'e}es}, - author={Marianne Gu{\'e}rois and Malika Madelin}, - year={2017} -} - -@misc{tang2015neighborhood, - title={Neighborhood and price prediction for San Francisco Airbnb listings}, - author={Tang, Emily and Sangani, Kunal}, - year={2015}, - publisher={Stanford Univ., Stanford, CA, USA, Tech. Rep} -} - -@article{preteceille2009segregation, - title={La s{\'e}gr{\'e}gation ethno-raciale a-t-elle augment{\'e} dans la m{\'e}tropole parisienne?}, - author={Pr{\'e}teceille, Edmond}, - journal={Revue fran{\c{c}}aise de sociologie}, - volume={50}, - number={3}, - pages={489--519}, - year={2009}, - publisher={Editions Technip \& Ophrys} -} - -@article{authier2008citadins, - title={Les citadins et leur quartier.}, - author={Authier, Jean-Yves}, - journal={L'Ann{\'e}e sociologique}, - volume={58}, - number={1}, - pages={21--46}, - year={2008}, - publisher={Presses Universitaires de France}, - url={https://www.cairn.info/revue-l-annee-sociologique-2008-1-page-21.html} -} - - - - diff --git a/paper.md b/paper.md deleted file mode 100644 index ec8679c9fcdb0d96a9aa5e773b2626bfde3f36b4..0000000000000000000000000000000000000000 --- a/paper.md +++ /dev/null @@ -1,66 +0,0 @@ ---- -title: 'Mongiris: a package for manipulating IRIS' -tags: - - Python - - MongoDB - - data management - - IRIS neighborhood - - INSEE indicators -authors: - - name: Fabien Duchateau - orcid: 0000-0001-6803-917X - affiliation: 1 - - name: Franck Favetta - orcid: 0000-0003-2039-3481 - affiliation: 1 -affiliations: - - name: LIRIS UMR5205, Université Claude Bernard Lyon 1, Lyon, France - index: 1 -date: 11 August 2019 -bibliography: paper.bib ---- - -# Statement of Need - -When studying geographical areas such as neighborhoods, it is necessary to collect various data according to the application domain (e.g., types of neighborhood, statistics such as criminality or unmployment rates, points of interests). -For instance, social science researchers study the relationship between citizens and their living area [@preteceille2009segregation;@authier2008citadins] or how they describe their neighborhood [@airbnb2017]. Computer science researchers are interested in recommending the most relevant neighborhood when buying a house [@RealEstate2013], in predicting price and types of neighborhoods [@tang2015neighborhood] or in detecting similar areas between different cities [@le2015soho]. - -National institutions (e.g., Open Data initiatives, INSEE in France) may produce data about neighborhoods, but they are usually spread in heterogenous files (databases, spreadsheets). Initiatives such as DataFrance [@datafrance] enable their visualization on a map, but their authors do not share collected data. -Thus, researchers have to manually collect and integrate raw data from national institutions, a challenging issue refered to as `data integration` [@christen2012data]. Although some tools such as OpenRefine or Talend facilitates this integration, they require expert knowledge and programming skills. Besides, spatial queries (e.g., neighborhoods located within a close distance), which are useful in a research context, are usually not directly available. -The French administration provides data about IRIS [@insee-iris], a small division unit of the national territory for statistical purposes (mostly with the same number of residents, thus mainly small-sized in cities and wider in rural areas). -To ease the exploitation of IRIS, we propose the package Mongiris, which includes integrated data about these neighborhoods (IRIS) and an API for manipulating them. - -# Summary - -The Python package is composed of two modules: integration and API. - -The `integration module` is responsible for extracting information from data sources. The module currently supports spreadsheets produced by [INSEE](https://www.insee.fr/). -Since data evolve (e.g., statistics from INSEE are updated every few years), the integration module may be run. Note that new data may be stored in different database or collections so that the evolution can be studied. -For most users, there is no need to use the integration module since a recent dump of the database is provided. It is mainly based on INSEE files from 2014 and 2016. -The current dump contains roughly 37,000 IRIS with 375 indicators and 12,800 IRIS with 640 indicators. -<!-- {362: 36530, 650: 11738, 627: 1057, 385: 79} --> - -The `API module` includes common operations such as searching for an IRIS (by IRIS code or according to any field value), inserting, updating or deleting an IRIS. -It also provides geospatial operations: get IRIS from coordinates, get all adjacent or close IRIS from a given IRIS, find all IRIS in a given area, etc. - -The Mongiris package is currently used in Mapiris, a tool for visualizing and searching for IRIS. - - - -It also powers VizLiris, a prototype for clustering similar IRIS or for recommending relevant neighborhoods according to user needs [@egc19-demo]. - -<!-- -{ width=50%} -{ width=50% } ---> - -{ width=50%} \ { width=50% } -\begin{figure}[!h] -\caption{Screenshot of VizLiris - clustering (left) and recommendation (right)} -\end{figure} - -# Acknowledgements - -This work has been partially funded by LABEX IMU (ANR-10-LABX-0088) from Université de Lyon, in the context of the program "Investissements d'Avenir" (ANR-11-IDEX-0007) from the French Research Agency (ANR). - -# References diff --git a/paper.pdf b/paper.pdf deleted file mode 100644 index d4c30ffffb7e54cdf5675a481015adebe4783b2e..0000000000000000000000000000000000000000 Binary files a/paper.pdf and /dev/null differ diff --git a/setup.cfg b/setup.cfg index 922bd043fac14fd7f27050c3bf55353f8766fc30..7a324b2bfb302cd48942eb764813e337d7c51a26 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ [metadata] name = mongiris -version = 0.40 +version = 0.50 description = This package is an interface for querying INSEE IRIS stored as documents in MongoDB. Requires loading the IRIS files into MongoDB prior to using this package. author = Fabien Duchateau author_email = fabien.duchateau@univ-lyon1.fr