From 637949fa23ff4e10d71f37f34a43bae08e944cc9 Mon Sep 17 00:00:00 2001
From: jfize <jacques.fize@insa-lyon.fr>
Date: Mon, 3 Feb 2020 15:04:02 +0100
Subject: [PATCH] Add train/test split on geographic criteria + NgramIndex state can be saved + add docs to utils classes and functions + add multilingual support in the Wikidata extraction + update README and requirements

---
 .gitignore | 1 +
 1_extractDataFromWikidata.py | 25 +-
 README.md | 37 +-
 combination_embeddings.py | 88 ++++-
 .../toponym_combination_embedding.json | 6 +-
 requirements.txt | 5 +-
 train_test_split_geonames.py | 83 +++++
 utils.py | 347 ++++++++++++++++--
 8 files changed, 518 insertions(+), 74 deletions(-)
 create mode 100644 train_test_split_geonames.py

diff --git a/.gitignore b/.gitignore
index 9f23506..d004fc1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -143,3 +143,4 @@ WikipediaExtract/*
 test_comb.sh
 .vscode/*
+notes.md
\ No newline at end of file
diff --git a/1_extractDataFromWikidata.py b/1_extractDataFromWikidata.py
index 64c777c..3048c2c 100644
--- a/1_extractDataFromWikidata.py
+++ b/1_extractDataFromWikidata.py
@@ -34,17 +34,20 @@ def job(line):
     try:
         data = json.loads(line.strip(",\n"))
         if "sitelinks" in data and "claims" in data:
-            if "enwiki" in data["sitelinks"]:
-                id_ = data["id"]
-                coords_data = data["claims"]["P625"][0]["mainsnak"]["datavalue"]["value"]
-                title = data["sitelinks"]["enwiki"]["title"]
-                url = "https://en.wikipedia.org/wiki/{0}".format(title.replace(" ","_"))
-                lat = coords_data["latitude"]
-                lon = coords_data["longitude"]
-                classes_ = ""
-                for claimP31 in data["claims"]["P31"]:
-                    classes_ = classes_ + "_"+ str(claimP31["mainsnak"]["datavalue"]["value"]["id"])
-                output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(id_,title,url,lat,lon,classes_.strip("_")))
+            if "enwiki" in data["sitelinks"] or "frwiki" in data["sitelinks"]:
+                page_available = [i for i in ["en","fr"] if i+"wiki" in data["sitelinks"]]
+                for site in page_available:
+                    id_ = data["id"]
+                    coords_data = data["claims"]["P625"][0]["mainsnak"]["datavalue"]["value"]
+                    title = data["sitelinks"]["{0}wiki".format(site)]["title"]
+                    url = "https://{1}.wikipedia.org/wiki/{0}".format(title.replace(" ","_"),site)
+                    lat = coords_data["latitude"]
+                    lon = coords_data["longitude"]
+                    classes_ = ""
+                    for claimP31 in data["claims"]["P31"]:
+                        classes_ = classes_ + "_"+ str(claimP31["mainsnak"]["datavalue"]["value"]["id"])
+                    output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(id_,title,url,lat,lon,classes_.strip("_")))
     except Exception: # First Line is "['" and last line is "]'"
         pass
diff --git a/README.md b/README.md
index de19b3a..84d5178 100644
--- a/README.md
+++ b/README.md
@@ -88,19 +88,32 @@ Gensim word2vec format is saved in the execution directory.
 
 ## Embedding : train using concatenation of close places
 
+### Prepare required data
+
+ * download the Geonames data used to train the network [here](http://download.geonames.org/export/dump/)
+ * download the hierarchy data [here](http://download.geonames.org/export/dump/hierarchy.zip)
+ * unzip both files in the directory of your choice
+ * run the script `train_test_split_geonames.py <geoname_filename>`
 
-Toponym Combination
+### Train the network
 
-positional arguments:
-  geoname_input         Filepath of the Geonames file you want to use.
-  geoname_hierachy_input
-                        Filepath of the Geonames file you want to use.
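Illustrative sketch (not part of the commit): the split step listed under "Prepare required data" writes `<geoname_file>_train.csv` and `<geoname_file>_test.csv` next to the input file (see `train_test_split_geonames.py` further down in this patch). A minimal way to consume that output might look as follows, assuming the example path `data/geonamesData/FR.txt` used elsewhere in this commit:

```python
# Sketch only: load the train/test split produced by train_test_split_geonames.py.
# The script appends "_train.csv" and "_test.csv" to the Geonames file path it was given.
import pandas as pd

geoname_fn = "data/geonamesData/FR.txt"   # example path, adjust to your layout
train = pd.read_csv(geoname_fn + "_train.csv", index_col=0)
test = pd.read_csv(geoname_fn + "_test.csv", index_col=0)
print(len(train), "training places /", len(test), "test places")
```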
+The script `combination_embeddings.py` is responsible for training the neural network.
 
-optional arguments:
-  -h, --help            show this help message and exit
-  -v, --verbose
-  -n NGRAM_SIZE, --ngram-size NGRAM_SIZE
-  -t TOLERANCE_VALUE, --tolerance-value TOLERANCE_VALUE
-  -e EPOCHS, --epochs EPOCHS
-  -m {CNN,LSTM}, --model {CNN,LSTM}
\ No newline at end of file
+To train the network with default parameters, use the following command:
+
+    python3 combination_embeddings.py -a -i <geoname data filename> <hierarchy geonames data filename>
+
+### Available parameters
+
+| Parameter            | Description                                                          |
+|----------------------|----------------------------------------------------------------------|
+| -i,--inclusion       | Use inclusion relationships to train the network                     |
+| -a,--adjacency       | Use adjacency relationships to train the network                     |
+| -w,--wikipedia-cooc  | Use Wikipedia place co-occurrences to train the network              |
+| -n,--ngram-size      | ngram size                                                           |
+| -t,--tolerance-value | K-value in the computation of the accuracy@k                         |
+| -e,--epochs          | number of epochs                                                     |
+| -d,--dimension       | size of the ngram embeddings                                         |
+| -m,--model           | Neural Network architecture used                                     |
+| --admin_code_1       | (Optional) If you wish to train the network on a specific region     |
diff --git a/combination_embeddings.py b/combination_embeddings.py
index 72a8f3b..7913784 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -1,4 +1,5 @@
 # Base module
+import re
 import os
 import sys
 from argparse import ArgumentParser
@@ -40,6 +41,25 @@ def tqdm(*args, **kwargs):
     for instance in list(tqdm_base._instances):
         tqdm_base._decr_instances(instance)
     return tqdm_base(*args, **kwargs)
+
+
+def parse_title_wiki(title_wiki):
+    return re.sub(r"\(.*\)","",title_wiki).strip().lower()
+
+def get_new_ids(cooc_data,id_first_value):
+    topo_id = {}
+    id_ = id_first_value
+    for title in cooc_data.title.values:
+        if not title in topo_id.values():
+            id_+=1
+            topo_id[id_]=title
+    for interlinks in cooc_data.interlinks.values:
+        for interlink in interlinks.split("|"):
+            if not interlink in topo_id.values():
+                id_+=1
+                topo_id[id_]=interlink
+    return topo_id
+
 # Logging
 import logging
 from chrono import Chronometer
@@ -50,7 +70,7 @@ logging.basicConfig(
     )
 chrono = Chronometer()
 
-args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()#("-i -a -n 2 -t 0.002 -e 5 -m CNN data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
+args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()#("--admin_code_1 94 -n 2 -t 0.002 -e 100 -m LSTM -a -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
 
 GEONAME_FN = args.geoname_input
 GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
@@ -65,6 +85,11 @@ if args.model == "CNN":
 else:
     LSTM_train = True
 
+# check for output dir
+if not os.path.exists("outputs/"):
+    os.makedirs("outputs/")
+
+
 # LOAD DATA
 logging.info("Load Geonames data...")
 geoname_data = read_geonames(GEONAME_FN).fillna("")
@@ -78,45 +103,74 @@ logging.info("Geonames data loaded!")
 
 # SELECT ENTRY with class == to A and P (Areas and Populated Places)
 filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
 
+admin_id_authorised_auth = "1 2 3 4 5 6 11 24 27 28 32 44 52 53 75 76 84 93 94".split()
+region_fn = "" if args.admin_code_1 == None else "_"+args.admin_code_1
+if args.admin_code_1 != None and args.admin_code_1 in admin_id_authorised_auth:
+    filtered = filtered[filtered.admin1_code ==
args.admin_code_1].copy() + # Geometry operation filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) filtered = gpd.GeoDataFrame(filtered) filtered["i"]=1 bounds = filtered.dissolve("i").bounds.values[0] # Required to get adjacency relationships +geoname2name = dict(filtered["geonameid name".split()].values) + + -rel_dict ={} +rel_store = [] if args.adjacency: - logging.info("Retrieve inclusion relationships ! ") - fn = "data/geonamesData/{0}_{1}_adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY) + # RETRIEVE ADJACENCY REL + logging.info("Retrieve adjacency relationships ! ") + fn = "data/geonamesData/{0}_{1}{2}adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY,region_fn) if not os.path.exists(fn): g = Grid(*bounds,[360,180]) g.fit_data(filtered) [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))] - rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)])) - json.dump(rel_dict,open(fn,'w')) + rel_store.extend([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)]) + json.dump(rel_store,open(fn,'w')) else: logging.info("Open and load data from previous computation!") - rel_dict.update({int(k):int(v) for k,v in json.load(open(fn)).items()}) - logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_dict))) + rel_store=[[int(couple[0]),int(couple[1])] for couple in json.load(open(fn))] + logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) if args.inclusion: # RETRIEVE INCLUSION RELATIONSHIPS logging.info("Retrieve inclusion relationships ! ") - geoname2name = dict(filtered["geonameid name".split()].values) filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name)) - rel_dict.update(dict(hierarchy_data[filter_mask]["childId parentId".split()].values)) + rel_store.extend((hierarchy_data[filter_mask]["childId parentId".split()].values.tolist())) logging.info("{0} inclusion relationships retrieved ! 
".format(len(hierarchy_data[filter_mask]))) +if args.wikipedia_cooc: + cooc_data = pd.read_csv("./data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1],sep="\t") + cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) + cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) + id_wikipediatitle = get_new_ids(cooc_data,geoname_data.geonameid.max()) + wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} + title_coord = {row.title: (row.longitude,row.latitude) for _,row in cooc_data.iterrows()} + # ENCODING NAME USING N-GRAM SPLITTING logging.info("Encoding toponyms to ngram...") index = NgramIndex(NGRAM_SIZE) filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all ngram available filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding max_len = filtered.encode_name.apply(len).max() # Retrieve the encodings max length + +if args.wikipedia_cooc: + [index.split_and_add(x) for x in id_wikipediatitle.values()] + idwiki_encoded = {id_: index.encode(toponym) for id_,toponym in id_wikipediatitle.items()} + max_len = max(max_len,max([len(enc) for _,enc in idwiki_encoded.items()])) + +index.max_len = int(max_len) # For Index state dump + filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Expend encodings with size < max_len geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) #init a dict with the 'geonameid' --> 'encoded toponym' association + +if args.wikipedia_cooc: + idwiki_encoded = {id_: index.complete(enc,max_len) for id_,enc in idwiki_encoded.items()} + +index.save("outputs/index_{0}gram_{1}".format(NGRAM_SIZE,GEONAME_FN.split("/")[-1])) logging.info("Done !") #CLEAR RAM @@ -129,6 +183,8 @@ filtered["cell_vec"]=filtered.apply( axis=1 ) geoname_vec = dict(filtered["geonameid cell_vec".split()].values) +if args.wikipedia_cooc: + wikipediaid_vec = {wikipediatitle_id[title]: zero_one_encoding(*title_coord[title]) for title in cooc_data.title.values} # CLEAR RAM del filtered @@ -142,9 +198,8 @@ logging.info("Preparing Input and Output data...") X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] -for geonameId_1,geonameId_2 in rel_dict.items(): - if not geonameId_2 in rel_dict: - continue +for couple in rel_store: + geonameId_1,geonameId_2 = couple[0],couple[1] top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] if geonameId_1 in train_indices: #and geonameId_2 in train_indices: @@ -190,7 +245,7 @@ def accuracy_at_k(y_true, y_pred): fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE)) return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred) -name = "{0}_{1}_{2}_{3}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE) +name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn) if args.adjacency: name+="_A" if args.inclusion: @@ -263,4 +318,7 @@ if CONV : history = model.fit(x=[X_1_train,X_2_train], y=[y_lon_train,y_lat_train], verbose=True, batch_size=100, epochs=EPOCHS,validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test])) hist_df = pd.DataFrame(history.history) -hist_df.to_csv("outputs/{0}.csv".format(name)) \ No newline at end of file +hist_df.to_csv("outputs/{0}.csv".format(name)) + +model.save("outputs/"+name+".h5") + diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json index 22249bb..9f3fe94 100644 --- 
a/parser_config/toponym_combination_embedding.json +++ b/parser_config/toponym_combination_embedding.json @@ -6,11 +6,13 @@ { "short": "-v", "long": "--verbose", "action": "store_true" }, { "short": "-i", "long": "--inclusion", "action": "store_true" }, { "short": "-a", "long": "--adjacency", "action": "store_true" }, - {"long": "--adjacency-iteration", "type":"int","default":10}, + { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" }, + {"long": "--adjacency-iteration", "type":"int","default":50}, { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 }, { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, - { "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" } + { "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" }, + { "long": "--admin_code_1", "default": "None" } ] } \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8919e09..c5b83fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -#pyroutelib3 +pyroutelib3 node2vec -#osrm +osrm geopandas pandas numpy @@ -15,3 +15,4 @@ keras ngram shapely sqlitedict +nltk \ No newline at end of file diff --git a/train_test_split_geonames.py b/train_test_split_geonames.py new file mode 100644 index 0000000..d8f8962 --- /dev/null +++ b/train_test_split_geonames.py @@ -0,0 +1,83 @@ +import argparse + +import numpy as np +import pandas as pd +import geopandas as gpd + +import logging +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) + +from sklearn.model_selection import train_test_split +from shapely.geometry import Point + +from utils import Grid +from helpers import read_geonames + +from tqdm import tqdm + +parser = argparse.ArgumentParser() +parser.add_argument("geoname_file") +parser.add_argument("--feature_classes",help="List of class",default="A P") + +args = parser.parse_args()#("data/geonamesData/FR.txt".split()) + +# LOAD DATAgeopandas +GEONAME_FN = args.geoname_file +FEATURE_CLASSES = args.feature_classes + + +logging.info("Load Geonames data...") +geoname_data = read_geonames(GEONAME_FN).fillna("") +logging.info("Geonames data loaded!") + +# SELECT ENTRY with class == to A and P (Areas and Populated Places) +filtered = geoname_data[geoname_data.feature_class.isin(FEATURE_CLASSES.split())].copy() # Only take area and populated places + +# World Shape bounds +world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) +world["nn"] = 1 +dissolved = world.dissolve(by="nn").iloc[0].geometry + +#Creating Grid +logging.info("Initializing Grid (360,180)...") +g = Grid(*dissolved.bounds,[360,180]) +logging.info("Fit Data to the Grid...") +g.fit_data(filtered) +logging.info("Placing place into the grid...") +[g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered.iterrows(),total=len(filtered))] + +#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME +logging.info("Associate a cell number to each place in the Geoname Dataframe") +def foo(g,id_): + for ix,cell in enumerate(g.cells): + if id_ in cell.list_object: + return ix + +filtered["cat"] = filtered.geonameid.apply(lambda x:foo(g,x)) + +# TRAIN AND TEST SPLIT +logging.info("Split Between Train and Test") + +# Cell can be empty +i=0 +while 1: + if 
len(filtered[filtered.cat == i])> 1: + X_train,X_test = train_test_split(filtered[filtered.cat == i]) + break + i+=1 + +for i in range(i+1,len(g.cells)): + try: + x_train,x_test = train_test_split(filtered[filtered.cat == i]) + X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test)) + except: + pass #print("Error",len(filtered[filtered.cat == i])) + +# SAVING THE DATA +logging.info("Saving Output !") +X_train.to_csv(GEONAME_FN+"_train.csv") +X_test.to_csv(GEONAME_FN+"_test.csv") \ No newline at end of file diff --git a/utils.py b/utils.py index a468525..37eb4a7 100644 --- a/utils.py +++ b/utils.py @@ -1,16 +1,24 @@ import math import numpy as np -from stop_words import get_stop_words from nltk.tokenize import word_tokenize import textwrap from ngram import NGram + import argparse import os import json + from tqdm import tqdm +import geopandas as gpd + +from keras.layers import Embedding +from gensim.models import Word2Vec + +from joblib import Parallel,delayed +from shapely.geometry import Point,box class TokenizerCustom(): @@ -25,40 +33,113 @@ class TokenizerCustom(): return seqs -from keras.layers import Embedding -from gensim.models import Word2Vec + class NgramIndex(): + """ + Class used for encoding words in ngram representation + """ def __init__(self,n): - self.ngram_gen = NGram(N=n) + """ + Constructor + Parameters + ---------- + n : int + ngram size + """ + self.ngram_gen = NGram(N=n) + + self.size = n self.ngram_index = {} self.index_ngram = {} self.cpt = 0 self.max_len = 0 + def split_and_add(self,word): + """ + Split word in multiple ngram and add each one of them to the index + + Parameters + ---------- + word : str + a word + """ ngrams = word.lower().replace(" ","$") ngrams = list(self.ngram_gen.split(ngrams)) [self.add(ngram) for ngram in ngrams] def add(self,ngram): + """ + Add a ngram to the index + + Parameters + ---------- + ngram : str + ngram + """ if not ngram in self.ngram_index: self.cpt+=1 self.ngram_index[ngram]=self.cpt self.index_ngram[self.cpt]=ngram def encode(self,word): + """ + Return a ngram representation of a word + + Parameters + ---------- + word : str + a word + + Returns + ------- + list of int + list of ngram index + """ ngrams = word.lower().replace(" ","$") ngrams = list(self.ngram_gen.split(ngrams)) [self.add(ng) for ng in ngrams if not ng in self.ngram_index] return [self.ngram_index[ng] for ng in ngrams] def complete(self,ngram_encoding,MAX_LEN,filling_item=0): + """ + Complete a ngram encoded version of word with void ngram. It's necessary for neural network. + + Parameters + ---------- + ngram_encoding : list of int + first encoding of a word + MAX_LEN : int + desired length of the encoding + filling_item : int, optional + ngram index you wish to use, by default 0 + + Returns + ------- + list of int + list of ngram index + """ assert len(ngram_encoding) <= MAX_LEN diff = MAX_LEN - len(ngram_encoding) ngram_encoding.extend([filling_item]*diff) return ngram_encoding def get_embedding_layer(self,texts,dim=100,**kwargs): + """ + Return an embedding matrix for each ngram using encoded texts. Using gensim.Word2vec model. 
+ + Parameters + ---------- + texts : list of [list of int] + list of encoded word + dim : int, optional + embedding dimension, by default 100 + + Returns + ------- + np.array + embedding matrix + """ model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs) N = len(self.ngram_index) embedding_matrix = np.zeros((N,dim)) @@ -66,19 +147,116 @@ class NgramIndex(): embedding_matrix[i] = model.wv[str(i)] return embedding_matrix + def save(self,fn): + """ + + Save the NgramIndex + + Parameters + ---------- + fn : str + output filename + """ + data = { + "ngram_size": self.size, + "ngram_index": self.ngram_index, + "cpt_state": self.cpt, + "max_len_state": self.max_len + } + json.dump(data,open(fn,'w')) + + @staticmethod + def load(fn): + """ + + Load a NgramIndex state from a file. + + Parameters + ---------- + fn : str + input filename + + Returns + ------- + NgramIndex + ngram index + + Raises + ------ + KeyError + raised if a required field does not appear in the input file + """ + try: + data = json.load(open(fn)) + except json.JSONDecodeError: + print("Data file must be a JSON") + for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]: + if not key in data: + raise KeyError("{0} field cannot be found in given file".format(key)) + new_obj = NgramIndex(data["ngram_size"]) + new_obj.ngram_index = data["ngram_index"] + new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()} + new_obj.cpt = data["cpt_state"] + new_obj.max_len = data["max_len_state"] + return new_obj + + def zero_one_encoding(long,lat): + """ + Encode coordinates (WGS84) between 0 and 1 + + Parameters + ---------- + long : float + longitude value + lat : float + latitude value + + Returns + ------- + float,float + longitude, latitude + """ return ((long + 180.0 ) / 360.0), ((lat + 90.0 ) / 180.0) def _split(lst,n,complete_chunk_value): + """ + Split a list into chunk of n-size. + + Parameters + ---------- + lst : list + input list + n : int + chunk size + complete_chunk_value : object + if last chunk size not equal to n, this value is used to complete it + + Returns + ------- + list + chunked list + """ chunks = [lst[i:i + n] for i in range(0, len(lst), n)] if not chunks:return chunks if len(chunks[-1]) != n: chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1]))) return np.array(chunks) - - def generate_couple(object_list): + """ + Return a randomly selected couple from an object list. + + Parameters + ---------- + object_list : list + object list + + Returns + ------- + list + list of coupled object + """ couples = [] lst = np.arange(len(object_list)) for _ in range(len(object_list)): @@ -93,22 +271,73 @@ def generate_couple(object_list): return couples def _hash_couple(o1,o2): + """ + Return an hash for two object ids. + + Parameters + ---------- + o1 : str or int + id of the first objeeect + o2 : str of int + id of the second object + + Returns + ------- + str + hash + """ return "|".join(map(str,sorted([int(o1),int(o2)]))) ### GEO ADJAC BEGIN -from joblib import Parallel,delayed -from shapely.geometry import Point,box - class Cell(object): - def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y): + """ + A cell is box placed in geeographical space. 
+ """ + def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,x,y): + """ + Constructor + Parameters + ---------- + object : [type] + [description] + upperleft_x : float + upperleft longitude + upperleft_y : float + upperleft latitude + bottomright_x : float + bottom right longitude + bottomright_y : float + bottom right latitude + x : int + cell x coordinates in the grid + y : int + cell y coordinates in the grid + """ self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y self.box_ = box(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y) self.list_object={} # {id:Point(coord)} - + + self.x,self.y = x, y + def contains(self,lat,lon): + """ + Return true if the cell contains a point at given coordinates + + Parameters + ---------- + lat : float + latitude + lon : float + longitude + + Returns + ------- + bool + true if contains + """ x,y = lon,lat if x < self.upperleft_x or x > self.bottomright_x: return False @@ -117,13 +346,45 @@ class Cell(object): return True def add_object(self,id_,lat,lon): + """ + Connect an object to the cell + + Parameters + ---------- + id_ : int + id + lat : float + latitude + lon : float + longitude + """ self.list_object[id_] = Point(lon,lat) def __repr__(self): return "upperleft:{0}_{1}_;bottom_right:{2}_{3}".format(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y) class Grid(object): + """ + Define a grid + + """ def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,cell_sub_div_index=[100,50]): + """ + Constructor + + Parameters + ---------- + upperleft_x : float + upperleft longitude + upperleft_y : float + upperleft latitude + bottomright_x : float + bottom right longitude + bottomright_y : float + bottom right latitude + cell_sub_div_index : list, optional + number of division in both latitude and longitude axis (longitude first), by default [100,50] + """ self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y self.x_r = abs(self.bottomright_x - self.upperleft_x)/cell_sub_div_index[0] @@ -142,7 +403,7 @@ class Grid(object): self.upperleft_y+i*self.y_r, self.upperleft_x+((j+1)*self.x_r), self.upperleft_y+((i+1)*self.y_r), - ) + j,i) ) dec_y = 0 for i in range(cell_sub_div_index[1]): @@ -153,34 +414,45 @@ class Grid(object): self.upperleft_x+(j*self.x_r)-self.c_x_r, # TOP self.upperleft_y+(i*self.y_r)-dec_y, self.upperleft_x+((j+1)*self.x_r)-self.c_x_r,#(self.u_pos*self.c_x_r), - self.upperleft_y+((i+1)*self.y_r)+self.c_y_r#(self.u_neg*self.c_y_r), - ) + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, + j,i) ) self.inter_cells[-1].append(Cell( self.upperleft_x+(j*self.x_r)-self.c_x_r, # CENTER self.upperleft_y+(i*self.y_r)-self.c_y_r, self.upperleft_x+((j+1)*self.x_r)+self.c_x_r, self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, - ) + j,i) ) self.inter_cells[-1].append(Cell( self.upperleft_x+(j*self.x_r)+dec_x, # CENTER self.upperleft_y+(i*self.y_r)-self.c_y_r, self.upperleft_x+((j+1)*self.x_r)-self.c_x_r, #LEFT - self.upperleft_y+((i+1)*self.y_r)+self.c_y_r - ) + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, + j,i) ) dec_x = self.c_x_r dec_y = self.c_y_r - def fit_data(self,data): - data["nn"] = 1 - dissolved = data.dissolve(by="nn") + def fit_data(self,data = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))): + """ + + To avoid unnecessary check when connecting an entity to one or multiple cells, we + filter cells that does not 
appears in our geographic context (here countries surface). + + Parameters + ---------- + data : GeoDataFrame + geographic context + """ + world = data + world["nn"] = 1 + dissolved = world.dissolve(by="nn").iloc[0].geometry new_cells= [] new_inter_cells=[] for i in tqdm(range(len(self.cells))): for j in range(len(self.cells[i])): - if dissolved.intersects(self.cells[i][j].box_).all(): + if dissolved.intersects(self.cells[i][j].box_): new_cells.append(self.cells[i][j]) new_inter_cells.extend(self.inter_cells[i][j*3:(j+1)*3]) @@ -188,7 +460,15 @@ class Grid(object): self.inter_cells = new_inter_cells - def __add__(self,a): + def __add__(self,a): + """ + Add an object to the grid + + Parameters + ---------- + a : tuple + (id, latitude, longitude) + """ for c1 in range(len(self.cells)): if self.cells[c1].contains(a[1],a[2]): self.cells[c1].add_object(*a) @@ -199,6 +479,19 @@ class Grid(object): break def get_adjacent_relationships(self,random_iteration=10): + """ + Return a list of adjacent relationships founds in each cell. + + Parameters + ---------- + random_iteration : int, optional + number of iteration for random selection of adjacency relationships, by default 10 + + Returns + ------- + list + adjacency relationships + """ relationships = set([]) for c1 in tqdm(range(len(self.cells))): for i in range(random_iteration): @@ -214,12 +507,6 @@ class Grid(object): ### GEO ADJAC END - - -import argparse -import os -import json - class ConfigurationReader(object): def __init__(self,configuration_file): if not os.path.exists(configuration_file): @@ -273,10 +560,6 @@ class ConfigurationReader(object): if __name__ == "__main__": - q = Quadtree(-180,90,180,-90) - hash_ = q.encode((1.2,1.3)) - q.decode(hash_) - index = NgramIndex(3) index.split_and_add("J'aime le paté") -- GitLab
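Illustrative sketch (not part of the commit): the `utils.py` hunk above adds `save`/`load` methods so an `NgramIndex` state can be persisted between runs, as `combination_embeddings.py` now does when it writes `outputs/index_<n>gram_<file>`. A minimal round-trip, assuming `utils.py` from this commit is importable and using a hypothetical filename, could look like this:

```python
# Sketch only: persist and restore an NgramIndex (utils.py from this commit).
from utils import NgramIndex

index = NgramIndex(2)                                 # bigram index, the default --ngram-size
index.split_and_add("Paris")                          # register the toponym's ngrams
encoded = index.complete(index.encode("Paris"), 10)   # pad the encoding to a fixed length
index.max_len = 10                                    # stored in the dump, as in combination_embeddings.py

index.save("index_2gram_example.json")                # hypothetical output filename
restored = NgramIndex.load("index_2gram_example.json")

assert restored.encode("Paris") == index.encode("Paris")
print(encoded)
```

The dump is plain JSON holding `ngram_size`, `ngram_index`, `cpt_state` and `max_len_state`, so it can also be inspected by hand.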