From 640cf7b152b6bd9ac124a1b7840850fbf86c9655 Mon Sep 17 00:00:00 2001 From: Fize Jacques <jacques.fize@cirad.fr> Date: Tue, 21 Jan 2020 16:51:32 +0100 Subject: [PATCH] Add adj rel to the process + fastest adjacency computation using a grid --- combination_embeddings.py | 55 ++-- .../toponym_combination_embedding.json | 3 + utils.py | 263 ++++++++++-------- 3 files changed, 182 insertions(+), 139 deletions(-) diff --git a/combination_embeddings.py b/combination_embeddings.py index ef5bdc0..10b53d4 100644 --- a/combination_embeddings.py +++ b/combination_embeddings.py @@ -2,10 +2,12 @@ import os import sys from argparse import ArgumentParser +import json # Structure import pandas as pd import numpy as np +import geopandas as gpd # DEEPL module from keras.preprocessing.text import Tokenizer @@ -25,7 +27,8 @@ from shapely.geometry import Point # Custom module from helpers import read_geonames -from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex,ConfigurationReader +from utils import Grid +from utils import zero_one_encoding, NgramIndex,ConfigurationReader # Visualisation module @@ -47,10 +50,7 @@ logging.basicConfig( ) chrono = Chronometer() -args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args() - - - +args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args(sys.argv[1:] or "-i -a -n 2 -t 0.002 -e 5 -m CNN data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split()) GEONAME_FN = args.geoname_input GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input @@ -74,12 +74,35 @@ logging.info("Geonames data loaded!") # SELECT ENTRY with class == to A and P (Areas and Populated Places) filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places -# RETRIEVE INCLUSION RELATIONSHIPS -logging.info("Retrieve inclusion relationships ! 
") -geoname2name = dict(filtered["geonameid name".split()].values) -filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name)) -inclusion_dict = dict(hierarchy_data[filter_mask]["childId parentId".split()].values) -logging.info("{0} inclusion relationships retrieved ! ".format(len(inclusion_dict))) + +# RETRIEVE ADJACENCY + +filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) +filtered = gpd.GeoDataFrame(filtered) +filtered["i"]=1 +bounds = filtered.dissolve("i").bounds.values[0] + +rel_dict ={} + +if args.adjacency: + fn = "{0}_adjacency.json".format(GEONAME_FN.split("/")[-1]) + if not os.path.exists(fn): + g = Grid(*bounds,[360,180]) + g.fit_data(filtered) + [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))] + rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships()])) + with open(fn,'w') as cache_fh: json.dump(rel_dict,cache_fh) # context manager so the cache is flushed and closed + else: + with open(fn) as cache_fh: rel_dict.update({int(k):int(v) for k,v in json.load(cache_fh).items()}) # open for reading ('w' truncated the cache); JSON object keys are str, so restore the int ids + +if args.inclusion: + # RETRIEVE INCLUSION RELATIONSHIPS + logging.info("Retrieve inclusion relationships ! ") + geoname2name = dict(filtered["geonameid name".split()].values) + filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name)) + rel_dict.update(dict(hierarchy_data[filter_mask]["childId parentId".split()].values)) + logging.info("{0} inclusion relationships retrieved ! 
".format(len(hierarchy_data[filter_mask]))) + # ENCODING NAME USING N-GRAM SPLITTING logging.info("Encoding toponyms to ngram...") @@ -113,12 +136,12 @@ logging.info("Preparing Input and Output data...") X_1,X_2,y_lat,y_lon=[],[],[],[] X_3 = [] -for geonameId_1,geonameId_2 in inclusion_dict.items(): - if not geonameId_2 in inclusion_dict: +for geonameId_1,geonameId_2 in rel_dict.items(): + if not geonameId_2 in rel_dict: continue - geonameId_3 = inclusion_dict[geonameId_2] - top3 = geoname2encodedname[geonameId_3] - X_3.append(top3) + geonameId_3 = rel_dict[geonameId_2] + # top3 = geoname2encodedname[geonameId_3] + # X_3.append(top3) top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] X_1.append(top1) diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json index bd8fb0f..7a39df6 100644 --- a/parser_config/toponym_combination_embedding.json +++ b/parser_config/toponym_combination_embedding.json @@ -4,9 +4,12 @@ { "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." }, { "short": "geoname_hierachy_input", "help": "Filepath of the Geonames file you want to use." 
}, { "short": "-v", "long": "--verbose", "action": "store_true" }, + { "short": "-i", "long": "--inclusion", "action": "store_true" }, + { "short": "-a", "long": "--adjacency", "action": "store_true" }, { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 }, { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, + { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, { "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" } ] } \ No newline at end of file diff --git a/utils.py b/utils.py index bf77273..1827b58 100644 --- a/utils.py +++ b/utils.py @@ -10,7 +10,7 @@ from ngram import NGram import argparse import os import json - +from tqdm import tqdm class TokenizerCustom(): @@ -24,128 +24,6 @@ class TokenizerCustom(): seqs.append([self.word_index[word] for word in word_tokenize(text) if word in self.word_index]) return seqs -class CoordinatesEncoder: - """ - Deprecated ! 
- - """ - def __init__(self,cell_size_lat=0.5,cell_size_lon=0.5): - self.min_lon = -180 - self.max_lon = -(self.min_lon) #Â Symetric - self.min_lat = -90 - self.max_lat = -(self.min_lat) # Symetric - - self.ecart_lat = self.max_lat-self.min_lat - self.ecart_lon = self.max_lon-self.min_lon - - self.cell_size_lat = cell_size_lat - self.cell_size_lon = cell_size_lon - - self.unit_size_lat = self.ecart_lat/self.cell_size_lat - self.unit_size_lon = self.ecart_lon/self.cell_size_lon - - def encode(self,lat,lon): - return ( - math.floor(((lat+self.max_lat)/self.ecart_lat)*self.unit_size_lat), - math.floor(((lon+self.max_lon)/self.ecart_lon)*(self.unit_size_lon)) - ) - - def number_lat_cell(self): - return int(self.unit_size_lat) - - def number_lon_cell(self): - return int(self.unit_size_lon) - - def oneDimensionOutputSize(self): - return self.number_lat_cell()*self.number_lon_cell() - - def vector(self,lat,lon): - lat_v,lon_v=np.zeros(self.number_lat_cell()),np.zeros(self.number_lon_cell()) - new_coords = self.encode(lat,lon) - lat_v[int(new_coords[0])] = 1 - lon_v[int(new_coords[1])] = 1 - return lat_v,lon_v - def vector_flatten(self,lat,lon): - vec = np.zeros(self.oneDimensionOutputSize()) # 2D Dense softmax isn't possible - new_coords = self.encode(lat,lon) - pos = self.number_lat_cell()*(new_coords[0])+new_coords[1] - vec[pos] = 1 #lon * lon size - return vec - - -class Quadtree(object): - def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,precision=10,curr_prec=0): - self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y - - self.precision = precision - x_r = abs(self.bottomright_x - self.upperleft_x)/2 - y_r = abs(self.upperleft_y - self.bottomright_y )/2 - - # if abs(self.bottomright_x - self.upperleft_x) <= cell_size[0] or abs(self.upperleft_y - self.bottomright_y) <=cell_size[1]: - if curr_prec == precision: - self.value = "" - else: - #print(ix,x_r,y_r)#print(x_r,y_r) - 
self.value = [ - Quadtree(upperleft_x, - upperleft_y, - bottomright_x-x_r, - bottomright_y+y_r, - precision=self.precision, - curr_prec=curr_prec+1 - ), - Quadtree(upperleft_x+x_r, - upperleft_y, - bottomright_x, - bottomright_y+y_r, - precision=self.precision, - curr_prec=curr_prec+1 - ), - Quadtree(upperleft_x, - upperleft_y-y_r, - bottomright_x-x_r, - bottomright_y, - precision=self.precision, - curr_prec=curr_prec+1 - ), - Quadtree(upperleft_x+x_r, - upperleft_y-y_r, - bottomright_x, - bottomright_y, - precision=self.precision, - curr_prec=curr_prec+1 - ) - ] - def contains_obj(self,pos): - x,y = pos[0],pos[1] - if x < self.upperleft_x or x > self.bottomright_x: - return False - if y >self.upperleft_y or y < self.bottomright_y: - return False - return True - - def binary(self,integer): - ch = "{0:b}".format(integer) - return "0"*(2-len(ch))+ch - - def encode(self,pos): - if not isinstance(self.value,list): - return "" - for ix,q in enumerate(self.value): - if q.contains_obj(pos): - return self.binary(ix)+q.encode(pos) - - def int_encode(self,pos): - return list(map(int,textwrap.wrap(self.encode(pos),1))) - - def decode(self,hash_): - if not len(hash_)%2 ==0: - raise ValueError("Wrong Hash ! 
") - q_pos = eval("0b"+hash_[:2]) - q = self.value[q_pos] - if len(hash_) == 2: - return q.upperleft_x,q.upperleft_y,q.bottomright_x,q.bottomright_y - return q.decode(hash_[2:]) from keras.layers import Embedding from gensim.models import Word2Vec @@ -199,6 +77,145 @@ def _split(lst,n,complete_chunk_value): return np.array(chunks) + +def generate_couple(object_list): + couples = [] + lst = np.arange(len(object_list)) + for _ in range(len(object_list)): + if len(lst) == 1: + break + idx = np.random.choice(np.arange(len(lst))) + idx2 = np.random.choice(np.arange(len(lst))) + while idx2 == idx: + idx2 = np.random.choice(np.arange(len(lst))) + couples.append([object_list[lst[idx]],object_list[lst[idx2]]]) + lst = np.delete(lst,idx) + return couples + +def _hash_couple(o1,o2): + return "|".join(map(str,sorted([int(o1),int(o2)]))) + + + +### GEO ADJAC BEGIN +from joblib import Parallel,delayed +from shapely.geometry import Point,box + +class Cell(object): + def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y): + + self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y + self.box_ = box(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y) + self.list_object={} # {id:Point(coord)} + + def contains(self,lat,lon): + x,y = lon,lat + if x < self.upperleft_x or x > self.bottomright_x: + return False + if y < self.upperleft_y or y > self.bottomright_y: + return False + return True + + def add_object(self,id_,lat,lon): + self.list_object[id_] = Point(lon,lat) + + def __repr__(self): + return "upperleft:{0}_{1}_;bottom_right:{2}_{3}".format(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y) + +class Grid(object): + def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,cell_sub_div_index=[100,50]): + self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y + + self.x_r 
= abs(self.bottomright_x - self.upperleft_x)/cell_sub_div_index[0] + self.y_r = abs(self.upperleft_y - self.bottomright_y )/cell_sub_div_index[1] + + self.c_x_r = self.x_r/cell_sub_div_index[0] # Redivide + self.c_y_r = self.y_r/cell_sub_div_index[1] + + self.cells = [] + self.inter_cells = [] + for i in range(cell_sub_div_index[1]): + self.cells.append([]) + for j in range(cell_sub_div_index[0]): + self.cells[-1].append(Cell( + self.upperleft_x+j*self.x_r, + self.upperleft_y+i*self.y_r, + self.upperleft_x+((j+1)*self.x_r), + self.upperleft_y+((i+1)*self.y_r), + ) + ) + dec_y = 0 + for i in range(cell_sub_div_index[1]): + self.inter_cells.append([]) + dec_x = 0 + for j in range(cell_sub_div_index[0]): + self.inter_cells[-1].append(Cell( + self.upperleft_x+(j*self.x_r)-self.c_x_r, # TOP + self.upperleft_y+(i*self.y_r)-dec_y, + self.upperleft_x+((j+1)*self.x_r)-self.c_x_r,#(self.u_pos*self.c_x_r), + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r#(self.u_neg*self.c_y_r), + ) + ) + self.inter_cells[-1].append(Cell( + self.upperleft_x+(j*self.x_r)-self.c_x_r, # CENTER + self.upperleft_y+(i*self.y_r)-self.c_y_r, + self.upperleft_x+((j+1)*self.x_r)+self.c_x_r, + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, + ) + ) + self.inter_cells[-1].append(Cell( + self.upperleft_x+(j*self.x_r)+dec_x, # CENTER + self.upperleft_y+(i*self.y_r)-self.c_y_r, + self.upperleft_x+((j+1)*self.x_r)-self.c_x_r, #LEFT + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r + ) + ) + dec_x = self.c_x_r + dec_y = self.c_y_r + + def fit_data(self,data): + data["nn"] = 1 + dissolved = data.dissolve(by="nn") + new_cells= [] + new_inter_cells=[] + for i in tqdm(range(len(self.cells))): + for j in range(len(self.cells[i])): + if dissolved.intersects(self.cells[i][j].box_).all(): + new_cells.append(self.cells[i][j]) + new_inter_cells.extend(self.inter_cells[i][j*3:(j+1)*3]) + + self.cells=new_cells + self.inter_cells = new_inter_cells + + + def __add__(self,a): + for c1 in range(len(self.cells)): + if 
self.cells[c1].contains(a[1],a[2]): + self.cells[c1].add_object(*a) + + for c1 in range(len(self.inter_cells)): + if self.inter_cells[c1].contains(a[1],a[2]): + self.inter_cells[c1].add_object(*a) + break + + def get_adjacent_relationships(self,random_iteration=10): + relationships = set([]) + for c1 in tqdm(range(len(self.cells))): + for i in range(random_iteration): + for t in generate_couple(list(self.cells[c1].list_object.keys())): + relationships.add(_hash_couple(t[0],t[1])) + + for c1 in tqdm(range(len(self.inter_cells))): + for i in range(random_iteration): + for t in generate_couple(list(self.inter_cells[c1].list_object.keys())): + relationships.add(_hash_couple(t[0],t[1])) + return relationships + + +### GEO ADJAC END + + + import argparse import os import json -- GitLab