Skip to content
Snippets Groups Projects
Commit 640cf7b1 authored by Fize Jacques's avatar Fize Jacques
Browse files

Add adjacency relationships to the process + fastest adjacency computation using a grid

parent fb3e6b21
No related branches found
No related tags found
No related merge requests found
......@@ -2,10 +2,12 @@
import os
import sys
from argparse import ArgumentParser
import json
# Structure
import pandas as pd
import numpy as np
import geopandas as gpd
# DEEPL module
from keras.preprocessing.text import Tokenizer
......@@ -25,7 +27,8 @@ from shapely.geometry import Point
# Custom module
from helpers import read_geonames
from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex,ConfigurationReader
from utils import Grid
from utils import zero_one_encoding, NgramIndex,ConfigurationReader
# Visualisation module
......@@ -47,10 +50,7 @@ logging.basicConfig(
)
chrono = Chronometer()
args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()
args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args("-i -a -n 2 -t 0.002 -e 5 -m CNN data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
GEONAME_FN = args.geoname_input
GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
......@@ -74,12 +74,35 @@ logging.info("Geonames data loaded!")
# SELECT ENTRY with class == to A and P (Areas and Populated Places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
# RETRIEVE INCLUSION RELATIONSHIPS
logging.info("Retrieve inclusion relationships ! ")
geoname2name = dict(filtered["geonameid name".split()].values)
filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name))
inclusion_dict = dict(hierarchy_data[filter_mask]["childId parentId".split()].values)
logging.info("{0} inclusion relationships retrieved ! ".format(len(inclusion_dict)))
# RETRIEVE ADJACENCY
filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
filtered = gpd.GeoDataFrame(filtered)
filtered["i"]=1
bounds = filtered.dissolve("i").bounds.values[0]
rel_dict ={}
if args.adjacency:
    # RETRIEVE ADJACENCY RELATIONSHIPS, cached on disk as JSON next to the run.
    fn = "{0}_adjacency.json".format(GEONAME_FN.split("/")[-1])
    if not os.path.exists(fn):
        # Build a coarse grid over the data bounds and register every place in it.
        g = Grid(*bounds, [360, 180])
        g.fit_data(filtered)
        # Grid.__add__ expects (id, lat, lon); used as a statement since it returns None.
        for ix, row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(), total=len(filtered)):
            g + (int(row.geonameid), row.latitude, row.longitude)
        rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships()]))
        with open(fn, 'w') as cache:
            json.dump(rel_dict, cache)
    else:
        # BUGFIX: the cache was opened with mode 'w', which truncates the file and
        # makes json.load() fail on an empty stream — it must be opened for reading.
        # NOTE(review): json.load returns *string* keys while the freshly computed
        # dict uses ints — confirm downstream lookups handle both.
        with open(fn) as cache:
            rel_dict.update(json.load(cache))
if args.inclusion:
    # RETRIEVE INCLUSION RELATIONSHIPS
    logging.info("Retrieve inclusion relationships ! ")
    # Only keep parent/child pairs whose both endpoints survived the A/P filtering.
    geoname2name = dict(filtered[["geonameid", "name"]].values)
    filter_mask = (hierarchy_data.childId.isin(geoname2name)
                   & hierarchy_data.parentId.isin(geoname2name))
    kept_pairs = hierarchy_data[filter_mask]
    rel_dict.update(dict(kept_pairs[["childId", "parentId"]].values))
    logging.info("{0} inclusion relationships retrieved ! ".format(len(kept_pairs)))
# ENCODING NAME USING N-GRAM SPLITTING
logging.info("Encoding toponyms to ngram...")
......@@ -113,12 +136,12 @@ logging.info("Preparing Input and Output data...")
X_1,X_2,y_lat,y_lon=[],[],[],[]
X_3 = []
for geonameId_1,geonameId_2 in inclusion_dict.items():
if not geonameId_2 in inclusion_dict:
for geonameId_1,geonameId_2 in rel_dict.items():
if not geonameId_2 in rel_dict:
continue
geonameId_3 = inclusion_dict[geonameId_2]
top3 = geoname2encodedname[geonameId_3]
X_3.append(top3)
geonameId_3 = rel_dict[geonameId_2]
# top3 = geoname2encodedname[geonameId_3]
# X_3.append(top3)
top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
X_1.append(top1)
......
......@@ -4,9 +4,12 @@
{ "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." },
{ "short": "geoname_hierachy_input", "help": "Filepath of the Geonames file you want to use." },
{ "short": "-v", "long": "--verbose", "action": "store_true" },
{ "short": "-i", "long": "--inclusion", "action": "store_true" },
{ "short": "-a", "long": "--adjacency", "action": "store_true" },
{ "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
{ "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
{ "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
{ "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
{ "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" }
]
}
\ No newline at end of file
......@@ -10,7 +10,7 @@ from ngram import NGram
import argparse
import os
import json
from tqdm import tqdm
class TokenizerCustom():
......@@ -24,128 +24,6 @@ class TokenizerCustom():
seqs.append([self.word_index[word] for word in word_tokenize(text) if word in self.word_index])
return seqs
class CoordinatesEncoder:
    """
    Deprecated !

    Discretizes the (lat, lon) space into a regular grid and encodes a
    coordinate as its (lat_cell, lon_cell) index pair, a pair of one-hot
    vectors, or a single flattened one-hot vector.
    """

    def __init__(self, cell_size_lat=0.5, cell_size_lon=0.5):
        """
        Parameters
        ----------
        cell_size_lat : float
            Cell height in degrees of latitude.
        cell_size_lon : float
            Cell width in degrees of longitude.
        """
        self.min_lon = -180
        self.max_lon = -(self.min_lon)  # Symetric
        self.min_lat = -90
        self.max_lat = -(self.min_lat)  # Symetric

        self.ecart_lat = self.max_lat - self.min_lat
        self.ecart_lon = self.max_lon - self.min_lon

        self.cell_size_lat = cell_size_lat
        self.cell_size_lon = cell_size_lon

        self.unit_size_lat = self.ecart_lat / self.cell_size_lat
        self.unit_size_lon = self.ecart_lon / self.cell_size_lon

    def encode(self, lat, lon):
        """Return the (lat_cell_index, lon_cell_index) pair for a coordinate."""
        # BUGFIX: clamp to the last cell so the exact boundary (lat=90 or
        # lon=180) no longer produces an out-of-range index in vector()/vector_flatten().
        lat_idx = min(
            math.floor(((lat + self.max_lat) / self.ecart_lat) * self.unit_size_lat),
            self.number_lat_cell() - 1,
        )
        lon_idx = min(
            math.floor(((lon + self.max_lon) / self.ecart_lon) * self.unit_size_lon),
            self.number_lon_cell() - 1,
        )
        return (lat_idx, lon_idx)

    def number_lat_cell(self):
        """Number of cells along the latitude axis."""
        return int(self.unit_size_lat)

    def number_lon_cell(self):
        """Number of cells along the longitude axis."""
        return int(self.unit_size_lon)

    def oneDimensionOutputSize(self):
        """Size of the flattened one-hot encoding (lat cells * lon cells)."""
        return self.number_lat_cell() * self.number_lon_cell()

    def vector(self, lat, lon):
        """Return a pair of one-hot vectors (latitude one-hot, longitude one-hot)."""
        lat_v, lon_v = np.zeros(self.number_lat_cell()), np.zeros(self.number_lon_cell())
        new_coords = self.encode(lat, lon)
        lat_v[int(new_coords[0])] = 1
        lon_v[int(new_coords[1])] = 1
        return lat_v, lon_v

    def vector_flatten(self, lat, lon):
        """Return a single flattened one-hot vector for the coordinate's cell."""
        vec = np.zeros(self.oneDimensionOutputSize())  # 2D Dense softmax isn't possible
        lat_idx, lon_idx = self.encode(lat, lon)
        # BUGFIX: the row stride must be the number of LON cells (row-major
        # layout). The previous code used the lat-cell count, which made
        # distinct coordinates collide whenever lon_idx exceeded it.
        pos = self.number_lon_cell() * lat_idx + lon_idx
        vec[pos] = 1
        return vec
class Quadtree(object):
    """Recursive 4-way spatial partition of a bounding box.

    Each node splits its box into four equal quadrants down to `precision`
    levels. A position is encoded as the concatenation of 2-bit quadrant
    indices along the path from the root to the leaf that contains it.
    """

    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y, precision=10, curr_prec=0):
        # Bounding box of this node.
        # NOTE(review): given the +/- signs used for the children, y appears to
        # DECREASE from upperleft_y to bottomright_y (screen-style axis) — confirm.
        self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y = upperleft_x, upperleft_y, bottomright_x, bottomright_y
        self.precision = precision
        # Half-extents used to derive the four child quadrants.
        x_r = abs(self.bottomright_x - self.upperleft_x) / 2
        y_r = abs(self.upperleft_y - self.bottomright_y) / 2
        if curr_prec == precision:
            # Leaf node: no further subdivision.
            self.value = ""
        else:
            # Children in fixed order 0..3 (the index is what encode() emits):
            # 0 = top-left, 1 = top-right, 2 = bottom-left, 3 = bottom-right.
            self.value = [
                Quadtree(upperleft_x,
                         upperleft_y,
                         bottomright_x - x_r,
                         bottomright_y + y_r,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         ),
                Quadtree(upperleft_x + x_r,
                         upperleft_y,
                         bottomright_x,
                         bottomright_y + y_r,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         ),
                Quadtree(upperleft_x,
                         upperleft_y - y_r,
                         bottomright_x - x_r,
                         bottomright_y,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         ),
                Quadtree(upperleft_x + x_r,
                         upperleft_y - y_r,
                         bottomright_x,
                         bottomright_y,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         )
            ]

    def contains_obj(self, pos):
        """Return True if pos = (x, y) lies inside this node's box (bounds inclusive)."""
        x, y = pos[0], pos[1]
        if x < self.upperleft_x or x > self.bottomright_x:
            return False
        if y > self.upperleft_y or y < self.bottomright_y:
            return False
        return True

    def binary(self, integer):
        """2-bit, zero-padded binary string for a quadrant index in 0..3."""
        ch = "{0:b}".format(integer)
        return "0" * (2 - len(ch)) + ch

    def encode(self, pos):
        """Encode pos as a bit-string hash (2 bits per tree level).

        NOTE(review): returns None (not "") when no child contains pos, e.g.
        for a point outside the root box — confirm callers cope with that.
        """
        if not isinstance(self.value, list):
            return ""
        for ix, q in enumerate(self.value):
            if q.contains_obj(pos):
                return self.binary(ix) + q.encode(pos)

    def int_encode(self, pos):
        """Same hash as encode(), returned as a list of single-digit ints."""
        return list(map(int, textwrap.wrap(self.encode(pos), 1)))

    def decode(self, hash_):
        """Follow a bit-string hash back down the tree; return the leaf's box.

        Raises ValueError if the hash length is not a multiple of 2.
        """
        if not len(hash_) % 2 == 0:
            raise ValueError("Wrong Hash ! ")
        # HACK: eval of a binary literal; int(hash_[:2], 2) would be safer —
        # only ever call this on trusted, internally generated hashes.
        q_pos = eval("0b" + hash_[:2])
        q = self.value[q_pos]
        if len(hash_) == 2:
            return q.upperleft_x, q.upperleft_y, q.bottomright_x, q.bottomright_y
        return q.decode(hash_[2:])
from keras.layers import Embedding
from gensim.models import Word2Vec
......@@ -199,6 +77,145 @@ def _split(lst,n,complete_chunk_value):
return np.array(chunks)
def generate_couple(object_list):
    """Randomly pair up the elements of object_list.

    Each element serves as the first member of at most one couple; the second
    member is drawn uniformly from the elements not yet consumed. Produces
    len(object_list) - 1 couples when the list has two or more items, and an
    empty list otherwise.
    """
    pairs = []
    pool = np.arange(len(object_list))
    while len(pool) > 1:
        first = np.random.choice(np.arange(len(pool)))
        second = np.random.choice(np.arange(len(pool)))
        # Re-draw until the two picks differ.
        while second == first:
            second = np.random.choice(np.arange(len(pool)))
        pairs.append([object_list[pool[first]], object_list[pool[second]]])
        # The first pick is consumed and never reused as a first member.
        pool = np.delete(pool, first)
    return pairs
def _hash_couple(o1,o2):
return "|".join(map(str,sorted([int(o1),int(o2)])))
### GEO ADJAC BEGIN
from joblib import Parallel,delayed
from shapely.geometry import Point,box
class Cell(object):
    """Axis-aligned rectangular cell of a Grid; stores the points it contains."""

    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y):
        self.upperleft_x = upperleft_x
        self.upperleft_y = upperleft_y
        self.bottomright_x = bottomright_x
        self.bottomright_y = bottomright_y
        # Shapely polygon of the cell, used for intersection tests in Grid.fit_data.
        self.box_ = box(self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y)
        self.list_object = {}  # {id:Point(coord)}

    def contains(self, lat, lon):
        """Return True when (lat, lon) falls inside this cell (bounds inclusive)."""
        x, y = lon, lat
        inside_x = self.upperleft_x <= x <= self.bottomright_x
        inside_y = self.upperleft_y <= y <= self.bottomright_y
        return inside_x and inside_y

    def add_object(self, id_, lat, lon):
        """Register object `id_` at (lat, lon) inside this cell."""
        self.list_object[id_] = Point(lon, lat)

    def __repr__(self):
        return "upperleft:{0}_{1}_;bottom_right:{2}_{3}".format(self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y)
class Grid(object):
    """Regular grid over a bounding box used to find spatially adjacent objects.

    Two layers of cells are built: `cells` (the base partition) and
    `inter_cells` (overlapping, slightly shifted cells that catch pairs of
    objects sitting on either side of a base-cell border). Objects inserted
    via `__add__` are later paired per-cell by `get_adjacent_relationships`.
    """

    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y, cell_sub_div_index=[100,50]):
        # NOTE(review): mutable default argument; harmless here because it is
        # only read, but a tuple default would be safer.
        self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y = upperleft_x, upperleft_y, bottomright_x, bottomright_y
        # Base cell extents: [0] subdivisions along x, [1] along y.
        self.x_r = abs(self.bottomright_x - self.upperleft_x) / cell_sub_div_index[0]
        self.y_r = abs(self.upperleft_y - self.bottomright_y) / cell_sub_div_index[1]
        self.c_x_r = self.x_r / cell_sub_div_index[0]  # Redivide
        self.c_y_r = self.y_r / cell_sub_div_index[1]
        self.cells = []
        self.inter_cells = []
        # Base partition: cells[i][j] covers row i, column j, laid out from the
        # upper-left corner with positive x_r / y_r offsets.
        for i in range(cell_sub_div_index[1]):
            self.cells.append([])
            for j in range(cell_sub_div_index[0]):
                self.cells[-1].append(Cell(
                    self.upperleft_x + j * self.x_r,
                    self.upperleft_y + i * self.y_r,
                    self.upperleft_x + ((j + 1) * self.x_r),
                    self.upperleft_y + ((i + 1) * self.y_r),
                )
                )
        # Overlap layer: 3 interstitial cells per base cell (shifted left,
        # enlarged center, shifted right), indexed as inter_cells[i][j*3:(j+1)*3]
        # — that slice shape is relied upon by fit_data().
        dec_y = 0
        for i in range(cell_sub_div_index[1]):
            self.inter_cells.append([])
            dec_x = 0
            for j in range(cell_sub_div_index[0]):
                self.inter_cells[-1].append(Cell(
                    self.upperleft_x + (j * self.x_r) - self.c_x_r,  # TOP
                    self.upperleft_y + (i * self.y_r) - dec_y,
                    self.upperleft_x + ((j + 1) * self.x_r) - self.c_x_r,  # (self.u_pos*self.c_x_r),
                    self.upperleft_y + ((i + 1) * self.y_r) + self.c_y_r  # (self.u_neg*self.c_y_r),
                )
                )
                self.inter_cells[-1].append(Cell(
                    self.upperleft_x + (j * self.x_r) - self.c_x_r,  # CENTER
                    self.upperleft_y + (i * self.y_r) - self.c_y_r,
                    self.upperleft_x + ((j + 1) * self.x_r) + self.c_x_r,
                    self.upperleft_y + ((i + 1) * self.y_r) + self.c_y_r,
                )
                )
                self.inter_cells[-1].append(Cell(
                    self.upperleft_x + (j * self.x_r) + dec_x,  # CENTER
                    self.upperleft_y + (i * self.y_r) - self.c_y_r,
                    self.upperleft_x + ((j + 1) * self.x_r) - self.c_x_r,  # LEFT
                    self.upperleft_y + ((i + 1) * self.y_r) + self.c_y_r
                )
                )
                # First column/row get no shift; subsequent ones are offset by
                # one sub-cell so the overlap straddles the base-cell border.
                dec_x = self.c_x_r
            dec_y = self.c_y_r

    def fit_data(self, data):
        """Prune cells that do not intersect the union of `data`'s geometries.

        `data` is a GeoDataFrame; as a side effect a constant "nn" column is
        added to it (dissolve key). After this call `cells` and `inter_cells`
        are FLAT lists (no longer nested per row).
        """
        data["nn"] = 1  # single dissolve key -> union of all geometries
        dissolved = data.dissolve(by="nn")
        new_cells = []
        new_inter_cells = []
        for i in tqdm(range(len(self.cells))):
            for j in range(len(self.cells[i])):
                if dissolved.intersects(self.cells[i][j].box_).all():
                    new_cells.append(self.cells[i][j])
                    # Keep the 3 interstitial cells built for this (i, j).
                    new_inter_cells.extend(self.inter_cells[i][j * 3:(j + 1) * 3])
        self.cells = new_cells
        self.inter_cells = new_inter_cells

    def __add__(self, a):
        """Insert object a = (id, lat, lon) into the grid.

        Must be called AFTER fit_data() (expects flat cell lists). The object
        is added to every base cell containing it (base cells are disjoint, so
        at most one) and to the first matching interstitial cell only.
        NOTE: returns None, so `g + (...)` is only usable as a statement.
        """
        for c1 in range(len(self.cells)):
            if self.cells[c1].contains(a[1], a[2]):
                self.cells[c1].add_object(*a)
        for c1 in range(len(self.inter_cells)):
            if self.inter_cells[c1].contains(a[1], a[2]):
                self.inter_cells[c1].add_object(*a)
                break

    def get_adjacent_relationships(self, random_iteration=10):
        """Return a set of "id1|id2" hashes of objects sharing a cell.

        For each (base and interstitial) cell, `random_iteration` rounds of
        random pairing are sampled among the cell's objects; pairs are
        deduplicated via the order-independent _hash_couple key.
        """
        relationships = set([])
        for c1 in tqdm(range(len(self.cells))):
            for i in range(random_iteration):
                for t in generate_couple(list(self.cells[c1].list_object.keys())):
                    relationships.add(_hash_couple(t[0], t[1]))
        for c1 in tqdm(range(len(self.inter_cells))):
            for i in range(random_iteration):
                for t in generate_couple(list(self.inter_cells[c1].list_object.keys())):
                    relationships.add(_hash_couple(t[0], t[1]))
        return relationships
### GEO ADJAC END
import argparse
import os
import json
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment