Skip to content
Snippets Groups Projects
Commit 640cf7b1 authored by Fize Jacques's avatar Fize Jacques
Browse files

Add adjacency relationships to the process + fastest adjacency computation using a grid

parent fb3e6b21
No related branches found
No related tags found
No related merge requests found
......@@ -2,10 +2,12 @@
import os
import sys
from argparse import ArgumentParser
import json
# Structure
import pandas as pd
import numpy as np
import geopandas as gpd
# DEEPL module
from keras.preprocessing.text import Tokenizer
......@@ -25,7 +27,8 @@ from shapely.geometry import Point
# Custom module
from helpers import read_geonames
from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex,ConfigurationReader
from utils import Grid
from utils import zero_one_encoding, NgramIndex,ConfigurationReader
# Visualisation module
......@@ -47,10 +50,7 @@ logging.basicConfig(
)
chrono = Chronometer()
args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()
args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args("-i -a -n 2 -t 0.002 -e 5 -m CNN data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
GEONAME_FN = args.geoname_input
GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
......@@ -74,12 +74,35 @@ logging.info("Geonames data loaded!")
# SELECT ENTRY with class == to A and P (Areas and Populated Places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
# RETRIEVE INCLUSION RELATIONSHIPS
logging.info("Retrieve inclusion relationships ! ")
geoname2name = dict(filtered["geonameid name".split()].values)
filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name))
inclusion_dict = dict(hierarchy_data[filter_mask]["childId parentId".split()].values)
logging.info("{0} inclusion relationships retrieved ! ".format(len(inclusion_dict)))
# RETRIEVE ADJACENCY
filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
filtered = gpd.GeoDataFrame(filtered)
filtered["i"]=1
bounds = filtered.dissolve("i").bounds.values[0]
rel_dict ={}
if args.adjacency:
    # RETRIEVE ADJACENCY RELATIONSHIPS, cached on disk as JSON next to the run.
    fn = "{0}_adjacency.json".format(GEONAME_FN.split("/")[-1])
    if not os.path.exists(fn):
        # Build a coarse grid over the data bounds and register every place in it.
        g = Grid(*bounds, [360, 180])
        g.fit_data(filtered)
        # Grid.__add__ expects (id, lat, lon); used as a statement since it returns None.
        for ix, row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(), total=len(filtered)):
            g + (int(row.geonameid), row.latitude, row.longitude)
        rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships()]))
        with open(fn, 'w') as cache:
            json.dump(rel_dict, cache)
    else:
        # BUGFIX: the cache was opened with mode 'w', which truncates the file and
        # makes json.load() fail on an empty stream — it must be opened for reading.
        # NOTE(review): json.load returns *string* keys while the freshly computed
        # dict uses ints — confirm downstream lookups handle both.
        with open(fn) as cache:
            rel_dict.update(json.load(cache))
if args.inclusion:
    # RETRIEVE INCLUSION RELATIONSHIPS
    logging.info("Retrieve inclusion relationships ! ")
    # Only keep parent/child pairs whose both endpoints survived the A/P filtering.
    geoname2name = dict(filtered[["geonameid", "name"]].values)
    filter_mask = (hierarchy_data.childId.isin(geoname2name)
                   & hierarchy_data.parentId.isin(geoname2name))
    kept_pairs = hierarchy_data[filter_mask]
    rel_dict.update(dict(kept_pairs[["childId", "parentId"]].values))
    logging.info("{0} inclusion relationships retrieved ! ".format(len(kept_pairs)))
# ENCODING NAME USING N-GRAM SPLITTING
logging.info("Encoding toponyms to ngram...")
......@@ -113,12 +136,12 @@ logging.info("Preparing Input and Output data...")
X_1,X_2,y_lat,y_lon=[],[],[],[]
X_3 = []
for geonameId_1,geonameId_2 in inclusion_dict.items():
if not geonameId_2 in inclusion_dict:
for geonameId_1,geonameId_2 in rel_dict.items():
if not geonameId_2 in rel_dict:
continue
geonameId_3 = inclusion_dict[geonameId_2]
top3 = geoname2encodedname[geonameId_3]
X_3.append(top3)
geonameId_3 = rel_dict[geonameId_2]
# top3 = geoname2encodedname[geonameId_3]
# X_3.append(top3)
top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
X_1.append(top1)
......
......@@ -4,9 +4,12 @@
{ "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." },
{ "short": "geoname_hierachy_input", "help": "Filepath of the Geonames file you want to use." },
{ "short": "-v", "long": "--verbose", "action": "store_true" },
{ "short": "-i", "long": "--inclusion", "action": "store_true" },
{ "short": "-a", "long": "--adjacency", "action": "store_true" },
{ "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
{ "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
{ "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
{ "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
{ "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" }
]
}
\ No newline at end of file
......@@ -10,7 +10,7 @@ from ngram import NGram
import argparse
import os
import json
from tqdm import tqdm
class TokenizerCustom():
......@@ -24,128 +24,6 @@ class TokenizerCustom():
seqs.append([self.word_index[word] for word in word_tokenize(text) if word in self.word_index])
return seqs
class CoordinatesEncoder:
    """
    Deprecated !

    Discretizes the (lat, lon) space into a regular grid and encodes a
    coordinate as its (lat_cell, lon_cell) index pair, a pair of one-hot
    vectors, or a single flattened one-hot vector.
    """

    def __init__(self, cell_size_lat=0.5, cell_size_lon=0.5):
        """
        Parameters
        ----------
        cell_size_lat : float
            Cell height in degrees of latitude.
        cell_size_lon : float
            Cell width in degrees of longitude.
        """
        self.min_lon = -180
        self.max_lon = -(self.min_lon)  # Symetric
        self.min_lat = -90
        self.max_lat = -(self.min_lat)  # Symetric

        self.ecart_lat = self.max_lat - self.min_lat
        self.ecart_lon = self.max_lon - self.min_lon

        self.cell_size_lat = cell_size_lat
        self.cell_size_lon = cell_size_lon

        self.unit_size_lat = self.ecart_lat / self.cell_size_lat
        self.unit_size_lon = self.ecart_lon / self.cell_size_lon

    def encode(self, lat, lon):
        """Return the (lat_cell_index, lon_cell_index) pair for a coordinate."""
        # BUGFIX: clamp to the last cell so the exact boundary (lat=90 or
        # lon=180) no longer produces an out-of-range index in vector()/vector_flatten().
        lat_idx = min(
            math.floor(((lat + self.max_lat) / self.ecart_lat) * self.unit_size_lat),
            self.number_lat_cell() - 1,
        )
        lon_idx = min(
            math.floor(((lon + self.max_lon) / self.ecart_lon) * self.unit_size_lon),
            self.number_lon_cell() - 1,
        )
        return (lat_idx, lon_idx)

    def number_lat_cell(self):
        """Number of cells along the latitude axis."""
        return int(self.unit_size_lat)

    def number_lon_cell(self):
        """Number of cells along the longitude axis."""
        return int(self.unit_size_lon)

    def oneDimensionOutputSize(self):
        """Size of the flattened one-hot encoding (lat cells * lon cells)."""
        return self.number_lat_cell() * self.number_lon_cell()

    def vector(self, lat, lon):
        """Return a pair of one-hot vectors (latitude one-hot, longitude one-hot)."""
        lat_v, lon_v = np.zeros(self.number_lat_cell()), np.zeros(self.number_lon_cell())
        new_coords = self.encode(lat, lon)
        lat_v[int(new_coords[0])] = 1
        lon_v[int(new_coords[1])] = 1
        return lat_v, lon_v

    def vector_flatten(self, lat, lon):
        """Return a single flattened one-hot vector for the coordinate's cell."""
        vec = np.zeros(self.oneDimensionOutputSize())  # 2D Dense softmax isn't possible
        lat_idx, lon_idx = self.encode(lat, lon)
        # BUGFIX: the row stride must be the number of LON cells (row-major
        # layout). The previous code used the lat-cell count, which made
        # distinct coordinates collide whenever lon_idx exceeded it.
        pos = self.number_lon_cell() * lat_idx + lon_idx
        vec[pos] = 1
        return vec
class Quadtree(object):
    """Recursive 4-way spatial partition of a bounding box.

    Each node splits its box into four equal quadrants down to `precision`
    levels. A position is encoded as the concatenation of 2-bit quadrant
    indices along the path from the root to the leaf that contains it.
    """

    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y, precision=10, curr_prec=0):
        # Bounding box of this node.
        # NOTE(review): given the +/- signs used for the children, y appears to
        # DECREASE from upperleft_y to bottomright_y (screen-style axis) — confirm.
        self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y = upperleft_x, upperleft_y, bottomright_x, bottomright_y
        self.precision = precision
        # Half-extents used to derive the four child quadrants.
        x_r = abs(self.bottomright_x - self.upperleft_x) / 2
        y_r = abs(self.upperleft_y - self.bottomright_y) / 2
        if curr_prec == precision:
            # Leaf node: no further subdivision.
            self.value = ""
        else:
            # Children in fixed order 0..3 (the index is what encode() emits):
            # 0 = top-left, 1 = top-right, 2 = bottom-left, 3 = bottom-right.
            self.value = [
                Quadtree(upperleft_x,
                         upperleft_y,
                         bottomright_x - x_r,
                         bottomright_y + y_r,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         ),
                Quadtree(upperleft_x + x_r,
                         upperleft_y,
                         bottomright_x,
                         bottomright_y + y_r,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         ),
                Quadtree(upperleft_x,
                         upperleft_y - y_r,
                         bottomright_x - x_r,
                         bottomright_y,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         ),
                Quadtree(upperleft_x + x_r,
                         upperleft_y - y_r,
                         bottomright_x,
                         bottomright_y,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         )
            ]

    def contains_obj(self, pos):
        """Return True if pos = (x, y) lies inside this node's box (bounds inclusive)."""
        x, y = pos[0], pos[1]
        if x < self.upperleft_x or x > self.bottomright_x:
            return False
        if y > self.upperleft_y or y < self.bottomright_y:
            return False
        return True

    def binary(self, integer):
        """2-bit, zero-padded binary string for a quadrant index in 0..3."""
        ch = "{0:b}".format(integer)
        return "0" * (2 - len(ch)) + ch

    def encode(self, pos):
        """Encode pos as a bit-string hash (2 bits per tree level).

        NOTE(review): returns None (not "") when no child contains pos, e.g.
        for a point outside the root box — confirm callers cope with that.
        """
        if not isinstance(self.value, list):
            return ""
        for ix, q in enumerate(self.value):
            if q.contains_obj(pos):
                return self.binary(ix) + q.encode(pos)

    def int_encode(self, pos):
        """Same hash as encode(), returned as a list of single-digit ints."""
        return list(map(int, textwrap.wrap(self.encode(pos), 1)))

    def decode(self, hash_):
        """Follow a bit-string hash back down the tree; return the leaf's box.

        Raises ValueError if the hash length is not a multiple of 2.
        """
        if not len(hash_) % 2 == 0:
            raise ValueError("Wrong Hash ! ")
        # HACK: eval of a binary literal; int(hash_[:2], 2) would be safer —
        # only ever call this on trusted, internally generated hashes.
        q_pos = eval("0b" + hash_[:2])
        q = self.value[q_pos]
        if len(hash_) == 2:
            return q.upperleft_x, q.upperleft_y, q.bottomright_x, q.bottomright_y
        return q.decode(hash_[2:])
from keras.layers import Embedding
from gensim.models import Word2Vec
......@@ -199,6 +77,145 @@ def _split(lst,n,complete_chunk_value):
return np.array(chunks)
def generate_couple(object_list):
    """Randomly pair up the elements of object_list.

    Each element serves as the first member of at most one couple; the second
    member is drawn uniformly from the elements not yet consumed. Produces
    len(object_list) - 1 couples when the list has two or more items, and an
    empty list otherwise.
    """
    pairs = []
    pool = np.arange(len(object_list))
    while len(pool) > 1:
        first = np.random.choice(np.arange(len(pool)))
        second = np.random.choice(np.arange(len(pool)))
        # Re-draw until the two picks differ.
        while second == first:
            second = np.random.choice(np.arange(len(pool)))
        pairs.append([object_list[pool[first]], object_list[pool[second]]])
        # The first pick is consumed and never reused as a first member.
        pool = np.delete(pool, first)
    return pairs
def _hash_couple(o1,o2):
return "|".join(map(str,sorted([int(o1),int(o2)])))
### GEO ADJAC BEGIN
from joblib import Parallel,delayed
from shapely.geometry import Point,box
class Cell(object):
    """Axis-aligned rectangular cell of a Grid; stores the points it contains."""

    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y):
        self.upperleft_x = upperleft_x
        self.upperleft_y = upperleft_y
        self.bottomright_x = bottomright_x
        self.bottomright_y = bottomright_y
        # Shapely polygon of the cell, used for intersection tests in Grid.fit_data.
        self.box_ = box(self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y)
        self.list_object = {}  # {id:Point(coord)}

    def contains(self, lat, lon):
        """Return True when (lat, lon) falls inside this cell (bounds inclusive)."""
        x, y = lon, lat
        inside_x = self.upperleft_x <= x <= self.bottomright_x
        inside_y = self.upperleft_y <= y <= self.bottomright_y
        return inside_x and inside_y

    def add_object(self, id_, lat, lon):
        """Register object `id_` at (lat, lon) inside this cell."""
        self.list_object[id_] = Point(lon, lat)

    def __repr__(self):
        return "upperleft:{0}_{1}_;bottom_right:{2}_{3}".format(self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y)
class Grid(object):
    """Regular grid over a bounding box used to find spatially adjacent objects.

    Two layers of cells are built: `cells` (the base partition) and
    `inter_cells` (overlapping, slightly shifted cells that catch pairs of
    objects sitting on either side of a base-cell border). Objects inserted
    via `__add__` are later paired per-cell by `get_adjacent_relationships`.
    """

    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y, cell_sub_div_index=[100,50]):
        # NOTE(review): mutable default argument; harmless here because it is
        # only read, but a tuple default would be safer.
        self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y = upperleft_x, upperleft_y, bottomright_x, bottomright_y
        # Base cell extents: [0] subdivisions along x, [1] along y.
        self.x_r = abs(self.bottomright_x - self.upperleft_x) / cell_sub_div_index[0]
        self.y_r = abs(self.upperleft_y - self.bottomright_y) / cell_sub_div_index[1]
        self.c_x_r = self.x_r / cell_sub_div_index[0]  # Redivide
        self.c_y_r = self.y_r / cell_sub_div_index[1]
        self.cells = []
        self.inter_cells = []
        # Base partition: cells[i][j] covers row i, column j, laid out from the
        # upper-left corner with positive x_r / y_r offsets.
        for i in range(cell_sub_div_index[1]):
            self.cells.append([])
            for j in range(cell_sub_div_index[0]):
                self.cells[-1].append(Cell(
                    self.upperleft_x + j * self.x_r,
                    self.upperleft_y + i * self.y_r,
                    self.upperleft_x + ((j + 1) * self.x_r),
                    self.upperleft_y + ((i + 1) * self.y_r),
                )
                )
        # Overlap layer: 3 interstitial cells per base cell (shifted left,
        # enlarged center, shifted right), indexed as inter_cells[i][j*3:(j+1)*3]
        # — that slice shape is relied upon by fit_data().
        dec_y = 0
        for i in range(cell_sub_div_index[1]):
            self.inter_cells.append([])
            dec_x = 0
            for j in range(cell_sub_div_index[0]):
                self.inter_cells[-1].append(Cell(
                    self.upperleft_x + (j * self.x_r) - self.c_x_r,  # TOP
                    self.upperleft_y + (i * self.y_r) - dec_y,
                    self.upperleft_x + ((j + 1) * self.x_r) - self.c_x_r,  # (self.u_pos*self.c_x_r),
                    self.upperleft_y + ((i + 1) * self.y_r) + self.c_y_r  # (self.u_neg*self.c_y_r),
                )
                )
                self.inter_cells[-1].append(Cell(
                    self.upperleft_x + (j * self.x_r) - self.c_x_r,  # CENTER
                    self.upperleft_y + (i * self.y_r) - self.c_y_r,
                    self.upperleft_x + ((j + 1) * self.x_r) + self.c_x_r,
                    self.upperleft_y + ((i + 1) * self.y_r) + self.c_y_r,
                )
                )
                self.inter_cells[-1].append(Cell(
                    self.upperleft_x + (j * self.x_r) + dec_x,  # CENTER
                    self.upperleft_y + (i * self.y_r) - self.c_y_r,
                    self.upperleft_x + ((j + 1) * self.x_r) - self.c_x_r,  # LEFT
                    self.upperleft_y + ((i + 1) * self.y_r) + self.c_y_r
                )
                )
                # First column/row get no shift; subsequent ones are offset by
                # one sub-cell so the overlap straddles the base-cell border.
                dec_x = self.c_x_r
            dec_y = self.c_y_r

    def fit_data(self, data):
        """Prune cells that do not intersect the union of `data`'s geometries.

        `data` is a GeoDataFrame; as a side effect a constant "nn" column is
        added to it (dissolve key). After this call `cells` and `inter_cells`
        are FLAT lists (no longer nested per row).
        """
        data["nn"] = 1  # single dissolve key -> union of all geometries
        dissolved = data.dissolve(by="nn")
        new_cells = []
        new_inter_cells = []
        for i in tqdm(range(len(self.cells))):
            for j in range(len(self.cells[i])):
                if dissolved.intersects(self.cells[i][j].box_).all():
                    new_cells.append(self.cells[i][j])
                    # Keep the 3 interstitial cells built for this (i, j).
                    new_inter_cells.extend(self.inter_cells[i][j * 3:(j + 1) * 3])
        self.cells = new_cells
        self.inter_cells = new_inter_cells

    def __add__(self, a):
        """Insert object a = (id, lat, lon) into the grid.

        Must be called AFTER fit_data() (expects flat cell lists). The object
        is added to every base cell containing it (base cells are disjoint, so
        at most one) and to the first matching interstitial cell only.
        NOTE: returns None, so `g + (...)` is only usable as a statement.
        """
        for c1 in range(len(self.cells)):
            if self.cells[c1].contains(a[1], a[2]):
                self.cells[c1].add_object(*a)
        for c1 in range(len(self.inter_cells)):
            if self.inter_cells[c1].contains(a[1], a[2]):
                self.inter_cells[c1].add_object(*a)
                break

    def get_adjacent_relationships(self, random_iteration=10):
        """Return a set of "id1|id2" hashes of objects sharing a cell.

        For each (base and interstitial) cell, `random_iteration` rounds of
        random pairing are sampled among the cell's objects; pairs are
        deduplicated via the order-independent _hash_couple key.
        """
        relationships = set([])
        for c1 in tqdm(range(len(self.cells))):
            for i in range(random_iteration):
                for t in generate_couple(list(self.cells[c1].list_object.keys())):
                    relationships.add(_hash_couple(t[0], t[1]))
        for c1 in tqdm(range(len(self.inter_cells))):
            for i in range(random_iteration):
                for t in generate_couple(list(self.inter_cells[c1].list_object.keys())):
                    relationships.add(_hash_couple(t[0], t[1]))
        return relationships
### GEO ADJAC END
import argparse
import os
import json
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment