Commit 5bb71e79 authored by Jacques Fize

DEBUG + MOVE GEONAME EMBEDDING TO ANOTHER REPO

parent 486a0f91
@@ -146,4 +146,6 @@ test_comb.sh
 notes.md
 .idea/*
-.vscode/*
\ No newline at end of file
+.vscode/*
+other/*
+test*
\ No newline at end of file
@@ -21,6 +21,7 @@ from shapely.geometry import Point
 from helpers import read_geonames
 from utils import Grid
 from utils import zero_one_encoding, NgramIndex,ConfigurationReader
+from metrics import lat_accuracy,lon_accuracy

 # Logging
 from tqdm import tqdm
@@ -73,41 +74,7 @@ def get_new_ids(cooc_data,id_first_value):
         topo_id[id_]=interlink
     return topo_id

-def lat_accuracy(LAT_TOL=1/180.):
-    def accuracy_at_k_lat(y_true, y_pred):
-        """
-        Metric measuring the accuracy of coordinate prediction. Unlike standard
-        accuracy, a tolerance threshold is applied, since it is (quasi)
-        impossible for a neural network to predict the exact coordinate.
-
-        Parameters
-        ----------
-        y_true : tf.Tensor
-            ground-truth data
-        y_pred : tf.Tensor
-            predicted output
-        """
-        diff = tf.abs(y_true - y_pred)
-        fit = tf.dtypes.cast(tf.less(diff,LAT_TOL),tf.int64)
-        return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64)
-    return accuracy_at_k_lat
-
-def lon_accuracy(LON_TOL=1/360.):
-    def accuracy_at_k_lon(y_true, y_pred):
-        """
-        Metric measuring the accuracy of coordinate prediction. Unlike standard
-        accuracy, a tolerance threshold is applied, since it is (quasi)
-        impossible for a neural network to predict the exact coordinate.
-
-        Parameters
-        ----------
-        y_true : tf.Tensor
-            ground-truth data
-        y_pred : tf.Tensor
-            predicted output
-        """
-        diff = tf.abs(y_true - y_pred)
-        fit = tf.dtypes.cast(tf.less(diff,LON_TOL),tf.int64)
-        return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64)
-    return accuracy_at_k_lon
-
 # LOGGING CONF
 logging.basicConfig(
@@ -118,7 +85,7 @@ logging.basicConfig(
 chrono = Chronometer()

 args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
-    .parse_args("-n 4 -t 0.002 -e 20 -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
+    .parse_args()  # ("-n 4 -t 0.002 -e 20 -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())

 # Initialise CONSTANTS
 GEONAME_FN = args.geoname_input
...

# ===== geoname embedding generation script =====
# PYTHON MODULE
import math
import random
from argparse import ArgumentParser, RawTextHelpFormatter
from multiprocessing import cpu_count
# COMMON DATA STRUCTURE MODULE
import numpy as np
import pandas as pd
import networkx as nx
# SPATIAL DATA MANIPULATION
import geopandas as gpd
import osrm
osrm.RequestConfig.host = "jacquesfize.com:5000"
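# NOTE: this points at the author's OSRM instance; swap in your own server
# (see the docker commands further below) before running the script.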
from shapely.geometry import Point
# Machine Learning MODULE
from node2vec import Node2Vec
# VISUALISATION MODULE
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()
# PERSONAL FUNCTION
from helpers import *
parser = ArgumentParser(description='Generate a spatial embedding of places using Geonames data', formatter_class=RawTextHelpFormatter)
parser.add_argument("input")
parser.add_argument("--nbcpu",type=int,default=cpu_count())
parser.add_argument("--vector-size",type=int,default=64,help="Output Vector Dimension")
parser.add_argument("--walk-length",type=int,default=30, help="Size of the walk generated during the Node2vec algorithm")
parser.add_argument("--num-walks",type=int,default=200, help="Number of walk generated during the Node2vec algorithm")
parser.add_argument("--word2vec-window-size",type=int,default=30, help="Window size used in the Word2vec algorithm")
parser.add_argument("--buffer-size",type=float,default=0.03,help="Buffer size to transform Point in Polygon. Used for adjacency matrix computation.")
parser.add_argument("-d",action="store_true",help="Integrate the distance weight between vertices")
parser.add_argument("--dist",choices=["euclidean","itinerary"],default="itinerary",help="""Two distance functions are available:
- Euclidean : Euclidean distance between the two places centroids
- Itinerary : Compute the itinerary distance between two places using an OSRM service
""")
parser.add_argument("--noise",action="store_true")
parser.add_argument("--noise-size",type=int,default=500)
args = parser.parse_args()
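
# Example invocation (hypothetical script name and paths):
#   python geonames_embedding.py data/geonamesData/FR.txt --vector-size 64 --num-walks 200 --dist itinerary -d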
# INPUT DATA
GEONAMES_FN = args.input
# PARALLELISM OPTION
NUMBER_OF_CPU_USED = args.nbcpu
# Graph Embedding parameter
VECTOR_SIZE = args.vector_size
WALK_LENGTH = args.walk_length
NUMBER_OF_WALK = args.num_walks
WORD2VEC_WINDOW = args.word2vec_window_size
# GRAPH WEIGHT PARAMETER
IS_DISTANCE = args.d
DISTANCE = args.dist
# Coefficients used when simulating new toponyms
GEO_DISTANCE_COEF = 0.5
EMBEDDING_DISTANCE_COEF = 0.5
# New toponym simulation
IS_NOISE = args.noise
NUMBER_OF_NODE_DESPATIALIZED = args.noise_size
# DISTANCE CACHE STORAGE
from sqlitedict import SqliteDict
distance_dict = SqliteDict('./data/distance_dict.sqlite', autocommit=True)
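# autocommit=True persists every cached distance immediately, so an interrupted
# run can resume without re-querying OSRM for pairs already computed.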
# LOAD GEONAMES DATA
df = read_geonames(GEONAMES_FN)
df["geometry"] = df["latitude longitude".split()].progress_apply(lambda x:Point(x.longitude,x.latitude),axis=1)
# Create GeoDataFrame for faster spatial comparison operations
gdf = gpd.GeoDataFrame(df)
# Select the sample corresponding to the département of La Manche (admin2 code "50")
manche_gdf = gdf[gdf.admin2_code == "50"].copy()
manche_gdf["geometry"] = manche_gdf.geometry.buffer(args.buffer_size)
manche_gdf.plot()
# plt.show()
# Build an adjacency matrix to generate the graph used for the embedding generation
N = len(manche_gdf)
adjacency_matrix = np.zeros((N,N))
geometries = manche_gdf.geometry.values
for i in tqdm(range(N)):
    for j in range(i,N):
        adjacency_matrix[i,j] = geometries[i].intersects(geometries[j])
        adjacency_matrix[j,i] = adjacency_matrix[i,j]
plt.clf()
plt.imshow(adjacency_matrix);plt.colorbar()
# plt.show()
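
# The O(N^2) double loop above is fine for a single departement but scales poorly.
# A sketch of a faster variant using Shapely's STRtree spatial index (assumes
# Shapely >= 2.0, where query() returns integer indices):
#   from shapely.strtree import STRtree
#   tree = STRtree(geometries)
#   for i in range(N):
#       for j in tree.query(geometries[i]):           # bounding-box candidates only
#           if geometries[i].intersects(geometries[j]):
#               adjacency_matrix[i, j] = adjacency_matrix[j, i] = 1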
# Map between matrix indices and geonameids
manche_gdf["code_matrix"] = np.arange(N)
geoname_id2idxmat = dict(manche_gdf["geonameid code_matrix".split()].values)
idxmat2geoname_id = {v: k for k, v in geoname_id2idxmat.items()}
# Add the adjacent entities found in the GeoDataFrame
def get_adjacent_entity(x):
    idxs = np.nonzero(adjacency_matrix[geoname_id2idxmat[x]])[0]
    return [idxmat2geoname_id[idx] for idx in idxs if not idxmat2geoname_id[idx] == x]  # exclude the entity itself
manche_gdf["adjacent_entity"] = manche_gdf.geonameid.apply(get_adjacent_entity)
# Code for getting the distance using the road network (not euclidean) PART 1
manche_gdf["geometry_centroid"]=manche_gdf.centroid
coords = dict(manche_gdf["geonameid geometry_centroid".split()].values)
# Code for getting the distance using the road network (not euclidean) PART 2
# Run an OSRM SERVER
#https://hub.docker.com/r/osrm/osrm-backend/
#docker run -t -p 5000:5000 -v $(pwd):/data osrm/osrm-backend osrm-extract -p /opt/car.lua /data/road.pbf
#docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-partition /data/road.osrm
#docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-customize /data/road.osrm
#docker run -t -i -p 5000:5000 -v "${PWD}:/data" osrm/osrm-backend osrm-routed --algorithm mld /data/road.osrm
# Check Also : https://github.com/ustroetz/python-osrm#route
# Test: curl 'http://<yourserver>:5000/route/v1/driving/-1.37,49.38;-1.37,49.0?steps=true'   (OSRM expects lon,lat pairs)
def getTupCoords(id_):
    """Return [longitude, latitude] for a geonameid (the order OSRM expects)."""
    return [coords[id_].x, coords[id_].y]

def getDistance(id_1, id_2):
    try:
        return osrm.simple_route(getTupCoords(id_1), getTupCoords(id_2), output="route", overview="full", steps=False, geometry="wkt")[0]["distance"]
    except IndexError:  # no route found between the two places
        return -1

def signature(id_1, id_2):
    """Order-independent key for a pair of geonameids."""
    return "_".join([str(id_) for id_ in sorted([id_1, id_2])])

def getDistanceSDict(id_1, id_2, sqlite_dict):
    hash_ = signature(id_1, id_2)
    if not hash_ in sqlite_dict:
        sqlite_dict[hash_] = getDistance(id_1, id_2)
    return sqlite_dict[hash_]
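
# Usage sketch (the geonameids are placeholders): the first call queries OSRM
# and caches the result; later calls for the same unordered pair hit the cache.
#   d = getDistanceSDict(3003603, 3031582, distance_dict)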
from joblib import Parallel, delayed  # for parallel job computation

def job(G, row, adjacent):
    """Return a (u, v, weight) triple for a new edge, or None if the edge already exists."""
    new_edge = (row.geonameid, adjacent)
    if not G.has_edge(*new_edge):
        if IS_DISTANCE and DISTANCE == "itinerary":
            return (*new_edge, getDistanceSDict(new_edge[0], new_edge[1], distance_dict))
        elif IS_DISTANCE and DISTANCE == "euclidean":
            raise NotImplementedError()
        else:
            return (*new_edge, 1)  # default unit weight
# Build the adjacency graph (edges weighted by itinerary distance when -d is set)
G = nx.Graph()
for ix, row in tqdm(manche_gdf["geonameid adjacent_entity".split()].iterrows(), total=len(manche_gdf)):
    new_edges = Parallel(n_jobs=4, backend="threading")(delayed(job)(G, row, adjacent) for adjacent in row.adjacent_entity)
    for edge in new_edges:
        if edge:
            G.add_edge(edge[0], edge[1], weight=edge[2])
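# backend="threading" is deliberate here: the workers must share the same graph
# object G for the has_edge() check, and the work (OSRM requests, SQLite
# lookups) is I/O-bound, so threads suffice despite the GIL.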
# Data for graph projection
lon_dict= dict(manche_gdf["geonameid longitude".split()].values)
lat_dict= dict(manche_gdf["geonameid latitude".split()].values)
pos= {n:[lon_dict[n],lat_dict[n]]for n in G.nodes()}
nx.draw(G,pos=pos,node_size=1)
# plt.show()
for ed in list(G.edges()):
    G[ed[0]][ed[1]]["weight"] += 1  # shift all weights by 1: zero-weight edges break the random-walk transition probabilities
if IS_NOISE:
    # Simulate "new" toponyms: remove the spatial edges of a random sample of nodes...
    H = G.copy()
    edges, weights = zip(*nx.get_edge_attributes(H, 'weight').items())
    sample = random.sample(list(H.nodes()), NUMBER_OF_NODE_DESPATIALIZED)
    H.remove_nodes_from(sample)

    pos = {n: [lon_dict[n], lat_dict[n]] for n in H.nodes()}
    nx.draw(H, pos=pos, node_size=1)
    # plt.show()

    label_dict = dict(manche_gdf["geonameid name".split()].values)

    # ...then reconnect each sampled node to its most similar places in a
    # pre-computed embedding space (pd.read_msgpack requires pandas < 1.0).
    embeddings = dict(pd.read_msgpack("data/embeddings/geonamesFRWithEmbeddings.msg")["geonameid embedding".split()].values)
    ids, emb = zip(*embeddings.items())
    id2geonameid = dict(enumerate(ids))
    geonameid2id = {id_: ix for ix, id_ in enumerate(ids)}
    emb_matrix = np.asarray(emb)

    from sklearn.metrics.pairwise import cosine_similarity
    sim_matrix = cosine_similarity(emb_matrix)
    # NB: the top 3 includes the node itself, since self-similarity is maximal
    top_n = np.argsort(sim_matrix)[:, -3:]

    for ix, n in enumerate(sample):
        top_i = [id2geonameid[top] for top in top_n[geonameid2id[n]]]
        weights_i = [sim_matrix[geonameid2id[n]][top] for top in top_n[geonameid2id[n]]]
        for ij, top_ij in enumerate(top_i):
            H.add_edge(n, top_ij, weight=weights_i[ij])

    G = H.copy()
node2vec = Node2Vec(G, dimensions=VECTOR_SIZE, walk_length=WALK_LENGTH, num_walks=NUMBER_OF_WALK, workers=NUMBER_OF_CPU_USED,temp_folder="temp") # Use temp_folder for big graphs
model = node2vec.fit(window=WORD2VEC_WINDOW, min_count=1, batch_words=NUMBER_OF_CPU_USED)
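# The object returned by fit() is a gensim Word2Vec model; node2vec stores node
# identifiers as strings. A sketch of how the embeddings can be queried
# (geonameid below is a placeholder):
#   vector = model.wv[str(geonameid)]                    # one place's vector
#   neighbours = model.wv.most_similar(str(geonameid), topn=5)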
# Saving the embedding model
if not IS_NOISE:
    model.save("{filename}_{dim}_{walk_l}_{num_walk}_{window}.bin".format(
        filename=GEONAMES_FN.split("/")[-1],
        dim=VECTOR_SIZE,
        walk_l=WALK_LENGTH,
        num_walk=NUMBER_OF_WALK,
        window=WORD2VEC_WINDOW))
else:
    model.save("{filename}_{dim}_{walk_l}_{num_walk}_{window}_{noise}.bin".format(
        filename=GEONAMES_FN.split("/")[-1],
        dim=VECTOR_SIZE,
        walk_l=WALK_LENGTH,
        num_walk=NUMBER_OF_WALK,
        window=WORD2VEC_WINDOW,
        noise=NUMBER_OF_NODE_DESPATIALIZED))

# ===== helpers.py =====
import os
import time
import pandas as pd
import matplotlib.pyplot as plt
def read_geonames(file):
    """
    Return a dataframe that contains Geonames data.

    Parameters
    ----------
    file : str
        path of the Geonames CSV file

    Returns
    -------
    pd.DataFrame
        geonames data
    """
    dtypes_dict = {
        0: int,    # geonameid
        1: str,    # name
        2: str,    # asciiname
        3: str,    # alternatenames
        4: float,  # latitude
        5: float,  # longitude
        6: str,    # feature class
        7: str,    # feature code
        8: str,    # country code
        9: str,    # cc2
        10: str,   # admin1 code
        11: str,   # admin2 code
        12: str,   # admin3 code
        13: str,   # admin4 code
        14: int,   # population
        15: str,   # elevation
        16: int,   # dem (digital elevation model)
        17: str,   # timezone
        18: str    # modification date (yyyy-MM-dd)
    }
    rename_cols = {
        0: "geonameid",
        1: "name",
        2: "asciiname",
        3: "alternatenames",
        4: "latitude",
        5: "longitude",
        6: "feature_class",
        7: "feature_code",
        8: "country_code",
        9: "cc2",
        10: "admin1_code",
        11: "admin2_code",
        12: "admin3_code",
        13: "admin4_code",
        14: "population",
        15: "elevation",
        16: "dem",
        17: "timezone",
        18: "modification_date"
    }
    data = pd.read_csv(file, sep="\t", header=None, quoting=3, dtype=dtypes_dict, na_values='', keep_default_na=False, error_bad_lines=False)
    data.rename(columns=rename_cols, inplace=True)
    return data
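
# Minimal usage sketch (path is a placeholder):
#   df = read_geonames("data/geonamesData/FR.txt")
#   print(df[["geonameid", "name", "latitude", "longitude"]].head())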
def plot_accuracy_from_history(model_name, history_data, output_layer_name, output_filename, parameter_string, output_dirname="outputs", validation=True, show=False):
    # Plot training & validation accuracy values
    plt.gcf()
    plt.gca()
    plt.plot(history_data['{0}_accuracy'.format(output_layer_name)].values, label="Train Data")
    if validation:
        plt.plot(history_data['val_{0}_accuracy'.format(output_layer_name)].values, label="Test Data")
    plt.title('Layer {0} accuracy'.format(output_layer_name))
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.ylim((0, 1.1))  # headroom above accuracy = 1
    plt.legend()
    plt.savefig(os.path.join(output_dirname, "{0}_{1}_{2}.png".format(model_name, parameter_string, output_layer_name)))
    if show:
        plt.show()
def save_embedding(model, tokenizer, layer_idx, fn):
    embedding_matrix = model.get_weights()[layer_idx]  # weight matrix of the embedding layer
    with open(os.path.join(fn), 'w') as f:
        for word, idx in tokenizer.word_index.items():
            f.write(word)
            for component in embedding_matrix[idx]:
                f.write(' ' + repr(component))
            f.write('\n')
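
# The file written above is a plain-text table: one token per line followed by
# its space-separated embedding components (no word2vec-style header line).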
class Chronometer():
    def __init__(self):
        self.__task_begin_timestamp = {}

    def start(self, task_name):
        """
        Start a new task chronometer.

        Parameters
        ----------
        task_name : str
            task id

        Raises
        ------
        ValueError
            if a running task already exists with that name
        """
        if task_name in self.__task_begin_timestamp:
            raise ValueError("A running task exists with the name {0}!".format(task_name))
        self.__task_begin_timestamp[task_name] = time.time()

    def stop(self, task_name):
        """
        Stop and return the duration of the task.

        Parameters
        ----------
        task_name : str
            task id

        Returns
        -------
        float
            duration of the task in seconds

        Raises
        ------
        ValueError
            if no task exists with the id `task_name`
        """
        if not task_name in self.__task_begin_timestamp:
            raise ValueError("The {0} task does not exist!".format(task_name))
        duration = time.time() - self.__task_begin_timestamp[task_name]
        del self.__task_begin_timestamp[task_name]
        return duration

if __name__ == "__main__":
    chrono = Chronometer()
    chrono.start("test")
    chrono.start("test2")
    time.sleep(3)
    print(chrono.stop("test"))
    time.sleep(3)
    print(chrono.stop("test2"))
# ===== metrics.py =====
import tensorflow as tf

def lat_accuracy(LAT_TOL=1/180.):
    def accuracy_at_k_lat(y_true, y_pred):
        """
        Metric measuring the accuracy of latitude prediction. Unlike standard
        accuracy, a tolerance threshold is applied, since it is (quasi)
        impossible for a neural network to predict the exact coordinate.

        Parameters
        ----------
        y_true : tf.Tensor
            ground-truth data
        y_pred : tf.Tensor
            predicted output
        """
        diff = tf.abs(y_true - y_pred)
        fit = tf.dtypes.cast(tf.less(diff, LAT_TOL), tf.int64)
        return tf.reduce_sum(fit) / tf.size(y_pred, out_type=tf.dtypes.int64)
    return accuracy_at_k_lat

def lon_accuracy(LON_TOL=1/360.):
    def accuracy_at_k_lon(y_true, y_pred):
        """
        Metric measuring the accuracy of longitude prediction. Unlike standard
        accuracy, a tolerance threshold is applied, since it is (quasi)
        impossible for a neural network to predict the exact coordinate.

        Parameters
        ----------
        y_true : tf.Tensor
            ground-truth data
        y_pred : tf.Tensor
            predicted output
        """
        diff = tf.abs(y_true - y_pred)
        fit = tf.dtypes.cast(tf.less(diff, LON_TOL), tf.int64)
        return tf.reduce_sum(fit) / tf.size(y_pred, out_type=tf.dtypes.int64)
    return accuracy_at_k_lon
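
# A sketch of typical use when compiling a Keras model that predicts zero-one
# encoded coordinates (with the default tolerances above, the threshold amounts
# to ~1 degree once latitude/longitude are rescaled to [0,1]):
#   model.compile(loss="mean_squared_error", optimizer="adam",
#                 metrics=[lat_accuracy(), lon_accuracy()])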
@@ -3,33 +3,14 @@ import tensorflow as tf
 import keras.backend as K
 from utils import NgramIndex

-from flask import Flask
-
-ACCURACY_TOLERANCE = 0.002
-
-def accuracy_at_k(y_true, y_pred):
-    """
-    Metric measuring the accuracy of coordinate prediction. Unlike standard
-    accuracy, a tolerance threshold is applied, since it is (quasi)
-    impossible for a neural network to predict the exact coordinate.
-
-    Parameters
-    ----------
-    y_true : tf.Tensor
-        ground-truth data
-    y_pred : tf.Tensor
-        predicted output
-    """
-    global ACCURACY_TOLERANCE
-    diff = tf.abs(y_true - y_pred)
-    fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE))
-    return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred)
-
 from tensorflow.python.keras.backend import set_session
 from tensorflow.python.keras.models import load_model

 sess = None
 graph = None

+from metrics import lat_accuracy,lon_accuracy
+
 class Geocoder(object):
     """
     >>> geocoder = Geocoder("LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","index_4gram_FR_backup.txt")
@@ -42,10 +23,10 @@ class Geocoder(object):
     def __init__(self,keras_model_fn,ngram_index_file):
         global sess
         global graph
-        sess = tf.Session()
-        graph = tf.get_default_graph()
+        sess = tf.compat.v1.Session()
+        graph = tf.compat.v1.get_default_graph()
         set_session(sess)
-        self.keras_model = load_model(keras_model_fn,custom_objects={"accuracy_at_k":accuracy_at_k})
+        self.keras_model = load_model(keras_model_fn,custom_objects={"lat_accuracy":lat_accuracy,"lon_accuracy":lon_accuracy})
         self.ngram_encoder = NgramIndex.load(ngram_index_file)

     def get_coord(self,toponym,context_toponym):
@@ -80,29 +61,20 @@ class Geocoder(object):
         ax.plot(lon,lat,marker='o', color='red', markersize=5)
         plt.show()

-if __name__ == "__main__":
-    """geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")
-    lon,lat = geocoder.get_coord("Paris","New-York")
-    lon,lat = geocoder.wgs_coord(lon,lat)
-    geocoder.plot_coord("Paris,New-York",lat,lon,interactive_map=True)"""
-    from flask import Flask, escape, request, render_template
-    app = Flask(__name__)
+from flask import Flask, escape, request, render_template
+
+app = Flask(__name__)
+
+# IMPORTANT: models have to be loaded AFTER the Keras session is set!
+# Otherwise, their weights will be unavailable in the threads spawned once the session has been set.
 geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")

 @app.route('/',methods=["GET"])
 def display():
     toponym = request.args.get("top", "Paris")
     c_toponym = request.args.get("c_top", "Cherbourg")
     lon,lat = geocoder.get_coord(toponym,c_toponym)
     lon,lat = geocoder.wgs_coord(lon,lat)
     return render_template("skeleton.html",lat=lat,lon=lon)

 app.run(host='0.0.0.0')
\ No newline at end of file