diff --git a/.gitignore b/.gitignore
index 589ecbd98650129ffab8c9cb9447c47966228249..ddc1507dda5eebc7fbb67e9a3546f78022969b26 100755
--- a/.gitignore
+++ b/.gitignore
@@ -146,4 +146,6 @@
 test_comb.sh
 notes.md
 .idea/*
-.vscode/*
\ No newline at end of file
+.vscode/*
+other/*
+test*
\ No newline at end of file
diff --git a/combination_embeddings.py b/combination_embeddings.py
index ab4a251956e774cd6f7d7a53ad24268711661af0..c147f8064042bea90976c809d7ca42fe5012369e 100755
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -21,6 +21,7 @@ from shapely.geometry import Point
 from helpers import read_geonames
 from utils import Grid
 from utils import zero_one_encoding, NgramIndex,ConfigurationReader
+from metrics import lat_accuracy,lon_accuracy
 
 # Logging
 from tqdm import tqdm
@@ -73,41 +74,7 @@ def get_new_ids(cooc_data,id_first_value):
         topo_id[id_]=interlink
     return topo_id
 
-def lat_accuracy(LAT_TOL =1/180.):
-    def accuracy_at_k_lat(y_true, y_pred):
-        """
-        Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible
-        task for neural network to obtain the exact coordinate.
-
-        Parameters
-        ----------
-        y_true : tf.Tensor
-            truth data
-        y_pred : tf.Tensor
-            predicted output
-        """
-        diff = tf.abs(y_true - y_pred)
-        fit = tf.dtypes.cast(tf.less(diff,LAT_TOL),tf.int64)
-        return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64)
-    return accuracy_at_k_lat
-
-def lon_accuracy(LON_TOL=1/360.):
-    def accuracy_at_k_lon(y_true, y_pred):
-        """
-        Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible
-        task for neural network to obtain the exact coordinate.
-
-        Parameters
-        ----------
-        y_true : tf.Tensor
-            truth data
-        y_pred : tf.Tensor
-            predicted output
-        """
-        diff = tf.abs(y_true - y_pred)
-        fit = tf.dtypes.cast(tf.less(diff,LON_TOL),tf.int64)
-        return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64)
-    return accuracy_at_k_lon
+
 
 # LOGGING CONF
 logging.basicConfig(
@@ -118,7 +85,7 @@ logging.basicConfig(
 chrono = Chronometer()
 
 args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
-    .parse_args("-n 4 -t 0.002 -e 20 -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
+    .parse_args()#("-n 4 -t 0.002 -e 20 -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
 
 # Initialisee CONSTANTS
 GEONAME_FN = args.geoname_input
diff --git a/geonames_embedding.py b/geonames_embedding.py
deleted file mode 100755
index 218534998e398ce671904a123aea2e8b8bff7a31..0000000000000000000000000000000000000000
--- a/geonames_embedding.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# PYTHON MODULE
-import math
-import random
-from argparse import ArgumentParser
-from multiprocessing import cpu_count
-from argparse import RawTextHelpFormatter
-
-# COMMON DATA STRUCTURE MODULE
-import numpy as np
-import networkx as nx
-
-# SPATIAL DATA MANIPULATION
-import geopandas as gpd
-import osrm
-osrm.RequestConfig.host = "jacquesfize.com:5000"
-from shapely.geometry import Point
-
-# Machine Learning MODULE
-from node2vec import Node2Vec
-
-# VISUALISATION MODULE
-from tqdm import tqdm
-tqdm.pandas()
-
-# PERSONAL FUNCTION
-from helpers import *
-
-parser = ArgumentParser(description='Generate a spatial embedding of places using Geonames data', formatter_class=RawTextHelpFormatter)
-
-parser.add_argument("input")
-
-parser.add_argument("--nbcpu",type=int,default=cpu_count())
-
-parser.add_argument("--vector-size",type=int,default=64,help="Output Vector Dimension")
-parser.add_argument("--walk-length",type=int,default=30, help="Size of the walk generated during the Node2vec algorithm")
-parser.add_argument("--num-walks",type=int,default=200, help="Number of walk generated during the Node2vec algorithm")
-parser.add_argument("--word2vec-window-size",type=int,default=30, help="Window size used in the Word2vec algorithm")
-
-parser.add_argument("--buffer-size",type=float,default=0.03,help="Buffer size to transform Point in Polygon. Used for adjacency matrix computation.")
-parser.add_argument("-d",action="store_true",help="Integrate the distance weight between vertices")
-parser.add_argument("--dist",choices=["euclidean","itinerary"],default="itinerary",help="""Two distance functions are available:
- - Euclidean : Euclidean distance between the two places centroids
- - Itinerary : Compute the itinerary distance between two places using an OSRM service
-""")
-
-parser.add_argument("--noise",action="store_true")
-parser.add_argument("--noise-size",type=int,default=500)
-
-args = parser.parse_args()
-
-# INPUT DATA
-GEONAMES_FN = args.input
-
-# PARALLELISM OPTION
-NUMBER_OF_CPU_USED = args.nbcpu
-
-# Graph Embedding parameter
-VECTOR_SIZE = args.vector_size
-WALK_LENGTH = args.walk_length
-NUMBER_OF_WALK = args.num_walks
-WORD2VEC_WINDOW = args.word2vec_window_size
-
-# GRAPH WEIGHT PARAMETER
-IS_DISTANCE = args.d
-DISTANCE = args.dist
-# if simulation of new toponyms
-GEO_DISTANCE_COEF = 0.5
-EMBEDDING_DISTANCE_COEF = 0.5
-
-# New toponym simulation
-IS_NOISE = args.noise
-NUMBER_OF_NODE_DESPATIALIZED = args.noise_size
-
-
-# DISTANCE CACHE STORAGE
-from sqlitedict import SqliteDict
-distance_dict = SqliteDict('./data/distance_dict.sqlite', autocommit=True)
-
-# LOAD GEONAMES DATA
-df = read_geonames(GEONAMES_FN)
-df["geometry"] = df["latitude longitude".split()].progress_apply(lambda x:Point(x.longitude,x.latitude),axis=1)
-
-# Create GeoDataFrame for faster spatial comparison operations
-gdf = gpd.GeoDataFrame(df)
-
-# Select a sample that concerns the departement "La Manche"
-manche_gdf = gdf[gdf.admin2_code == "50"]
-manche_gdf["geometry"]=manche_gdf.geometry.buffer(0.03)
-manche_gdf.plot()
-# plt.show()
-
-# Build a Adjacency matrix to generate the graph used for the embedding generation
-N = len(manche_gdf)
-adjacency_matrix = np.zeros((N,N))
-geometries = manche_gdf.geometry.values
-for i in tqdm(range(N)):
-    for j in range(i,N):
-        adjacency_matrix[i,j] = geometries[i].intersects(geometries[j])
-        adjacency_matrix[j,i] = adjacency_matrix[i,j]
-plt.clf()
-plt.imshow(adjacency_matrix);plt.colorbar()
-# plt.show()
-
-# Mapping id between matrix and geonameid
-manche_gdf["code_matrix"]=np.arange(N)
-geoname_id2idxmat = dict(manche_gdf["geonameid code_matrix".split()].values)
-idxmat2geoname_id = {v:k for k,v in geoname_id2idxmat.items()}
-
-# Add adjacent entity found in the Geodataframe
-def get_adjacent_entity(x):
-    idxs = np.nonzero(adjacency_matrix[geoname_id2idxmat[x]])[0]
-    return [idxmat2geoname_id[idx] for idx in idxs if not idxmat2geoname_id[idx] == x] # take not itself
-
-manche_gdf["adjacent_entity"]=manche_gdf.geonameid.apply(get_adjacent_entity)
-
-# Code for getting the distance using the road network (not euclidean) PART 1
-manche_gdf["geometry_centroid"]=manche_gdf.centroid
-coords = dict(manche_gdf["geonameid geometry_centroid".split()].values)
-
-# Code for getting the distance using the road network (not euclidean) PART 2
-
-# Run ORSM SERVER
-#https://hub.docker.com/r/osrm/osrm-backend/
-#docker run -t -p 5000:5000 -v $(pwd):/data osrm/osrm-backend osrm-extract -p /opt/car.lua /data/road.pbf
-#docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-partition /data/road.osrm
-#docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-customize /data/road.osrm
-#docker run -t -i -p 5000:5000 -v "${PWD}:/data" osrm/osrm-backend osrm-routed --algorithm mld /data/road.osrm
-# Check Also : https://github.com/ustroetz/python-osrm#route
-
-# Test: curl 'http://<yourserver>:5000/route/v1/driving/49.38,-1.37;49,-1.37?steps=true'
-def getTupCoords(id_):
-    return [coords[id_].x,coords[id_].y]
-
-def getDistance(id_1,id_2):
-    try:
-        return osrm.simple_route(getTupCoords(id_1), getTupCoords(id_2), output="route", overview="full",steps=False,geometry="wkt")[0]["distance"]
-    except IndexError:
-        return -1
-
-def signature(id_1,id2):
-    return "_".join([str(id_)for id_ in sorted([id_1,id2])])
-
-def getDistanceSDict(id_1,id_2,sqlite_dict):
-    hash_ = signature(id_1,id_2)
-    if not hash_ in sqlite_dict:
-        sqlite_dict[hash_]=getDistance(id_1,id_2)
-    return sqlite_dict[hash_]
-from joblib import Parallel,delayed # for parallel job computation
-
-def job(G,row,adjacent):
-    new_edge = (row.geonameid,adjacent)
-    if not G.has_edge(*new_edge):
-        if IS_DISTANCE and DISTANCE == "itinerary":
-            return (*new_edge,getDistanceSDict(new_edge[0],new_edge[1],distance_dict))
-        elif IS_DISTANCE and DISTANCE == "euclidean":
-            raise NotImplementedError()
-        else:
-            return 1
-# Using Route Distance
-G = nx.Graph()
-for ix,row in tqdm(manche_gdf["geonameid adjacent_entity".split()].iterrows(),total=len(manche_gdf)):
-    new_edges = Parallel(n_jobs=4,backend="threading")(delayed(job)(G,row,adjacent) for adjacent in row.adjacent_entity)
-    for edge in new_edges:
-        if edge:
-            G.add_edge(edge[0],edge[1],weight=edge[2])
-
-# Data for graph projection
-lon_dict= dict(manche_gdf["geonameid longitude".split()].values)
-lat_dict= dict(manche_gdf["geonameid latitude".split()].values)
-pos= {n:[lon_dict[n],lat_dict[n]]for n in G.nodes()}
-
-nx.draw(G,pos=pos,node_size=1)
-# plt.show()
-
-for ed in list(G.edges()):
-    G[ed[0]][ed[1]]["weight"]+=1 # problem when G[ed[0]][ed[1]]["weight"]==0:
-
-if IS_NOISE:
-    H = G.copy()
-    edges,weights = zip(*nx.get_edge_attributes(H,'weight').items())
-    sample = random.sample(list(H.nodes()),NUMBER_OF_NODE_DESPATIALIZED)
-    H.remove_nodes_from(sample)
-
-    pos= {n:[lon_dict[n],lat_dict[n]] for n in H.nodes()}
-    nx.draw(H,pos=pos,node_size=1)
-    # plt.show()
-
-    label_dict = dict(manche_gdf["geonameid name".split()].values)
-    embeddings = dict(pd.read_msgpack("data/embeddings/geonamesFRWithEmbeddings.msg")["geonameid embedding".split()].values)
-
-    ids,emb = zip(*embeddings.items())
-    id2geonameid = dict(enumerate(ids))
-    geonameid2id = {id_:ix for ix, id_ in enumerate(ids) }
-    emb_matrix = np.asarray(emb)
-
-    from sklearn.metrics.pairwise import cosine_similarity
-    sim_matrix = cosine_similarity(emb)
-    top_n = np.argsort(sim_matrix)[:,-3:]
-
-    for ix,n in enumerate(sample):
-        top_i = [id2geonameid[top] for top in top_n[geonameid2id[n]]]
-        weights_i = [sim_matrix[geonameid2id[n]][top]for top in top_n[geonameid2id[n]]]
-        for ij,top_ij in enumerate(top_i):
-            H.add_edge(n,top_ij,weight=weights_i[ij])
-
-G = H.copy()
-
-node2vec = Node2Vec(G, dimensions=VECTOR_SIZE, walk_length=WALK_LENGTH, num_walks=NUMBER_OF_WALK, workers=NUMBER_OF_CPU_USED,temp_folder="temp") # Use temp_folder for big graphs
-model = node2vec.fit(window=WORD2VEC_WINDOW, min_count=1, batch_words=NUMBER_OF_CPU_USED)
-
-# Saving the embedding model
-if not IS_NOISE:
-    model.save("{filename}_{dim}_{walk_l}_{num_walk}_{window}.bin".format(dim = VECTOR_SIZE,
-                        walk_l = WALK_LENGTH,
-                        num_walk = NUMBER_OF_WALK,
-                        window = WORD2VEC_WINDOW,filename=GEONAMES_FN.split("/")[-1] ))#,noise = NUMBER_OF_NODE_DESPATIALIZED))
-else:
-    model.save("{filename}_{dim}_{walk_l}_{num_walk}_{window}_{noise}.bin".format(dim = VECTOR_SIZE,
-                        walk_l = WALK_LENGTH,
-                        num_walk = NUMBER_OF_WALK,
-                        window = WORD2VEC_WINDOW,noise = NUMBER_OF_NODE_DESPATIALIZED,filename=GEONAMES_FN.split("/")[-1]))
\ No newline at end of file
diff --git a/helpers.py b/helpers.py
deleted file mode 100755
index 825dd4a6d522be2770618fe78891d55d745df298..0000000000000000000000000000000000000000
--- a/helpers.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import os
-import time
-
-import pandas as pd
-
-import matplotlib.pyplot as plt
-
-def read_geonames(file):
-    """
-    Return a dataframe that contains Geonames data.
-
-    Parameters
-    ----------
-    file : str
-        path of the Geonames Csv file
-
-    Returns
-    -------
-    pd.DataFrame
-        geonames data
-    """
-    dtypes_dict = {
-        0: int, # geonameid
-        1: str, # name
-        2: str, # asciiname
-        3: str, # alternatenames
-        4: float, # latitude
-        5: float, # longitude
-        6: str, # feature class
-        7: str, # feature code
-        8: str, # country code
-        9: str, # cc2
-        10: str, # admin1 code
-        11: str, # admin2 code
-        12: str, # admin3 code
-        13: str, # admin4 code
-        14: int, # population
-        15: str, # elevation
-        16: int, # dem (digital elevation model)
-        17: str, # timezone
-        18: str # modification date yyyy-MM-dd
-    }
-    rename_cols = {
-        0:"geonameid", # geonameid
-        1:"name", # name
-        2:"asciiname", # asciiname
-        3:"alternatenames", # alternatenames
-        4:"latitude", # latitude
-        5:"longitude", # longitude
-        6:"feature_class", # feature class
-        7:"feature_code", # feature code
-        8:"country_code", # country code
-        9:"cc2", # cc2
-        10:"admin1_code", # admin1 code
-        11:"admin2_code", # admin2 code
-        12:"admin3_code", # admin3 code
-        13:"admin4_code", # admin4 code
-        14:"population", # population
-        15:"elevation", # elevation
-        16:"dem", # dem (digital elevation model)
-        17:"timezone", # timezone
-        18:"modification_date" # modification date yyyy-MM-dd
-    }
-    data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False)
-    data.rename(columns=rename_cols,inplace=True)
-    return data
-
-def plot_accuracy_from_history(model_name,history_data,output_layer_name,outpu_filename,parameter_string,output_dirname="outputs",validation=True,show=False):
-    # Plot training & validation loss values
-    plt.gcf()
-    plt.gca()
-    plt.plot(history_data['{0}_accuracy'.format(output_layer_name)].values,label="Train Data")
-    if validation:
-        plt.plot(history_data['val_{0}_accuracy'.format(output_layer_name)].values,label = "Test Data")
-    plt.title('Layer {0} accuracy'.format(output_layer_name))
-    plt.ylabel('Accuracy')
-    plt.xlabel('Epoch')
-    plt.ylim((0,1.1)) #1.1 if accuracy = 1
-    plt.legend()
-    plt.savefig("outputs/{0}_{1}_{2}.png".format(model_name,parameter_string,output_layer_name,))
-    if show :
-        plt.show()
-
-
-def save_embedding(model,tokenizer,layer_idx,fn):
-    embedding_matrix = model.get_weights()[0]
-    with open(os.path.join(fn), 'w') as f:
-        for word, i in tokenizer.word_index.items():
-            f.write(word)
-            for i in embedding_matrix[i]: f.write(' ' + repr(i))
-            f.write('\n')
-
-
-
-class Chronometer():
-    def __init__(self):
-        self.__task_begin_timestamp = {}
-
-    def start(self,task_name):
-        """
-        Start a new task chronometer
-
-        Parameters
-        ----------
-        task_name : str
-            task id
-
-        Raises
-        ------
-        ValueError
-            if a running task already exists with that name
-        """
-        if task_name in self.__task_begin_timestamp:
-            raise ValueError("A running task exists with the name {0}!".format(task_name))
-        self.__task_begin_timestamp[task_name] = time.time()
-
-    def stop(self,task_name):
-        """
-        Stop and return the duration of the task
-
-        Parameters
-        ----------
-        task_name : str
-            task id
-
-        Returns
-        -------
-        float
-            duration of the task in seconds
-
-        Raises
-        ------
-        ValueError
-            if no task exist with the id `task_name`
-        """
-        if not task_name in self.__task_begin_timestamp:
-            raise ValueError("The {0} task does not exist!".format(task_name))
-        duration = time.time() - self.__task_begin_timestamp[task_name]
-        del self.__task_begin_timestamp[task_name]
-        return duration
-
-if __name__ == "__main__":
-    chrono = Chronometer()
-    chrono.start("test")
-    chrono.start("test2")
-    time.sleep(3)
-    print(chrono.stop("test"))
-    time.sleep(3)
-    print(chrono.stop("test2"))
\ No newline at end of file
diff --git a/metrics.py b/metrics.py
new file mode 100755
index 0000000000000000000000000000000000000000..e82c54809aa2a6bece60cd74875140d3719c1ea6
--- /dev/null
+++ b/metrics.py
@@ -0,0 +1,37 @@
+import tensorflow as tf
+
+def lat_accuracy(LAT_TOL =1/180.):
+    def accuracy_at_k_lat(y_true, y_pred):
+        """
+        Metric used to measure the accuracy of coordinate prediction. Unlike the standard accuracy metric, a tolerance threshold is applied, since it is (quasi)
+        impossible for a neural network to predict the exact coordinate.
+
+        Parameters
+        ----------
+        y_true : tf.Tensor
+            truth data
+        y_pred : tf.Tensor
+            predicted output
+        """
+        diff = tf.abs(y_true - y_pred)
+        fit = tf.dtypes.cast(tf.less(diff,LAT_TOL),tf.int64)
+        return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64)
+    return accuracy_at_k_lat
+
+def lon_accuracy(LON_TOL=1/360.):
+    def accuracy_at_k_lon(y_true, y_pred):
+        """
+        Metric used to measure the accuracy of coordinate prediction. Unlike the standard accuracy metric, a tolerance threshold is applied, since it is (quasi)
+        impossible for a neural network to predict the exact coordinate.
+
+        Parameters
+        ----------
+        y_true : tf.Tensor
+            truth data
+        y_pred : tf.Tensor
+            predicted output
+        """
+        diff = tf.abs(y_true - y_pred)
+        fit = tf.dtypes.cast(tf.less(diff,LON_TOL),tf.int64)
+        return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64)
+    return accuracy_at_k_lon
\ No newline at end of file
diff --git a/predict_toponym_coordinates.py b/predict_toponym_coordinates.py
index cd836bc800913127cd89ccae5316b9d77acb2d1c..5dcdb7f81a8fbc28826131b5d1680f3647bf6e68 100755
--- a/predict_toponym_coordinates.py
+++ b/predict_toponym_coordinates.py
@@ -3,33 +3,14 @@ import tensorflow as tf
 import keras.backend as K
 from utils import NgramIndex
 
-from flask import Flask
-
-ACCURACY_TOLERANCE = 0.002
-
-def accuracy_at_k(y_true, y_pred):
-    """
-    Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible
-    task for neural network to obtain the exact coordinate.
-
-    Parameters
-    ----------
-    y_true : tf.Tensor
-        truth data
-    y_pred : tf.Tensor
-        predicted output
-    """
-    global ACCURACY_TOLERANCE
-    diff = tf.abs(y_true - y_pred)
-    fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE))
-    return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred)
-
 from tensorflow.python.keras.backend import set_session
 from tensorflow.python.keras.models import load_model
 
 sess = None
 graph = None
 
+from metrics import lat_accuracy,lon_accuracy
+
 class Geocoder(object):
     """
     >>>geocoder = Geocoder("LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","index_4gram_FR_backup.txt")
@@ -42,10 +23,10 @@ class Geocoder(object):
     def __init__(self,keras_model_fn,ngram_index_file):
         global sess
         global graph
-        sess = tf.Session()
-        graph = tf.get_default_graph()
+        sess = tf.compat.v1.Session()
+        graph = tf.compat.v1.get_default_graph()
         set_session(sess)
-        self.keras_model = load_model(keras_model_fn,custom_objects={"accuracy_at_k":accuracy_at_k})
+        self.keras_model = load_model(keras_model_fn,custom_objects={"lat_accuracy":lat_accuracy,"lon_accuracy":lon_accuracy})
         self.ngram_encoder = NgramIndex.load(ngram_index_file)
 
     def get_coord(self,toponym,context_toponym):
@@ -80,29 +61,20 @@ class Geocoder(object):
         ax.plot(lon,lat,marker='o', color='red', markersize=5)
         plt.show()
 
 
+if __name__ == "__main__":
+    from flask import Flask, escape, request, render_template
 
-"""geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")
-lon,lat = geocoder.get_coord("Paris","New-York")
-lon,lat = geocoder.wgs_coord(lon,lat)
-geocoder.plot_coord("Paris,New-York",lat,lon,interactive_map=True)"""
-
-from flask import Flask, escape, request, render_template
-
-app = Flask(__name__)
-
-
+    app = Flask(__name__)
 
-# IMPORTANT: models have to be loaded AFTER SETTING THE SESSION for keras!
-# Otherwise, their weights will be unavailable in the threads after the session there has been set
-geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")
+    geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")
 
-@app.route('/',methods=["GET"])
-def display():
-    toponym = request.args.get("top", "Paris")
-    c_toponym = request.args.get("c_top", "Cherbourg")
-    lon,lat = geocoder.get_coord(toponym,c_toponym)
-    lon,lat = geocoder.wgs_coord(lon,lat)
-    return render_template("skeleton.html",lat=lat,lon=lon)
+    @app.route('/',methods=["GET"])
+    def display():
+        toponym = request.args.get("top", "Paris")
+        c_toponym = request.args.get("c_top", "Cherbourg")
+        lon,lat = geocoder.get_coord(toponym,c_toponym)
+        lon,lat = geocoder.wgs_coord(lon,lat)
+        return render_template("skeleton.html",lat=lat,lon=lon)
 
-app.run(host='0.0.0.0')
\ No newline at end of file
+    app.run(host='0.0.0.0')
\ No newline at end of file
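Review note: the relocated metrics are consumed in two places — combination_embeddings.py imports them for training, and predict_toponym_coordinates.py passes them to load_model via custom_objects. Below is a minimal sketch of how they would typically be attached at compile time; the toy two-output model, its layer names (output_lat, output_lon) and the input shape are illustrative assumptions, not part of this diff.

```python
# Illustrative sketch only: a stand-in two-output Keras model compiled with the
# tolerance-based metrics now provided by metrics.py. Layer names and shapes are
# hypothetical; only the metric wiring reflects this patch.
import tensorflow as tf
from metrics import lat_accuracy, lon_accuracy

inputs = tf.keras.Input(shape=(100,), name="toponym_ngrams")   # hypothetical input
hidden = tf.keras.layers.Dense(64, activation="relu")(inputs)
out_lat = tf.keras.layers.Dense(1, name="output_lat")(hidden)  # hypothetical output names
out_lon = tf.keras.layers.Dense(1, name="output_lon")(hidden)
model = tf.keras.Model(inputs, [out_lat, out_lon])

# lat_accuracy()/lon_accuracy() are factories: calling them returns the actual
# metric functions (closures over the 1/180 and 1/360 degree tolerances).
model.compile(optimizer="adam",
              loss="mean_squared_error",
              metrics={"output_lat": [lat_accuracy()], "output_lon": [lon_accuracy()]})
```

When a model compiled this way is saved and reloaded, the custom_objects mapping used in predict_toponym_coordinates.py ({"lat_accuracy": lat_accuracy, "lon_accuracy": lon_accuracy}) is what is intended to let Keras resolve these non-built-in metrics.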