Commit 5bb71e79 authored by Jacques Fize

DEBUG + MOVE GEONAME EMBEDDING TO ANOTHER REPO

parent 486a0f91
@@ -146,4 +146,6 @@ test_comb.sh
 notes.md
 .idea/*
-.vscode/*
\ No newline at end of file
+.vscode/*
+other/*
+test*
\ No newline at end of file
@@ -21,6 +21,7 @@ from shapely.geometry import Point
 from helpers import read_geonames
 from utils import Grid
 from utils import zero_one_encoding, NgramIndex,ConfigurationReader
+from metrics import lat_accuracy,lon_accuracy

 # Logging
 from tqdm import tqdm
@@ -73,41 +74,7 @@ def get_new_ids(cooc_data,id_first_value):
         topo_id[id_]=interlink
     return topo_id

-def lat_accuracy(LAT_TOL=1/180.):
-    def accuracy_at_k_lat(y_true, y_pred):
-        """
-        Metric measuring the accuracy of coordinate prediction. Unlike standard
-        accuracy, a tolerance threshold is applied, since it is (quasi)
-        impossible for a neural network to predict the exact coordinate.
-
-        Parameters
-        ----------
-        y_true : tf.Tensor
-            ground-truth data
-        y_pred : tf.Tensor
-            predicted output
-        """
-        diff = tf.abs(y_true - y_pred)
-        fit = tf.dtypes.cast(tf.less(diff,LAT_TOL),tf.int64)
-        return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64)
-    return accuracy_at_k_lat
-
-def lon_accuracy(LON_TOL=1/360.):
-    def accuracy_at_k_lon(y_true, y_pred):
-        """
-        Metric measuring the accuracy of coordinate prediction. Unlike standard
-        accuracy, a tolerance threshold is applied, since it is (quasi)
-        impossible for a neural network to predict the exact coordinate.
-
-        Parameters
-        ----------
-        y_true : tf.Tensor
-            ground-truth data
-        y_pred : tf.Tensor
-            predicted output
-        """
-        diff = tf.abs(y_true - y_pred)
-        fit = tf.dtypes.cast(tf.less(diff,LON_TOL),tf.int64)
-        return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64)
-    return accuracy_at_k_lon
-
 # LOGGING CONF
 logging.basicConfig(
@@ -118,7 +85,7 @@ logging.basicConfig(
 chrono = Chronometer()

 args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
-    .parse_args("-n 4 -t 0.002 -e 20 -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
+    .parse_args()  # ("-n 4 -t 0.002 -e 20 -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())

 # Initialise CONSTANTS
 GEONAME_FN = args.geoname_input
...

# ===== geoname embedding generation script =====
# PYTHON MODULE
import math
import random
from argparse import ArgumentParser, RawTextHelpFormatter
from multiprocessing import cpu_count
# COMMON DATA STRUCTURE MODULE
import numpy as np
import pandas as pd
import networkx as nx
# SPATIAL DATA MANIPULATION
import geopandas as gpd
import osrm
osrm.RequestConfig.host = "jacquesfize.com:5000"
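# NOTE: this points at the author's OSRM instance; swap in your own server
# (see the docker commands further below) before running the script.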
from shapely.geometry import Point
# Machine Learning MODULE
from node2vec import Node2Vec
# VISUALISATION MODULE
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()
# PERSONAL FUNCTION
from helpers import *
parser = ArgumentParser(description='Generate a spatial embedding of places using Geonames data', formatter_class=RawTextHelpFormatter)
parser.add_argument("input")
parser.add_argument("--nbcpu",type=int,default=cpu_count())
parser.add_argument("--vector-size",type=int,default=64,help="Output Vector Dimension")
parser.add_argument("--walk-length",type=int,default=30, help="Size of the walk generated during the Node2vec algorithm")
parser.add_argument("--num-walks",type=int,default=200, help="Number of walk generated during the Node2vec algorithm")
parser.add_argument("--word2vec-window-size",type=int,default=30, help="Window size used in the Word2vec algorithm")
parser.add_argument("--buffer-size",type=float,default=0.03,help="Buffer size to transform Point in Polygon. Used for adjacency matrix computation.")
parser.add_argument("-d",action="store_true",help="Integrate the distance weight between vertices")
parser.add_argument("--dist",choices=["euclidean","itinerary"],default="itinerary",help="""Two distance functions are available:
- Euclidean : Euclidean distance between the two places centroids
- Itinerary : Compute the itinerary distance between two places using an OSRM service
""")
parser.add_argument("--noise",action="store_true")
parser.add_argument("--noise-size",type=int,default=500)
args = parser.parse_args()
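
# Example invocation (hypothetical script name and paths):
#   python geonames_embedding.py data/geonamesData/FR.txt --vector-size 64 --num-walks 200 --dist itinerary -d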
# INPUT DATA
GEONAMES_FN = args.input
# PARALLELISM OPTION
NUMBER_OF_CPU_USED = args.nbcpu
# Graph Embedding parameter
VECTOR_SIZE = args.vector_size
WALK_LENGTH = args.walk_length
NUMBER_OF_WALK = args.num_walks
WORD2VEC_WINDOW = args.word2vec_window_size
# GRAPH WEIGHT PARAMETER
IS_DISTANCE = args.d
DISTANCE = args.dist
# Coefficients used when simulating new toponyms
GEO_DISTANCE_COEF = 0.5
EMBEDDING_DISTANCE_COEF = 0.5
# New toponym simulation
IS_NOISE = args.noise
NUMBER_OF_NODE_DESPATIALIZED = args.noise_size
# DISTANCE CACHE STORAGE
from sqlitedict import SqliteDict
distance_dict = SqliteDict('./data/distance_dict.sqlite', autocommit=True)
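# autocommit=True persists every cached distance immediately, so an interrupted
# run can resume without re-querying OSRM for pairs already computed.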
# LOAD GEONAMES DATA
df = read_geonames(GEONAMES_FN)
df["geometry"] = df["latitude longitude".split()].progress_apply(lambda x:Point(x.longitude,x.latitude),axis=1)
# Create GeoDataFrame for faster spatial comparison operations
gdf = gpd.GeoDataFrame(df)
# Select the sample corresponding to the département of La Manche (admin2 code "50")
manche_gdf = gdf[gdf.admin2_code == "50"].copy()
manche_gdf["geometry"] = manche_gdf.geometry.buffer(args.buffer_size)
manche_gdf.plot()
# plt.show()
# Build an adjacency matrix to generate the graph used for the embedding generation
N = len(manche_gdf)
adjacency_matrix = np.zeros((N,N))
geometries = manche_gdf.geometry.values
for i in tqdm(range(N)):
    for j in range(i,N):
        adjacency_matrix[i,j] = geometries[i].intersects(geometries[j])
        adjacency_matrix[j,i] = adjacency_matrix[i,j]
plt.clf()
plt.imshow(adjacency_matrix);plt.colorbar()
# plt.show()
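
# The O(N^2) double loop above is fine for a single departement but scales poorly.
# A sketch of a faster variant using Shapely's STRtree spatial index (assumes
# Shapely >= 2.0, where query() returns integer indices):
#   from shapely.strtree import STRtree
#   tree = STRtree(geometries)
#   for i in range(N):
#       for j in tree.query(geometries[i]):           # bounding-box candidates only
#           if geometries[i].intersects(geometries[j]):
#               adjacency_matrix[i, j] = adjacency_matrix[j, i] = 1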
# Map between matrix indices and geonameids
manche_gdf["code_matrix"] = np.arange(N)
geoname_id2idxmat = dict(manche_gdf["geonameid code_matrix".split()].values)
idxmat2geoname_id = {v: k for k, v in geoname_id2idxmat.items()}
# Add the adjacent entities found in the GeoDataFrame
def get_adjacent_entity(x):
    idxs = np.nonzero(adjacency_matrix[geoname_id2idxmat[x]])[0]
    return [idxmat2geoname_id[idx] for idx in idxs if not idxmat2geoname_id[idx] == x]  # exclude the entity itself
manche_gdf["adjacent_entity"] = manche_gdf.geonameid.apply(get_adjacent_entity)
# Code for getting the distance using the road network (not euclidean) PART 1
manche_gdf["geometry_centroid"]=manche_gdf.centroid
coords = dict(manche_gdf["geonameid geometry_centroid".split()].values)
# Code for getting the distance using the road network (not euclidean) PART 2
# Run an OSRM SERVER
#https://hub.docker.com/r/osrm/osrm-backend/
#docker run -t -p 5000:5000 -v $(pwd):/data osrm/osrm-backend osrm-extract -p /opt/car.lua /data/road.pbf
#docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-partition /data/road.osrm
#docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-customize /data/road.osrm
#docker run -t -i -p 5000:5000 -v "${PWD}:/data" osrm/osrm-backend osrm-routed --algorithm mld /data/road.osrm
# Check Also : https://github.com/ustroetz/python-osrm#route
# Test: curl 'http://<yourserver>:5000/route/v1/driving/-1.37,49.38;-1.37,49.0?steps=true'   (OSRM expects lon,lat pairs)
def getTupCoords(id_):
    """Return [longitude, latitude] for a geonameid (the order OSRM expects)."""
    return [coords[id_].x, coords[id_].y]

def getDistance(id_1, id_2):
    try:
        return osrm.simple_route(getTupCoords(id_1), getTupCoords(id_2), output="route", overview="full", steps=False, geometry="wkt")[0]["distance"]
    except IndexError:  # no route found between the two places
        return -1

def signature(id_1, id_2):
    """Order-independent key for a pair of geonameids."""
    return "_".join([str(id_) for id_ in sorted([id_1, id_2])])

def getDistanceSDict(id_1, id_2, sqlite_dict):
    hash_ = signature(id_1, id_2)
    if not hash_ in sqlite_dict:
        sqlite_dict[hash_] = getDistance(id_1, id_2)
    return sqlite_dict[hash_]
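
# Usage sketch (the geonameids are placeholders): the first call queries OSRM
# and caches the result; later calls for the same unordered pair hit the cache.
#   d = getDistanceSDict(3003603, 3031582, distance_dict)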
from joblib import Parallel, delayed  # for parallel job computation

def job(G, row, adjacent):
    """Return a (u, v, weight) triple for a new edge, or None if the edge already exists."""
    new_edge = (row.geonameid, adjacent)
    if not G.has_edge(*new_edge):
        if IS_DISTANCE and DISTANCE == "itinerary":
            return (*new_edge, getDistanceSDict(new_edge[0], new_edge[1], distance_dict))
        elif IS_DISTANCE and DISTANCE == "euclidean":
            raise NotImplementedError()
        else:
            return (*new_edge, 1)  # default unit weight
# Build the adjacency graph (edges weighted by itinerary distance when -d is set)
G = nx.Graph()
for ix, row in tqdm(manche_gdf["geonameid adjacent_entity".split()].iterrows(), total=len(manche_gdf)):
    new_edges = Parallel(n_jobs=4, backend="threading")(delayed(job)(G, row, adjacent) for adjacent in row.adjacent_entity)
    for edge in new_edges:
        if edge:
            G.add_edge(edge[0], edge[1], weight=edge[2])
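# backend="threading" is deliberate here: the workers must share the same graph
# object G for the has_edge() check, and the work (OSRM requests, SQLite
# lookups) is I/O-bound, so threads suffice despite the GIL.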
# Data for graph projection
lon_dict= dict(manche_gdf["geonameid longitude".split()].values)
lat_dict= dict(manche_gdf["geonameid latitude".split()].values)
pos= {n:[lon_dict[n],lat_dict[n]]for n in G.nodes()}
nx.draw(G,pos=pos,node_size=1)
# plt.show()
for ed in list(G.edges()):
    G[ed[0]][ed[1]]["weight"] += 1  # shift all weights by 1: zero-weight edges break the random-walk transition probabilities
if IS_NOISE:
    # Simulate "new" toponyms: remove the spatial edges of a random sample of nodes...
    H = G.copy()
    edges, weights = zip(*nx.get_edge_attributes(H, 'weight').items())
    sample = random.sample(list(H.nodes()), NUMBER_OF_NODE_DESPATIALIZED)
    H.remove_nodes_from(sample)

    pos = {n: [lon_dict[n], lat_dict[n]] for n in H.nodes()}
    nx.draw(H, pos=pos, node_size=1)
    # plt.show()

    label_dict = dict(manche_gdf["geonameid name".split()].values)

    # ...then reconnect each sampled node to its most similar places in a
    # pre-computed embedding space (pd.read_msgpack requires pandas < 1.0).
    embeddings = dict(pd.read_msgpack("data/embeddings/geonamesFRWithEmbeddings.msg")["geonameid embedding".split()].values)
    ids, emb = zip(*embeddings.items())
    id2geonameid = dict(enumerate(ids))
    geonameid2id = {id_: ix for ix, id_ in enumerate(ids)}
    emb_matrix = np.asarray(emb)

    from sklearn.metrics.pairwise import cosine_similarity
    sim_matrix = cosine_similarity(emb_matrix)
    # NB: the top 3 includes the node itself, since self-similarity is maximal
    top_n = np.argsort(sim_matrix)[:, -3:]

    for ix, n in enumerate(sample):
        top_i = [id2geonameid[top] for top in top_n[geonameid2id[n]]]
        weights_i = [sim_matrix[geonameid2id[n]][top] for top in top_n[geonameid2id[n]]]
        for ij, top_ij in enumerate(top_i):
            H.add_edge(n, top_ij, weight=weights_i[ij])

    G = H.copy()
node2vec = Node2Vec(G, dimensions=VECTOR_SIZE, walk_length=WALK_LENGTH, num_walks=NUMBER_OF_WALK, workers=NUMBER_OF_CPU_USED,temp_folder="temp") # Use temp_folder for big graphs
model = node2vec.fit(window=WORD2VEC_WINDOW, min_count=1, batch_words=NUMBER_OF_CPU_USED)
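# The object returned by fit() is a gensim Word2Vec model; node2vec stores node
# identifiers as strings. A sketch of how the embeddings can be queried
# (geonameid below is a placeholder):
#   vector = model.wv[str(geonameid)]                    # one place's vector
#   neighbours = model.wv.most_similar(str(geonameid), topn=5)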
# Saving the embedding model
if not IS_NOISE:
    model.save("{filename}_{dim}_{walk_l}_{num_walk}_{window}.bin".format(
        filename=GEONAMES_FN.split("/")[-1],
        dim=VECTOR_SIZE,
        walk_l=WALK_LENGTH,
        num_walk=NUMBER_OF_WALK,
        window=WORD2VEC_WINDOW))
else:
    model.save("{filename}_{dim}_{walk_l}_{num_walk}_{window}_{noise}.bin".format(
        filename=GEONAMES_FN.split("/")[-1],
        dim=VECTOR_SIZE,
        walk_l=WALK_LENGTH,
        num_walk=NUMBER_OF_WALK,
        window=WORD2VEC_WINDOW,
        noise=NUMBER_OF_NODE_DESPATIALIZED))

# ===== helpers.py =====
import os
import time
import pandas as pd
import matplotlib.pyplot as plt
def read_geonames(file):
    """
    Return a dataframe that contains Geonames data.

    Parameters
    ----------
    file : str
        path of the Geonames CSV file

    Returns
    -------
    pd.DataFrame
        geonames data
    """
    dtypes_dict = {
        0: int,    # geonameid
        1: str,    # name
        2: str,    # asciiname
        3: str,    # alternatenames
        4: float,  # latitude
        5: float,  # longitude
        6: str,    # feature class
        7: str,    # feature code
        8: str,    # country code
        9: str,    # cc2
        10: str,   # admin1 code
        11: str,   # admin2 code
        12: str,   # admin3 code
        13: str,   # admin4 code
        14: int,   # population
        15: str,   # elevation
        16: int,   # dem (digital elevation model)
        17: str,   # timezone
        18: str    # modification date (yyyy-MM-dd)
    }
    rename_cols = {
        0: "geonameid",
        1: "name",
        2: "asciiname",
        3: "alternatenames",
        4: "latitude",
        5: "longitude",
        6: "feature_class",
        7: "feature_code",
        8: "country_code",
        9: "cc2",
        10: "admin1_code",
        11: "admin2_code",
        12: "admin3_code",
        13: "admin4_code",
        14: "population",
        15: "elevation",
        16: "dem",
        17: "timezone",
        18: "modification_date"
    }
    data = pd.read_csv(file, sep="\t", header=None, quoting=3, dtype=dtypes_dict, na_values='', keep_default_na=False, error_bad_lines=False)
    data.rename(columns=rename_cols, inplace=True)
    return data
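
# Minimal usage sketch (path is a placeholder):
#   df = read_geonames("data/geonamesData/FR.txt")
#   print(df[["geonameid", "name", "latitude", "longitude"]].head())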
def plot_accuracy_from_history(model_name, history_data, output_layer_name, output_filename, parameter_string, output_dirname="outputs", validation=True, show=False):
    # Plot training & validation accuracy values
    plt.gcf()
    plt.gca()
    plt.plot(history_data['{0}_accuracy'.format(output_layer_name)].values, label="Train Data")
    if validation:
        plt.plot(history_data['val_{0}_accuracy'.format(output_layer_name)].values, label="Test Data")
    plt.title('Layer {0} accuracy'.format(output_layer_name))
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.ylim((0, 1.1))  # headroom above accuracy = 1
    plt.legend()
    plt.savefig(os.path.join(output_dirname, "{0}_{1}_{2}.png".format(model_name, parameter_string, output_layer_name)))
    if show:
        plt.show()
def save_embedding(model, tokenizer, layer_idx, fn):
    embedding_matrix = model.get_weights()[layer_idx]  # weight matrix of the embedding layer
    with open(os.path.join(fn), 'w') as f:
        for word, idx in tokenizer.word_index.items():
            f.write(word)
            for component in embedding_matrix[idx]:
                f.write(' ' + repr(component))
            f.write('\n')
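
# The file written above is a plain-text table: one token per line followed by
# its space-separated embedding components (no word2vec-style header line).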
class Chronometer():
    def __init__(self):
        self.__task_begin_timestamp = {}

    def start(self, task_name):
        """
        Start a new task chronometer.

        Parameters
        ----------
        task_name : str
            task id

        Raises
        ------
        ValueError
            if a running task already exists with that name
        """
        if task_name in self.__task_begin_timestamp:
            raise ValueError("A running task exists with the name {0}!".format(task_name))
        self.__task_begin_timestamp[task_name] = time.time()

    def stop(self, task_name):
        """
        Stop and return the duration of the task.

        Parameters
        ----------
        task_name : str
            task id

        Returns
        -------
        float
            duration of the task in seconds

        Raises
        ------
        ValueError
            if no task exists with the id `task_name`
        """
        if not task_name in self.__task_begin_timestamp:
            raise ValueError("The {0} task does not exist!".format(task_name))
        duration = time.time() - self.__task_begin_timestamp[task_name]
        del self.__task_begin_timestamp[task_name]
        return duration

if __name__ == "__main__":
    chrono = Chronometer()
    chrono.start("test")
    chrono.start("test2")
    time.sleep(3)
    print(chrono.stop("test"))
    time.sleep(3)
    print(chrono.stop("test2"))
# ===== metrics.py =====
import tensorflow as tf

def lat_accuracy(LAT_TOL=1/180.):
    def accuracy_at_k_lat(y_true, y_pred):
        """
        Metric measuring the accuracy of latitude prediction. Unlike standard
        accuracy, a tolerance threshold is applied, since it is (quasi)
        impossible for a neural network to predict the exact coordinate.

        Parameters
        ----------
        y_true : tf.Tensor
            ground-truth data
        y_pred : tf.Tensor
            predicted output
        """
        diff = tf.abs(y_true - y_pred)
        fit = tf.dtypes.cast(tf.less(diff, LAT_TOL), tf.int64)
        return tf.reduce_sum(fit) / tf.size(y_pred, out_type=tf.dtypes.int64)
    return accuracy_at_k_lat

def lon_accuracy(LON_TOL=1/360.):
    def accuracy_at_k_lon(y_true, y_pred):
        """
        Metric measuring the accuracy of longitude prediction. Unlike standard
        accuracy, a tolerance threshold is applied, since it is (quasi)
        impossible for a neural network to predict the exact coordinate.

        Parameters
        ----------
        y_true : tf.Tensor
            ground-truth data
        y_pred : tf.Tensor
            predicted output
        """
        diff = tf.abs(y_true - y_pred)
        fit = tf.dtypes.cast(tf.less(diff, LON_TOL), tf.int64)
        return tf.reduce_sum(fit) / tf.size(y_pred, out_type=tf.dtypes.int64)
    return accuracy_at_k_lon
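
# A sketch of typical use when compiling a Keras model that predicts zero-one
# encoded coordinates (with the default tolerances above, the threshold amounts
# to ~1 degree once latitude/longitude are rescaled to [0,1]):
#   model.compile(loss="mean_squared_error", optimizer="adam",
#                 metrics=[lat_accuracy(), lon_accuracy()])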
@@ -3,33 +3,14 @@ import tensorflow as tf
 import keras.backend as K
 from utils import NgramIndex

-from flask import Flask
-
-ACCURACY_TOLERANCE = 0.002
-
-def accuracy_at_k(y_true, y_pred):
-    """
-    Metric measuring the accuracy of coordinate prediction. Unlike standard
-    accuracy, a tolerance threshold is applied, since it is (quasi)
-    impossible for a neural network to predict the exact coordinate.
-
-    Parameters
-    ----------
-    y_true : tf.Tensor
-        ground-truth data
-    y_pred : tf.Tensor
-        predicted output
-    """
-    global ACCURACY_TOLERANCE
-    diff = tf.abs(y_true - y_pred)
-    fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE))
-    return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred)
-
 from tensorflow.python.keras.backend import set_session
 from tensorflow.python.keras.models import load_model

 sess = None
 graph = None

+from metrics import lat_accuracy,lon_accuracy
+
 class Geocoder(object):
     """
     >>> geocoder = Geocoder("LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","index_4gram_FR_backup.txt")
@@ -42,10 +23,10 @@ class Geocoder(object):
     def __init__(self,keras_model_fn,ngram_index_file):
         global sess
         global graph
-        sess = tf.Session()
-        graph = tf.get_default_graph()
+        sess = tf.compat.v1.Session()
+        graph = tf.compat.v1.get_default_graph()
         set_session(sess)
-        self.keras_model = load_model(keras_model_fn,custom_objects={"accuracy_at_k":accuracy_at_k})
+        self.keras_model = load_model(keras_model_fn,custom_objects={"lat_accuracy":lat_accuracy,"lon_accuracy":lon_accuracy})
         self.ngram_encoder = NgramIndex.load(ngram_index_file)

     def get_coord(self,toponym,context_toponym):
@@ -80,29 +61,20 @@ class Geocoder(object):
         ax.plot(lon,lat,marker='o', color='red', markersize=5)
         plt.show()

-if __name__ == "__main__":
-    """geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")
-    lon,lat = geocoder.get_coord("Paris","New-York")
-    lon,lat = geocoder.wgs_coord(lon,lat)
-    geocoder.plot_coord("Paris,New-York",lat,lon,interactive_map=True)"""
-    from flask import Flask, escape, request, render_template
-    app = Flask(__name__)
+from flask import Flask, escape, request, render_template
+
+app = Flask(__name__)
+
+# IMPORTANT: models have to be loaded AFTER the Keras session is set!
+# Otherwise, their weights will be unavailable in the threads spawned once the session has been set.
 geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")

 @app.route('/',methods=["GET"])
 def display():
     toponym = request.args.get("top", "Paris")
     c_toponym = request.args.get("c_top", "Cherbourg")
     lon,lat = geocoder.get_coord(toponym,c_toponym)
     lon,lat = geocoder.wgs_coord(lon,lat)
     return render_template("skeleton.html",lat=lat,lon=lon)

 app.run(host='0.0.0.0')
\ No newline at end of file