From 04d402c0cf7b2df2a727d149729c7606db02b98e Mon Sep 17 00:00:00 2001
From: jfize <jacques.fize@insa-lyon.fr>
Date: Thu, 6 Feb 2020 16:06:09 +0100
Subject: [PATCH] DEBUG + ADD DEMO APP

---
 combination_embeddings.py         |   7 +-
 predict_toponym_coordinates.py    | 108 ++++++++++++++++++++++++++++++
 requirements.txt                  |   4 +-
 scripts/evalgeonamesembeddings.py |  70 +++++++++++++++++++
 templates/skeleton.html           |  88 ++++++++++++++++++++++++
 5 files changed, 274 insertions(+), 3 deletions(-)
 create mode 100644 predict_toponym_coordinates.py
 create mode 100644 scripts/evalgeonamesembeddings.py
 create mode 100644 templates/skeleton.html

diff --git a/combination_embeddings.py b/combination_embeddings.py
index 9e2f03f..d2d13a5 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -98,7 +98,7 @@ logging.basicConfig(
 chrono = Chronometer()
 
 args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
-    .parse_args("-n 4 -t 0.002 -e 20 -a -w -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
+    .parse_args()#("-n 4 -t 0.002 -e 20 -a -w -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
 
 # Initialisee CONSTANTS
 GEONAME_FN = args.geoname_input
@@ -221,7 +221,7 @@ geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) #in
 if args.wikipedia_cooc:
     geoname2encodedname.update(extension)
 
-index.save("outputs/index_{0}gram_{1}".format(NGRAM_SIZE,GEONAME_FN.split("/")[-1]))
+
 logging.info("Done !")
 
 #CLEAR RAM
@@ -291,6 +291,9 @@ if args.inclusion:
 if args.wikipedia_cooc:
     name += "_C"
 
+index.save("outputs/"+name+"_index")
+
+
 # NGRAM EMBDEDDING
 logging.info("Generating N-GRAM Embedding...")
 embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=50)
diff --git a/predict_toponym_coordinates.py b/predict_toponym_coordinates.py
new file mode 100644
index 0000000..cd836bc
--- /dev/null
+++ b/predict_toponym_coordinates.py
@@ -0,0 +1,108 @@
+from keras.models import load_model
+import tensorflow as tf
+import keras.backend as K
+from utils import NgramIndex
+
+from flask import Flask
+
+ACCURACY_TOLERANCE = 0.002
+
+def accuracy_at_k(y_true, y_pred):
+    """
+    Metric used to measure the accuracy of the coordinate prediction. Compared to the usual accuracy metric, a tolerance threshold is added because it is (quasi) impossible
+    for a neural network to predict the exact coordinates.
+
+    Parameters
+    ----------
+    y_true : tf.Tensor
+        truth data
+    y_pred : tf.Tensor
+        predicted output
+    """
+    global ACCURACY_TOLERANCE
+    diff = tf.abs(y_true - y_pred)
+    fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE))
+    return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred)
+
+from tensorflow.python.keras.backend import set_session
+from tensorflow.python.keras.models import load_model
+
+sess = None
+graph = None
+
+class Geocoder(object):
+    """
+    >>> geocoder = Geocoder("LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","index_4gram_FR_backup.txt")
+    >>> lon,lat = geocoder.get_coord("Paris","New-York")
+    >>> lon,lat = geocoder.wgs_coord(lon,lat)
+    >>> geocoder.plot_coord("Paris,New-York",lat,lon)
+
+    For an interactive map using LeafletJS, set the `interactive_map` parameter of `Geocoder.plot_coord()` to True.
+    """
+    def __init__(self,keras_model_fn,ngram_index_file):
+        global sess
+        global graph
+        sess = tf.Session()
+        graph = tf.get_default_graph()
+        set_session(sess)
+        self.keras_model = load_model(keras_model_fn,custom_objects={"accuracy_at_k":accuracy_at_k})
+        self.ngram_encoder = NgramIndex.load(ngram_index_file)
+
+    def get_coord(self,toponym,context_toponym):
+        global sess
+        global graph
+        p = self.ngram_encoder.complete(self.ngram_encoder.encode(toponym),self.ngram_encoder.max_len)
+        c = self.ngram_encoder.complete(self.ngram_encoder.encode(context_toponym),self.ngram_encoder.max_len)
+        with sess.as_default():
+            with graph.as_default():
+                lon,lat = self.keras_model.predict([[p],[c]])
+        return lon[0][0],lat[0][0]
+
+    def wgs_coord(self,lon,lat):
+        return ((lon*360)-180),((lat*180)-90)
+
+    def plot_coord(self,toponym,lat,lon,interactive_map=False,**kwargs):
+        if interactive_map:
+            import folium
+            import tempfile
+            import webbrowser
+            fp = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
+            m = folium.Map()
+            folium.Marker([lat, lon], popup=toponym).add_to(m)
+            m.save(fp.name)
+            webbrowser.open('file://' + fp.name)
+        else:
+            import matplotlib.pyplot as plt
+            import geopandas
+            fig, ax = plt.subplots(1,**kwargs)
+            world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
+            world.plot(color='white', edgecolor='black',ax=ax)
+            ax.plot(lon,lat,marker='o', color='red', markersize=5)
+            plt.show()
+
+
+"""geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")
+lon,lat = geocoder.get_coord("Paris","New-York")
+lon,lat = geocoder.wgs_coord(lon,lat)
+geocoder.plot_coord("Paris,New-York",lat,lon,interactive_map=True)"""
+
+from flask import Flask, request, render_template
+
+app = Flask(__name__)
+
+
+# IMPORTANT: models have to be loaded AFTER SETTING THE SESSION for keras!
+# Otherwise, their weights will be unavailable in the request threads once the session has been set there
+
+geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")
+
+@app.route('/',methods=["GET"])
+def display():
+    toponym = request.args.get("top", "Paris")
+    c_toponym = request.args.get("c_top", "Cherbourg")
+    lon,lat = geocoder.get_coord(toponym,c_toponym)
+    lon,lat = geocoder.wgs_coord(lon,lat)
+    return render_template("skeleton.html",lat=lat,lon=lon)
+
+if __name__ == "__main__":
+    app.run(host='0.0.0.0')
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index c5b83fd..798a014 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,4 +15,6 @@ keras
 ngram
 shapely
 sqlitedict
-nltk
\ No newline at end of file
+nltk
+folium
+flask
diff --git a/scripts/evalgeonamesembeddings.py b/scripts/evalgeonamesembeddings.py
new file mode 100644
index 0000000..c7d346d
--- /dev/null
+++ b/scripts/evalgeonamesembeddings.py
@@ -0,0 +1,70 @@
+# Evaluation process
+import gensim
+import glob
+import re
+import matplotlib.pyplot as plt
+import random
+from helpers import *
+from scipy.spatial.distance import cosine
+from shapely.geometry import Point
+from scipy.stats import pearsonr
+
+import pandas as pd
+import geopandas as gpd
+
+from tqdm import tqdm
+
+NPAIR = 100000
+fns = glob.glob("data/embeddings/*.bin")
+
+def get_data(fn):
+    data = [int(x) for x in re.findall(r"\d+",fn)]
+    if len(data) > 4:
+        return {"embedding_size":data[0],
+        "walk_length":data[1],
+        "number_of_walks":data[2],
+        "word2vec_window_size":data[3],
+        "filepath":fn,
+        "noise":data[4]
+        }
+        #raise Exception("filename should have 4 integers")
+    return {
+        "embedding_size":data[0],
+        "walk_length":data[1],
+        "number_of_walks":data[2],
+        "word2vec_window_size":data[3],
+        "filepath":fn
+    }
+
+df = read_geonames("./data/geonamesData/FR.txt")
+df["geometry"] = df["latitude longitude".split()].apply(lambda x:Point(x.longitude,x.latitude),axis=1)
+
+# Create GeoDataFrame for faster spatial comparison operations
+gdf = gpd.GeoDataFrame(df)
+
+# Select a sample that concerns the departement "La Manche"
+manche_gdf = gdf[gdf.admin2_code == "50"].copy()
+
+df =pd.DataFrame([get_data(fn) for fn in fns])
+
+def get_pearsons(model):
+    manche_gdf.loc[:,"geometry_centroid"]=manche_gdf.centroid
+    coords = dict(manche_gdf.loc[:,"geonameid geometry_centroid".split()].values)
+    places = list(coords.keys())
+    geodesic_d = []
+    embeddings_d = []
+    for i in tqdm(range(NPAIR),disable=True):
+        placeA=random.choice(places)
+        placeB=random.choice(places)
+        geodesic_d.append(coords[placeA].distance(coords[placeB]))
+        embeddings_d.append(cosine(model.wv[str(placeA)],model.wv[str(placeB)]))
+    return pearsonr(geodesic_d , embeddings_d) # Compute Pearson correlation and associated p-value
+
+df["pearson"] = df.filepath.apply(lambda x : get_pearsons(gensim.models.KeyedVectors.load(x))[0])
+df.fillna(0,inplace=True)
+df.plot.scatter(x="walk_length", y="pearson",c="noise",cmap='inferno')
+plt.show()
+df.plot.scatter(x="number_of_walks", y="pearson",c="noise",cmap='inferno')
+plt.show()
+df.plot.scatter(x="word2vec_window_size", y="pearson",c="noise",cmap='inferno')
+plt.show()
\ No newline at end of file
diff --git a/templates/skeleton.html b/templates/skeleton.html
new file mode 100644
index 0000000..43fe21d
--- /dev/null
+++ b/templates/skeleton.html
@@ -0,0 +1,88 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <meta http-equiv="X-UA-Compatible" content="ie=edge">
+    <title>Geocoder Interface</title>
+    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css"
+        integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" crossorigin="anonymous">
+
+    <!-- Load Leaflet -->
+    <link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css"
+        integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA=="
+        crossorigin="" />
+    <script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js"
+        integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA=="
+        crossorigin=""></script>
+</head>
+
+<body>
+    <style>
+        body {
+
+        }
+
+        #mapid {
+            height: 400px;
+            width: 100%;
+        }
+    </style>
+
+    <main class="container-fluid">
+        <h1 style="text-align: center;color:white;text-shadow: 1px 1px 2px black;background-color: #999;">Geocoder Demo</h1>
+        <div id="mapid"></div>
+        <div class="container" style="background-color: white;padding: 5px;">
+            <h2>Input</h2>
+            <form action="/" method="get">
+                <div class="form-group">
+                    <label for="formGroupExampleInput">Toponym</label>
+                    <input type="text" class="form-control" id="formGroupExampleInput" name="top"
+                        placeholder="Paris">
+                </div>
+                <div class="form-group">
+                    <label for="formGroupExampleInput2">Context Toponym</label>
+                    <input type="text" class="form-control" id="formGroupExampleInput2" name="c_top"
+                        placeholder="Cherbourg">
+                </div>
+                <button type="submit" class="btn btn-primary">Get Coords !</button>
+            </form>
+        </div>
+    </main>
+
+    <!-- JS SCRIPTS -->
+    <script src="https://code.jquery.com/jquery-3.4.1.slim.min.js"
+        integrity="sha384-J6qa4849blE2+poT4WnyKhv5vZF5SrPo0iEjwBvKU7imGFAV0wwj1yYfoRSJoZ+n"
+        crossorigin="anonymous"></script>
+    <script src="https://cdn.jsdelivr.net/npm/popper.js@1.16.0/dist/umd/popper.min.js"
+        integrity="sha384-Q6E9RHvbIyZFJoft+2mJbHaEWldlvI9IOYy5n3zV9zzTtmI3UksdQRVvoxMfooAo"
+        crossorigin="anonymous"></script>
+    <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/js/bootstrap.min.js"
+        integrity="sha384-wfSDF2E50Y2D1uUdj0O3uMBJnjuUD4Ih7YwaYd1iqfktj0Uod8GCExl3Og8ifwB6"
+        crossorigin="anonymous"></script>
+
+    <script>
+
+        // Initialize the map
+        // [50, -0.1] are the latitude and longitude
+        // 4 is the zoom
+        // mapid is the id of the div where the map will appear
+        var mymap = L
+            .map('mapid')
+            .setView([50, -0.1], 4);
+
+        // Add a tile to the map = a background. Comes from OpenStreetmap
+        L.tileLayer(
+            'http://tile.stamen.com/toner/{z}/{x}/{y}.png', {
+                attribution: 'Map data © <a href="https://www.openstreetmap.org/">OpenStreetMap</a>',
+                maxZoom: 6,
+            }).addTo(mymap);
+
+        var marker = L.marker([{{lat}}, {{lon}}]).addTo(mymap);
+
+
+    </script>
+</body>
+
+</html>
\ No newline at end of file
-- 
GitLab