From 04d402c0cf7b2df2a727d149729c7606db02b98e Mon Sep 17 00:00:00 2001
From: jfize <jacques.fize@insa-lyon.fr>
Date: Thu, 6 Feb 2020 16:06:09 +0100
Subject: [PATCH] DEBUG + ADD DEMO APP

---
 combination_embeddings.py         |   7 +-
 predict_toponym_coordinates.py    | 108 ++++++++++++++++++++++++++++++
 requirements.txt                  |   4 +-
 scripts/evalgeonamesembeddings.py |  70 +++++++++++++++++++
 templates/skeleton.html           |  88 ++++++++++++++++++++++++
 5 files changed, 274 insertions(+), 3 deletions(-)
 create mode 100644 predict_toponym_coordinates.py
 create mode 100644 scripts/evalgeonamesembeddings.py
 create mode 100644 templates/skeleton.html

diff --git a/combination_embeddings.py b/combination_embeddings.py
index 9e2f03f..d2d13a5 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -98,7 +98,7 @@ logging.basicConfig(
 chrono = Chronometer()
 
 args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
-    .parse_args("-n 4 -t 0.002 -e 20 -a -w -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
+    .parse_args()#("-n 4 -t 0.002 -e 20 -a -w -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
 
 # Initialisee CONSTANTS
 GEONAME_FN = args.geoname_input
@@ -221,7 +221,7 @@ geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) #in
 if args.wikipedia_cooc:
     geoname2encodedname.update(extension)
 
-index.save("outputs/index_{0}gram_{1}".format(NGRAM_SIZE,GEONAME_FN.split("/")[-1]))
+
 logging.info("Done !")
 
 #CLEAR RAM
@@ -291,6 +291,9 @@ if args.inclusion:
 if args.wikipedia_cooc:
     name += "_C"
 
+# Save the n-gram index in outputs/, under the same `name` prefix built above
+index.save("outputs/"+name+"_index")
+
 # NGRAM EMBDEDDING
 logging.info("Generating N-GRAM Embedding...")
 embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=50)
diff --git a/predict_toponym_coordinates.py b/predict_toponym_coordinates.py
new file mode 100644
index 0000000..cd836bc
--- /dev/null
+++ b/predict_toponym_coordinates.py
@@ -0,0 +1,108 @@
+from keras.models import load_model
+import tensorflow as tf
+import keras.backend as K
+from utils import NgramIndex
+
+from flask import Flask
+
+ACCURACY_TOLERANCE = 0.002
+
+def accuracy_at_k(y_true, y_pred):
+    """
+    Metric used to measure the accuracy of the coordinate prediction. Compared to the usual accuracy metric, a tolerance
+    threshold (ACCURACY_TOLERANCE) is added, because it is (quasi) impossible for a neural network to predict the exact coordinates.
+
+    Parameters
+    ----------
+    y_true : tf.Tensor
+        truth data
+    y_pred : tf.Tensor
+        predicted output
+    """
+    global ACCURACY_TOLERANCE
+    diff = tf.abs(y_true - y_pred)
+    fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE)) # indices of the elements whose absolute error is below the tolerance
+    return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred) # NOTE(review): both ratios count the same rows of `fit` - confirm intent
+
+from tensorflow.python.keras.backend import set_session
+from tensorflow.python.keras.models import load_model
+
+sess = None
+graph = None
+
+class Geocoder(object):
+    """
+    Predict the coordinates of a toponym, given a context toponym, with a trained Keras model and its n-gram index.
+
+    >>> geocoder = Geocoder("LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","index_4gram_FR_backup.txt")
+    >>> lon,lat = geocoder.get_coord("Paris","New-York")
+    >>> lon,lat = geocoder.wgs_coord(lon,lat)
+    >>> geocoder.plot_coord("Paris,New-York",lat,lon)  # pass interactive_map=True for a LeafletJS map
+    """
+    def __init__(self,keras_model_fn,ngram_index_file):
+        """Create the module-level TF session/graph, then load the Keras model and the NgramIndex from their files."""
+        global sess, graph
+        sess = tf.Session()
+        graph = tf.get_default_graph()
+        set_session(sess)  # set before load_model so the model is loaded within this session
+        self.keras_model = load_model(keras_model_fn,custom_objects={"accuracy_at_k":accuracy_at_k})
+        self.ngram_encoder = NgramIndex.load(ngram_index_file)
+
+    def get_coord(self,toponym,context_toponym):
+        """Return the model output (lon, lat) for `toponym` and `context_toponym`, before `wgs_coord` rescaling."""
+        # sess/graph are only read here, so no `global` statement is needed
+        p = self.ngram_encoder.complete(self.ngram_encoder.encode(toponym),self.ngram_encoder.max_len)
+        c = self.ngram_encoder.complete(self.ngram_encoder.encode(context_toponym),self.ngram_encoder.max_len)
+        with sess.as_default():
+            with graph.as_default():
+                lon,lat = self.keras_model.predict([[p],[c]])
+        return lon[0][0],lat[0][0]
+
+    def wgs_coord(self,lon,lat):
+        return ((lon*360)-180),((lat*180)-90)  # lon -> lon*360-180, lat -> lat*180-90 (presumably inputs are in [0,1] - confirm)
+    
+    def plot_coord(self,toponym,lat,lon,interactive_map=False,**kwargs):  # NB: (lat, lon) order; kwargs go to plt.subplots
+        if interactive_map:
+            import folium
+            import tempfile
+            import webbrowser
+            m = folium.Map()
+            folium.Marker([lat, lon], popup=toponym).add_to(m)
+            with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as fp:  # .html so the browser renders it; handle closed on exit
+                m.save(fp.name)
+            webbrowser.open('file://' + fp.name)
+        else:
+            import matplotlib.pyplot as plt
+            import geopandas
+            fig, ax = plt.subplots(1,**kwargs)
+            world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
+            world.plot(color='white', edgecolor='black',ax=ax)
+            ax.plot(lon,lat,marker='o', color='red', markersize=5)
+            plt.show()
+
+
+"""geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")
+lon,lat = geocoder.get_coord("Paris","New-York")
+lon,lat = geocoder.wgs_coord(lon,lat)
+geocoder.plot_coord("Paris,New-York",lat,lon,interactive_map=True)"""
+
+from flask import Flask, escape, request, render_template
+
+app = Flask(__name__)
+
+# IMPORTANT: models have to be loaded AFTER SETTING THE SESSION for keras!
+# Otherwise, their weights will be unavailable in the threads that serve the requests.
+geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")
+
+@app.route('/',methods=["GET"])
+def display():
+    """Geocode the `top` query parameter, using `c_top` as context, and render the result on the map page."""
+    # `or` (not a .get default): the form submits empty strings when its fields are left blank
+    toponym = request.args.get("top") or "Paris"
+    c_toponym = request.args.get("c_top") or "Cherbourg"
+    lon,lat = geocoder.get_coord(toponym,c_toponym)
+    lon,lat = geocoder.wgs_coord(lon,lat)
+    return  render_template("skeleton.html",lat=lat,lon=lon)
+
+if __name__ == "__main__":  # do not start the server when the module is merely imported
+    app.run(host='0.0.0.0')
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index c5b83fd..798a014 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,4 +15,6 @@ keras
 ngram
 shapely
 sqlitedict
-nltk
\ No newline at end of file
+nltk
+folium
+flask
diff --git a/scripts/evalgeonamesembeddings.py b/scripts/evalgeonamesembeddings.py
new file mode 100644
index 0000000..c7d346d
--- /dev/null
+++ b/scripts/evalgeonamesembeddings.py
@@ -0,0 +1,70 @@
+# Evaluation process
+import gensim
+import glob
+import re
+import gensim
+import random
+from helpers import *
+from scipy.spatial.distance import cosine
+from shapely.geometry import Point
+from scipy.stats.stats import pearsonr
+
+import pandas as pd
+import geopandas as gpd
+
+from tqdm import tqdm
+
+NPAIR = 100000
+fns = glob.glob("data/embeddings/*.bin")
+
+def get_data(fn):
+    """
+    Extract the hyper-parameters encoded as integers in an embedding filename.
+
+    The first four integers found in `fn` are read as embedding_size, walk_length,
+    number_of_walks and word2vec_window_size; a fifth one, if any, is stored as "noise".
+    Raises ValueError when `fn` contains fewer than four integers.
+    """
+    data = [int(x) for x in re.findall(r"\d+",fn)]
+    if len(data) < 4:
+        raise ValueError("filename should have at least 4 integers: {0}".format(fn))
+    keys = ["embedding_size","walk_length","number_of_walks","word2vec_window_size"]
+    params = dict(zip(keys,data))
+    params["filepath"] = fn
+    # "noise" is optional: only present when the filename carries a fifth integer
+    if len(data) > 4:
+        params["noise"] = data[4]
+    return params
+    
+df = read_geonames("./data/geonamesData/FR.txt")
+df["geometry"] = df["latitude longitude".split()].apply(lambda x:Point(x.longitude,x.latitude),axis=1)
+
+# Create GeoDataFrame for faster spatial comparison operations
+gdf = gpd.GeoDataFrame(df)
+
+# Select a sample that concerns the departement "La Manche"
+manche_gdf = gdf[gdf.admin2_code == "50"].copy()
+# NOTE: `df` is rebound here: from now on it holds one row of hyper-parameters per embedding file
+df =pd.DataFrame([get_data(fn) for fn in fns])
+
+def get_pearsons(model):
+    """Correlate spatial and embedding distances over NPAIR random pairs of places taken from `manche_gdf`.
+
+    Returns the `pearsonr` result (correlation, p-value) computed between the shapely `distance` of the
+    place centroids and the cosine distance of the corresponding `model.wv` vectors.
+    """
+    manche_gdf.loc[:,"geometry_centroid"]=manche_gdf.centroid
+    centroid_of = dict(manche_gdf.loc[:,"geonameid geometry_centroid".split()].values)
+    place_ids = list(centroid_of)
+    pairs = [(random.choice(place_ids),random.choice(place_ids)) for _ in range(NPAIR)]
+    spatial_d = [centroid_of[a].distance(centroid_of[b]) for a,b in pairs]
+    return pearsonr(spatial_d,[cosine(model.wv[str(a)],model.wv[str(b)]) for a,b in pairs]) # correlation and p-value
+
+import matplotlib.pyplot as plt  # explicit: `plt` was otherwise only bound if `from helpers import *` exported it
+df["pearson"] = df.filepath.apply(lambda x : get_pearsons(gensim.models.KeyedVectors.load(x))[0])
+df["noise"] = df.get("noise", 0)  # the column is absent when no filename carried a fifth integer
+df.fillna(0,inplace=True)
+# One scatter plot per hyper-parameter, coloured by noise
+for x_col in ("walk_length","number_of_walks","word2vec_window_size"):
+    df.plot.scatter(x=x_col, y="pearson",c="noise",cmap='inferno')
+    plt.show()
\ No newline at end of file
diff --git a/templates/skeleton.html b/templates/skeleton.html
new file mode 100644
index 0000000..43fe21d
--- /dev/null
+++ b/templates/skeleton.html
@@ -0,0 +1,88 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <meta http-equiv="X-UA-Compatible" content="ie=edge">
+    <title>Geocoder Interface</title>
+    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css"
+        integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" crossorigin="anonymous">
+
+    <!-- Load Leaflet -->
+    <link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css"
+        integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA=="
+        crossorigin="" />
+    <script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js"
+        integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA=="
+        crossorigin=""></script>
+</head>
+
+<body>
+    <style>
+        body {
+            
+        }
+
+        #mapid {
+            height: 400px;
+            width: 100%;
+        }
+    </style>
+
+    <main class="container-fluid">
+        <h1 style="text-align: center;color:white;text-shadow: 1px 1px 2px black;background-color: #999;">Geocoder Demo</h1>
+        <div id="mapid"></div>
+        <div class="container" style="background-color: white;padding: 5px;">
+            <h2>Input</h2>
+            <form action="/" method="get">
+                <div class="form-group">
+                    <label for="formGroupExampleInput">Toponym</label>
+                    <input type="text" class="form-control" name="top" id="formGroupExampleInput"
+                        placeholder="Paris">
+                </div>
+                <div class="form-group">
+                    <label for="formGroupExampleInput2">Context Toponym</label>
+                    <input type="text" class="form-control" name="c_top" id="formGroupExampleInput2"
+                        placeholder="Cherbourg">
+                </div>
+                <button type="submit" class="btn btn-primary">Get Coords !</button>
+            </form>
+        </div>
+    </main>
+
+    <!-- JS SCRIPTS -->
+    <script src="https://code.jquery.com/jquery-3.4.1.slim.min.js"
+        integrity="sha384-J6qa4849blE2+poT4WnyKhv5vZF5SrPo0iEjwBvKU7imGFAV0wwj1yYfoRSJoZ+n"
+        crossorigin="anonymous"></script>
+    <script src="https://cdn.jsdelivr.net/npm/popper.js@1.16.0/dist/umd/popper.min.js"
+        integrity="sha384-Q6E9RHvbIyZFJoft+2mJbHaEWldlvI9IOYy5n3zV9zzTtmI3UksdQRVvoxMfooAo"
+        crossorigin="anonymous"></script>
+    <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/js/bootstrap.min.js"
+        integrity="sha384-wfSDF2E50Y2D1uUdj0O3uMBJnjuUD4Ih7YwaYd1iqfktj0Uod8GCExl3Og8ifwB6"
+        crossorigin="anonymous"></script>
+
+    <script>
+
+        // Initialize the map
+        // [50, -0.1] are the latitude and longitude of the initial view
+        // 4 is the zoom level
+        // mapid is the id of the div where the map will appear
+        var mymap = L
+            .map('mapid')
+            .setView([50, -0.1], 4);
+
+        // Add a tile layer to the map (its background), served by tile.stamen.com
+        L.tileLayer(
+            'http://tile.stamen.com/toner/{z}/{x}/{y}.png', {
+            attribution: 'Map data &copy; <a href="https://www.openstreetmap.org/">OpenStreetMap</a>',
+            maxZoom: 6,
+        }).addTo(mymap);
+
+        // Marker at the predicted coordinates; lat and lon are filled in by the template engine
+        var marker = L.marker([{{lat}}, {{lon}}]).addTo(mymap);
+
+    </script>
+</body>
+
+</html>
\ No newline at end of file
-- 
GitLab