"""Toponym geocoding helpers: a Keras-backed pair geocoder and a text geocoder.

Third-party dependencies that are only needed by specific code paths
(TensorFlow, the project index classes, stanza, spaCy, folium, matplotlib,
geopandas) are imported inside the functions that use them, so importing this
module does not require all of them to be installed.
"""
# NATIVE LIB
import os

# DATA LIB
import numpy as np
import pandas as pd


class Geocoder(object):
    """Predict coordinates for a toponym given a context toponym.

    Examples
    --------
    >>> geocoder = Geocoder("LSTM_FR.txt_20_4_0.002_None_A_I_C.h5", "index_4gram_FR_backup.txt")  # doctest: +SKIP
    >>> lon, lat = geocoder.get_coord("Paris", "New-York")  # doctest: +SKIP
    >>> lon, lat = geocoder.wgs_coord(lon, lat)  # doctest: +SKIP
    >>> geocoder.plot_coord("Paris,New-York", lat, lon)  # doctest: +SKIP

    If you want an interactive map using leafletJS, set the `interactive_map`
    parameter of `Geocoder.plot_coord()` to True.
    """

    def __init__(self, keras_model_fn, ngram_index_file, word_index=False):
        """Load the Keras model and the index used to encode toponyms.

        Parameters
        ----------
        keras_model_fn : str
            path of the saved Keras model
        ngram_index_file : str
            path of the saved index (n-gram index, or word index when
            `word_index` is true)
        word_index : bool, optional
            load the index with `WordIndex` instead of `NgramIndex`, by default False
        """
        from tensorflow.python.keras.models import load_model
        from lib.geo import haversine_tf_1circle
        from lib.ngram_index import NgramIndex
        from lib.word_index import WordIndex

        # compile=False: the model is only used for prediction here.
        self.keras_model = load_model(
            keras_model_fn,
            custom_objects={"loss": haversine_tf_1circle},
            compile=False,
        )
        index_cls = WordIndex if word_index else NgramIndex
        self.ngram_encoder = index_cls.load(ngram_index_file)

    def _encode_batch(self, toponyms):
        """Encode each toponym and pass it through `complete(..., max_len)`; return one 2-D array."""
        encoder = self.ngram_encoder
        return np.array(
            [encoder.complete(encoder.encode(toponym), encoder.max_len) for toponym in toponyms]
        )

    def get_coord(self, toponym, context_toponym):
        """Predict the coordinates of a single toponym/context pair.

        Parameters
        ----------
        toponym : str
            toponym to geocode
        context_toponym : str
            toponym used as context

        Returns
        -------
        tuple
            (lon, lat) as returned by the model; use `wgs_coord` to convert
            them to degrees
        """
        lons, lats = self.get_coords([toponym], [context_toponym])
        return lons[0], lats[0]

    def get_coords(self, list_toponym, list_toponym_context):
        """Predict the coordinates of several toponym/context pairs at once.

        Parameters
        ----------
        list_toponym : iterable of str
            toponyms to geocode
        list_toponym_context : iterable of str
            context toponyms, aligned with `list_toponym`

        Returns
        -------
        tuple of np.array
            (lons, lats), one value per input pair
        """
        p = self._encode_batch(list_toponym)
        c = self._encode_batch(list_toponym_context)
        coords = self.keras_model.predict([p, c])
        return coords[:, 0], coords[:, 1]  # lon lat

    def wgs_coord(self, lon, lat):
        """Rescale model outputs to degrees: lon*360-180 and lat*180-90.

        Presumably the model emits values normalised to [0, 1]; confirm
        against the training code.
        """
        return ((lon * 360) - 180), ((lat * 180) - 90)

    def plot_coord(self, toponym, lat, lon, interactive_map=False, **kwargs):
        """Display a point on a world map.

        Parameters
        ----------
        toponym : str
            label shown in the marker popup (interactive map only)
        lat : float
            latitude in degrees
        lon : float
            longitude in degrees
        interactive_map : bool, optional
            open a folium (leafletJS) map in the web browser instead of a
            matplotlib figure, by default False
        **kwargs
            forwarded to `matplotlib.pyplot.subplots` (static map only)
        """
        if interactive_map:
            import pathlib
            import tempfile
            import webbrowser

            import folium

            m = folium.Map()
            folium.Marker([lat, lon], popup=toponym).add_to(m)
            # Only reserve a path here and close the handle right away: the
            # map is written by folium afterwards. The ".html" suffix lets the
            # browser recognise the file type.
            with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as fp:
                html_path = fp.name
            m.save(html_path)
            webbrowser.open(pathlib.Path(html_path).as_uri())
        else:
            import matplotlib.pyplot as plt
            import geopandas

            _, ax = plt.subplots(1, **kwargs)
            # NOTE(review): `geopandas.datasets` was removed in geopandas 1.0;
            # confirm the pinned version or load a local Natural Earth file.
            world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
            world.plot(color='white', edgecolor='black', ax=ax)
            ax.plot(lon, lat, marker='o', color='red', markersize=5)
            plt.show()


def heuritic_mean(geocoder, data):
    """Geocode every toponym of `data` as the mean over all possible contexts.

    Each distinct toponym is geocoded once with every other distinct toponym
    as context, and the predicted coordinates are averaged per toponym.

    Parameters
    ----------
    geocoder : object
        object exposing `get_coords(toponyms, context_toponyms)` and
        returning `(lons, lats)`, e.g. a `Geocoder`
    data : pd.DataFrame
        entities, with a `text` column holding the toponym strings

    Returns
    -------
    dict
        `{toponym: {"lat": ..., "lon": ...}}`; both values are NaN when no
        other toponym is available to serve as context
    """
    toponyms = data.text.unique()
    # One row per ordered pair: column 0 is the toponym, column 1 its context.
    pairs = np.asarray([[t1, t2] for t2 in toponyms for t1 in toponyms if t2 != t1])
    if pairs.size == 0:
        # Fewer than two distinct toponyms: nothing can be used as context.
        nan = float("nan")
        return {tp: {"lat": nan, "lon": nan} for tp in toponyms}

    res_geocode = pd.DataFrame(pairs, columns=["t", "tc"])
    lons, lats = geocoder.get_coords(pairs[:, 0], pairs[:, 1])
    res_geocode["lon"] = lons
    res_geocode["lat"] = lats

    results = {}
    for tp in toponyms:
        # The predicted coordinates belong to the toponym column ("t"), so
        # average the rows where `tp` is the toponym, not the context.
        rows = res_geocode[res_geocode["t"] == tp]
        results[tp] = {"lat": rows["lat"].mean(), "lon": rows["lon"].mean()}
    return results


# Correctly spelled alias; the original name is kept for existing callers.
heuristic_mean = heuritic_mean


class TextGeocoder():
    """Extract location entities from a text and geocode them."""

    def __init__(self, geocoder_model, ner_name, lang, heuristic_func, n_jobs=None):
        """Load the NER pipeline.

        Parameters
        ----------
        geocoder_model : Geocoder
            model handed to `heuristic_func`
        ner_name : str
            "stanza" to use `stanza.Pipeline(lang)`; any other value uses
            `spacy.load(lang)`
        lang : str
            argument passed to `stanza.Pipeline` or `spacy.load`
        heuristic_func : callable
            function called as `heuristic_func(geocoder_model, dataframe)`,
            e.g. `heuritic_mean`
        n_jobs : int, optional
            currently unused
        """
        self.geocoder_model = geocoder_model
        self.ner_name = ner_name
        self.ner_model = None
        if self.ner_name == "stanza":
            import stanza
            self.ner_model = stanza.Pipeline(lang)
        else:
            import spacy
            self.ner_model = spacy.load(lang)
        self.heuristic_func = heuristic_func

    def __call__(self, a):
        """Placeholder, not implemented yet (returns None)."""
        pass

    def extract_geo_entities(self, text):
        """Run the NER pipeline on `text` and keep the location entities.

        With stanza, entities of type "LOC" are kept; with spaCy, entities
        labelled "LOC" or "GPE".

        Returns
        -------
        list of dict
            one dict per entity with keys "text", "type", "start", "end"
        """
        doc = self.ner_model(text)
        # Dispatch on the backend *name*; `ner_model` holds the pipeline object.
        if self.ner_name == "stanza":
            return [
                {"text": en.text, "type": en.type, "start": en.start_char, "end": en.end_char}
                for en in doc.entities
                if en.type == "LOC"
            ]
        return [
            {"text": en.text, "type": en.label_, "start": en.start_char, "end": en.end_char}
            for en in doc.ents
            if en.label_ in ("LOC", "GPE")
        ]

    def geocode(self, entities):
        """Geocode entities produced by `extract_geo_entities`.

        Parameters
        ----------
        entities : list of dict
            entities with keys "text", "type", "start", "end"

        Returns
        -------
        object
            whatever `heuristic_func` returns for these entities
        """
        # Naming the columns keeps the `text` column present even when no
        # entity was found.
        df = pd.DataFrame(entities, columns=["text", "type", "start", "end"])
        return self.heuristic_func(self.geocoder_model, df)
tensorflow.python.keras.backend import set_session from tensorflow.python.keras.models import load_model +from lib.geo import haversine_tf_1circle sess = None graph = None @@ -64,8 +65,8 @@ class Geocoder(object): # sess = tf.compat.v1.Session() # graph = tf.compat.v1.get_default_graph() # set_session(sess) - self.keras_model = load_model(keras_model_fn,custom_objects={"accuracy_at_k_lat":lat_accuracy(),"accuracy_at_k_lon":lon_accuracy()}) - self.ngram_encoder = WordIndex.load(ngram_index_file) + self.keras_model = load_model(keras_model_fn,custom_objects={"loss":haversine_tf_1circle},compile=False)#custom_objects={"accuracy_at_k_lat":lat_accuracy(),"accuracy_at_k_lon":lon_accuracy()}) + self.ngram_encoder = NgramIndex.load(ngram_index_file) def get_coord(self,toponym,context_toponym): global sess @@ -76,8 +77,8 @@ class Geocoder(object): c = np.array(c) # with sess.as_default(): # with graph.as_default(): - lon,lat = self.keras_model.predict([[p],[c]]) - return lon[0][0],lat[0][0] + coord = self.keras_model.predict([[p],[c]]) + return coord[0][0],coord[0][1] def get_coords(self,list_toponym,list_toponym_context): p = [self.ngram_encoder.complete(self.ngram_encoder.encode(toponym),self.ngram_encoder.max_len) for toponym in list_toponym] @@ -86,8 +87,8 @@ class Geocoder(object): p = np.array(p) c = np.array(c) - lon,lat = self.keras_model.predict([p,c]) - return lon,lat + coords = self.keras_model.predict([p,c]) + return coords[0],coords[1] def wgs_coord(self,lon,lat): return ((lon*360)-180),((lat*180)-90) diff --git a/run_train.py b/run_train.py index 3030694f6f0b5df0589c5509ed7bab17d68cd0e8..057b91df9ab8162bf47f19e377e2678a9d3e795b 100644 --- a/run_train.py +++ b/run_train.py @@ -1,32 +1,22 @@ from lib.run import GridSearchModel from collections import OrderedDict +c_f = "--wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt" -# Build all combination of relations -rels = ["-i","-a","-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_US_FR.txt"] 
"""Flask demo server for the toponym-pair geocoder."""
from flask import Flask, render_template, request

from lib.geocoder import Geocoder

# Query used when a parameter is missing or left blank; these are the values
# shown as placeholders by the form in templates/pair_topo.html.
DEFAULT_TOPONYM = "Paris"
DEFAULT_CONTEXT_TOPONYM = "Cherbourg"

# Loaded once at import time and shared by every request.
geocoder = Geocoder("./outputs/GB_MODEL_2/GB.txt_100_4_100__A_I_C.h5","./outputs/GB_MODEL_2/GB.txt_100_4_100__A_I_C_index")
app = Flask(__name__)


@app.route('/')
def home():
    """Geocode the `top` / `c_top` query parameters and render the pair page.

    A blank form field is submitted as an empty string, which `args.get`
    does not replace with its default; such values fall back to the defaults
    here so the template always receives `lat` and `lon`.
    """
    toponym = request.args.get("top", "").strip() or DEFAULT_TOPONYM
    c_toponym = request.args.get("c_top", "").strip() or DEFAULT_CONTEXT_TOPONYM
    lon, lat = geocoder.get_coord(toponym, c_toponym)
    lon, lat = geocoder.wgs_coord(lon, lat)
    # Plain floats so the values are rendered as bare JavaScript numbers.
    return render_template(
        "pair_topo.html",
        lat=float(lat),
        lon=float(lon),
        title="Toponyms Pair Geocoder",
    )


@app.route('/text')
def text():
    """Render the text geocoder page (no coordinates are passed to it)."""
    return render_template("text.html", title="Text Geocoder")
Comes from OpenStreetmap + L.tileLayer( + 'https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', { + attribution: 'Map data © <a href="https://www.openstreetmap.org/">OpenStreetMap</a>', + }).addTo(mymap); + + var marker = L.marker([{{lat}}, {{lon}}]).addTo(mymap); + + +</script> +{% endblock %} \ No newline at end of file diff --git a/templates/skeleton.html b/templates/skeleton.html index 43fe21d207d3ebd53e955efdfc0ab9dedfa36081..fa7a527395875c2dc778877cc29abcbf7c3f0a1e 100644 --- a/templates/skeleton.html +++ b/templates/skeleton.html @@ -9,6 +9,8 @@ <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css" integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" crossorigin="anonymous"> + <link href="https://fonts.googleapis.com/css2?family=Kumbh+Sans:wght@300;400;700&display=swap" rel="stylesheet"> + <!-- Load Leaflet --> <link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" @@ -21,34 +23,34 @@ <body> <style> body { - + font-family: 'Kumbh Sans', sans-serif; } #mapid { height: 400px; width: 100%; } + .container-fluid{ + padding: 0 !important; + } </style> <main class="container-fluid"> - <h1 style="text-align: center;color:white;text-shadow: 1px 1px 2px black;background-color: #999;">Geocoder Demo</h1> - <div id="mapid"></div> - <div class="container" style="background-color: white;padding: 5px;"> - <h2>Input</h2> - <form action="/" method="get"> - <div class="form-group"> - <label for="formGroupExampleInput">Toponym</label> - <input type="text" class="form-control" name="top" - placeholder="Paris"> - </div> - <div class="form-group"> - <label for="formGroupExampleInput2">Context Toponym</label> - <input type="text" class="form-control" name="c_top" - placeholder="Cherbourg"> - </div> - <button type="submit" class="btn btn-primary">Get Coords !</button> - 
{% extends 'skeleton.html' %}

{% block content %}
<!-- MAP RENDER -->
<div id="mapid"></div>

<!-- TOPONYM FORM -->
<p>Lorem ipsum dolor sit, amet consectetur adipisicing elit. Laborum, modi natus ab repellendus ex explicabo, sunt fuga commodi porro ipsam deserunt facilis culpa aspernatur odio tenetur quibusdam perferendis ipsum cupiditate?</p>
{% endblock %}

{% block script %}
<script>

    // Initialize the map
    // [50, -0.1] are the latitude and longitude
    // 4 is the zoom
    // mapid is the id of the div where the map will appear
    var mymap = L
        .map('mapid')
        .setView([50, -0.1], 4);

    // Add a tile to the map = a background. Comes from OpenStreetmap
    L.tileLayer(
        'https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', {
        attribution: 'Map data © <a href="https://www.openstreetmap.org/">OpenStreetMap</a>',
    }).addTo(mymap);

    {# Only draw a marker when the view supplied coordinates; otherwise the
       line would render as `L.marker([, ])` and break the whole script. #}
    {% if lat is defined and lon is defined %}
    var marker = L.marker([{{ lat }}, {{ lon }}]).addTo(mymap);
    {% endif %}

</script>
{% endblock %}