# Skip to content
# Snippets Groups Projects
# geocoder.py 4.77 KiB
# NATIVE LIB
import os

# DATA LIB
import numpy as np
import pandas as pd

# DL LIB
import tensorflow as tf
import keras.backend as K
from keras.models import load_model
from tensorflow.python.keras.backend import set_session
from tensorflow.python.keras.models import load_model

# CUSTOM LIB
from lib.word_index import WordIndex
from lib.ngram_index import NgramIndex
from lib.geo import haversine_tf_1circle

import stanza
import spacy


class Geocoder(object):
    """
    Predict a coordinate pair for a toponym given a context toponym.

    Wraps a Keras model loaded from disk and the index (n-gram or word based)
    used to encode toponyms before they are passed to the model.

    Example::

        geocoder = Geocoder("LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","index_4gram_FR_backup.txt")
        lon,lat = geocoder.get_coord("Paris","New-York")
        lon,lat = geocoder.wgs_coord(lon,lat)
        geocoder.plot_coord("Paris,New-York",lat,lon)

    if you want an interactive map using leafletJS, set to True the `interactive_map` parameter of `Geocoder.plot_coord()`
    """
    def __init__(self,keras_model_fn,ngram_index_file,word_index=False):
        """
        Load the model and the toponym index.

        Parameters
        ----------
        keras_model_fn : str
            Path of the saved Keras model.
        ngram_index_file : str
            Path of the saved index, read with ``NgramIndex.load`` or, when
            `word_index` is true, with ``WordIndex.load``.
        word_index : bool
            Select the word-based index instead of the n-gram one.
        """
        # compile=False: the model is only used for prediction here.
        self.keras_model = load_model(keras_model_fn,custom_objects={"loss":haversine_tf_1circle},compile=False)
        if not word_index:
            self.ngram_encoder = NgramIndex.load(ngram_index_file)
        else:
            self.ngram_encoder = WordIndex.load(ngram_index_file)

    def _encode(self,toponym):
        """Encode one toponym and complete it to the index's ``max_len``."""
        encoder = self.ngram_encoder
        return encoder.complete(encoder.encode(toponym),encoder.max_len)

    def get_coord(self,toponym,context_toponym):
        """
        Predict the coordinates of a single toponym.

        Delegates to `get_coords` with one-element lists so that single and
        batched predictions feed the model identically shaped inputs.

        Returns
        -------
        tuple
            First and second model outputs for the pair (lon, lat), as
            produced by the model; use `wgs_coord` to rescale them.
        """
        lons,lats = self.get_coords([toponym],[context_toponym])
        return lons[0],lats[0]

    def get_coords(self,list_toponym,list_toponym_context):
        """
        Predict coordinates for several (toponym, context toponym) pairs.

        Parameters
        ----------
        list_toponym : iterable of str
            Toponyms to geocode.
        list_toponym_context : iterable of str
            Context toponyms, aligned element-wise with `list_toponym`.

        Returns
        -------
        tuple
            Column 0 and column 1 of the model prediction (lon, lat).
        """
        p = np.array([self._encode(toponym) for toponym in list_toponym])
        c = np.array([self._encode(toponym) for toponym in list_toponym_context])

        coords = self.keras_model.predict([p,c])
        return coords[:,0],coords[:,1] #lon lat

    def wgs_coord(self,lon,lat):
        """
        Rescale model outputs to degrees: ``lon*360-180`` and ``lat*180-90``.

        NOTE(review): this presumes the model emits values normalised to
        [0, 1] -- confirm against the training code.
        """
        return ((lon*360)-180),((lat*180)-90)

    def plot_coord(self,toponym,lat,lon,interactive_map=False,**kwargs):
        """
        Display the given position on a world map.

        Parameters
        ----------
        toponym : str
            Label used as the marker popup on the interactive map.
        lat, lon : float
            Position to display, in degrees.
        interactive_map : bool
            When true, write a folium map to a temporary HTML file and open it
            in the web browser; otherwise draw a static matplotlib figure.
        **kwargs
            Forwarded to ``matplotlib.pyplot.subplots`` (static map only).
        """
        if interactive_map:
            import folium
            import tempfile
            import webbrowser
            # Only reserve a path: close the handle right away so it is not
            # leaked and so folium can write to the file on every platform.
            # The ".html" suffix lets the browser render the page as HTML.
            with tempfile.NamedTemporaryFile(suffix=".html",delete=False) as fp:
                map_path = fp.name
            m = folium.Map()
            folium.Marker([lat, lon], popup=toponym).add_to(m)
            m.save(map_path)
            webbrowser.open('file://' + map_path)
        else:
            import matplotlib.pyplot as plt
            import geopandas
            fig, ax = plt.subplots(1,**kwargs)
            # NOTE(review): `geopandas.datasets` is deprecated in recent
            # geopandas releases -- confirm the pinned version still ships it.
            world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
            world.plot(color='white', edgecolor='black',ax=ax)
            ax.plot(lon,lat,marker='o', color='red', markersize=5)
            plt.show()

def heuritic_mean(geocoder,data):
    """
    Geocode each distinct toponym by averaging over all its possible contexts.

    Every distinct value of ``data.text`` is paired with every other distinct
    value used as context toponym; all pairs are geocoded in one batch and the
    predictions obtained for a toponym are averaged.

    Parameters
    ----------
    geocoder : object
        Must provide ``get_coords(toponyms, contexts)`` and
        ``wgs_coord(lon, lat)`` (e.g. a `Geocoder` instance).
    data : pandas.DataFrame
        Frame with a ``text`` column holding the toponyms.

    Returns
    -------
    dict
        ``{toponym: {"lat": mean_lat, "lon": mean_lon}}``, keyed in order of
        first appearance in ``data.text``. Empty when `data` is empty.

    Notes
    -----
    When there is a single distinct toponym no other toponym can serve as
    context, so it is paired with itself instead of failing.
    """
    # An empty frame (e.g. built from an empty entity list) has no `text`
    # column at all, so bail out before touching it.
    if data.empty:
        return {}

    toponyms = data.text.unique()
    pairs = [(t1,t2) for t2 in toponyms for t1 in toponyms if t2 != t1]
    if not pairs:
        # Single distinct toponym: fall back to using it as its own context.
        pairs = [(tp,tp) for tp in toponyms]

    res_geocode = pd.DataFrame(pairs,columns="t tc".split())
    lons,lats = geocoder.wgs_coord(*geocoder.get_coords(res_geocode["t"].tolist(),res_geocode["tc"].tolist()))
    res_geocode["lon"] = lons
    res_geocode["lat"] = lats

    # One grouped mean instead of filtering the frame twice per toponym.
    means = res_geocode.groupby("t")[["lat","lon"]].mean()
    return {tp:{"lat":means.at[tp,"lat"],"lon":means.at[tp,"lon"]} for tp in toponyms}
    
class TextGeocoder():
    """
    Extract location entities from raw text and attach coordinates to them.

    Combines a named-entity recogniser (stanza or spaCy), a geocoding model
    and a heuristic function that turns the extracted toponyms into
    coordinates.
    """
    def __init__(self,geocoder_model,ner_name,lang,heuristic_func,n_jobs=None):
        """
        Parameters
        ----------
        geocoder_model : object
            Model handed to `heuristic_func` as its first argument.
        ner_name : str
            ``"stanza"`` builds a ``stanza.Pipeline(lang)``; any other value
            loads a spaCy model with ``spacy.load(lang)``.
        lang : str
            Language code / model name passed to the NER library.
        heuristic_func : callable
            Called as ``heuristic_func(geocoder_model, dataframe)``; must
            return a mapping from entity text to its coordinates.
        n_jobs : int, optional
            Stored for callers; not used by this class.
        """
        self.geocoder_model = geocoder_model
        self.ner_name = ner_name
        # Set regardless of the NER backend: `geocode` always needs it.
        self.heuristic_func = heuristic_func
        self.n_jobs = n_jobs
        if self.ner_name == "stanza":
            self.ner_model = stanza.Pipeline(lang)
        else:
            self.ner_model = spacy.load(lang)

    def __call__(self,a):
        # Placeholder: intentionally does nothing and returns None.
        pass

    def extract_geo_entities(self,text):
        """
        Run the NER model on `text` and keep only location entities.

        Returns
        -------
        list of dict
            One ``{"text", "type", "start", "end"}`` dict per entity. Kept
            types are ``LOC`` for stanza, ``LOC`` and ``GPE`` for spaCy.
        """
        doc = self.ner_model(text)
        # Dispatch on the backend *name*; the model object itself never
        # equals the string "stanza".
        if self.ner_name == "stanza":
            return [
                {"text":en.text,"type":en.type,"start":en.start_char,"end":en.end_char}
                for en in doc.entities if en.type == "LOC"
            ]
        return [
            {"text":en.text,"type":en.label_,"start":en.start_char,"end":en.end_char}
            for en in doc.ents if en.label_ in ("LOC","GPE")
        ]

    def geocode(self,entities):
        """
        Add a ``"coord"`` entry to every entity dict, in place.

        Parameters
        ----------
        entities : list of dict
            Entities as returned by `extract_geo_entities`.

        Returns
        -------
        list of dict
            The same list, each dict extended with the value the heuristic
            returned for its ``"text"``.
        """
        # Nothing to geocode; an empty frame would have no `text` column.
        if not entities:
            return entities
        df = pd.DataFrame(entities)
        heuristic_results = self.heuristic_func(self.geocoder_model,df)
        for entity in entities:
            entity["coord"] = heuristic_results[entity["text"]]
        return entities