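# File: geocoder.py (4.77 KiB) -- authored by Fize Jacques, commit 0c92ec36
# NOTE: the web-page banner "Something went wrong on our end" that preceded
# this header was extraction residue and is not part of the module.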
# NATIVE LIB
import os
# DATA LIB
import numpy as np
import pandas as pd
# DL LIB
import tensorflow as tf
import keras.backend as K
from keras.models import load_model
from tensorflow.python.keras.backend import set_session
from tensorflow.python.keras.models import load_model
# CUSTOM LIB
from lib.word_index import WordIndex
from lib.ngram_index import NgramIndex
from lib.geo import haversine_tf_1circle
import stanza
import spacy
class Geocoder(object):
    """
    Predict a (lon, lat) pair for a toponym given a context toponym.

    The class wraps a Keras model loaded from disk together with the index
    (n-gram or word based) used to encode toponyms into model inputs.

    Example::

        geocoder = Geocoder("LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","index_4gram_FR_backup.txt")
        lon,lat = geocoder.get_coord("Paris","New-York")
        lon,lat = geocoder.wgs_coord(lon,lat)
        geocoder.plot_coord("Paris,New-York",lat,lon)

    If you want an interactive map using leafletJS, set to True the
    `interactive_map` parameter of `Geocoder.plot_coord()`.
    """

    def __init__(self, keras_model_fn, ngram_index_file, word_index=False):
        """
        Load the model and the toponym encoder.

        Parameters
        ----------
        keras_model_fn : str
            Path of the saved Keras model.
        ngram_index_file : str
            Path of the saved index; loaded with ``NgramIndex.load`` or,
            when ``word_index`` is true, with ``WordIndex.load``.
        word_index : bool
            Select the word-level index instead of the n-gram index.
        """
        # compile=False: the model is only used for prediction here.
        self.keras_model = load_model(
            keras_model_fn,
            custom_objects={"loss": haversine_tf_1circle},
            compile=False,
        )
        index_cls = WordIndex if word_index else NgramIndex
        self.ngram_encoder = index_cls.load(ngram_index_file)

    def _encode(self, toponyms):
        """Encode each toponym and complete it to ``max_len``; return one array."""
        encoder = self.ngram_encoder
        return np.array(
            [encoder.complete(encoder.encode(t), encoder.max_len) for t in toponyms]
        )

    def get_coord(self, toponym, context_toponym):
        """
        Predict the coordinates of a single toponym.

        Returns
        -------
        tuple
            ``(lon, lat)`` as produced by the model (not yet converted with
            `wgs_coord`).
        """
        # Delegate to the batched path so both entry points feed the model
        # the same kind of input (one array per model input, batch of one).
        lons, lats = self.get_coords([toponym], [context_toponym])
        return lons[0], lats[0]

    def get_coords(self, list_toponym, list_toponym_context):
        """
        Predict the coordinates of several toponyms at once.

        ``list_toponym[i]`` is paired with ``list_toponym_context[i]``.

        Returns
        -------
        tuple
            ``(lons, lats)``: first and second column of the model output.
        """
        p = self._encode(list_toponym)
        c = self._encode(list_toponym_context)
        coords = self.keras_model.predict([p, c])
        return coords[:, 0], coords[:, 1]  # lon lat

    def wgs_coord(self, lon, lat):
        """
        Rescale model outputs to degrees: ``lon*360-180`` and ``lat*180-90``.

        Presumably the model emits values normalised to [0, 1] -- confirm
        against the training code. Works on scalars and numpy arrays alike.
        """
        return ((lon * 360) - 180), ((lat * 180) - 90)

    def plot_coord(self, toponym, lat, lon, interactive_map=False, **kwargs):
        """
        Display the given position on a world map.

        Parameters
        ----------
        toponym : str
            Label used as marker popup on the interactive map.
        lat, lon : float
            Position to display, in degrees.
        interactive_map : bool
            If true, write a folium (leaflet) map to a temporary file and open
            it in the web browser; otherwise draw a static matplotlib map.
        **kwargs
            Forwarded to ``matplotlib.pyplot.subplots`` (static map only).
        """
        if interactive_map:
            import folium
            import tempfile
            import webbrowser

            m = folium.Map()
            folium.Marker([lat, lon], popup=toponym).add_to(m)
            # Only reserve a file name here and close the handle right away:
            # the handle used to be leaked. delete=False keeps the file on
            # disk for the browser; the .html suffix lets it be opened as a
            # web page.
            with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as fp:
                map_path = fp.name
            m.save(map_path)
            webbrowser.open('file://' + map_path)
        else:
            import matplotlib.pyplot as plt
            import geopandas

            fig, ax = plt.subplots(1, **kwargs)
            # NOTE(review): `geopandas.datasets` may be unavailable in recent
            # geopandas releases -- confirm against the pinned version.
            world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
            world.plot(color='white', edgecolor='black', ax=ax)
            ax.plot(lon, lat, marker='o', color='red', markersize=5)
            plt.show()
def heuritic_mean(geocoder, data):
    """
    Geocode every distinct toponym by averaging over all possible contexts.

    Each distinct value of ``data.text`` is paired with every *other* distinct
    value used as context toponym; the predicted coordinates (converted with
    ``geocoder.wgs_coord``) are then averaged per toponym.

    Parameters
    ----------
    geocoder : object
        Must provide ``get_coords(toponyms, contexts)`` and
        ``wgs_coord(lons, lats)``.
    data : pandas.DataFrame
        Table with a ``text`` column holding the toponyms.

    Returns
    -------
    dict
        ``{toponym: {"lat": mean_lat, "lon": mean_lon}}``; empty when ``data``
        holds no toponym.
    """
    # An empty frame (e.g. built from an empty entity list) has no `text`
    # column at all: there is nothing to geocode.
    texts = getattr(data, "text", None)
    if texts is None:
        return {}
    toponyms = texts.unique()
    if len(toponyms) == 0:
        return {}

    if len(toponyms) == 1:
        # No other toponym can serve as context; fall back to the toponym
        # itself so that the caller still gets a coordinate instead of the
        # IndexError an empty pair list used to trigger.
        pairs = [[toponyms[0], toponyms[0]]]
    else:
        pairs = [[t1, t2] for t2 in toponyms for t1 in toponyms if t2 != t1]

    input_ = np.asarray(pairs)
    res_geocode = pd.DataFrame(input_, columns=["t", "tc"])
    lons, lats = geocoder.wgs_coord(*geocoder.get_coords(input_[:, 0], input_[:, 1]))
    res_geocode["lon"] = lons
    res_geocode["lat"] = lats

    results = {}
    for tp in toponyms:
        # Select the rows of this toponym once and reuse them for both means.
        rows = res_geocode[res_geocode.t == tp]
        results[tp] = {"lat": rows.lat.mean(), "lon": rows.lon.mean()}
    return results
class TextGeocoder():
    """
    Find place names in raw text with a NER pipeline and geocode them.

    Parameters
    ----------
    geocoder_model : object
        Geocoder handed to ``heuristic_func``.
    ner_name : str
        ``"stanza"`` builds a ``stanza.Pipeline``; any other value loads a
        spaCy model with ``spacy.load``.
    lang : str
        Language code / model name passed to the selected NER library.
    heuristic_func : callable
        Called as ``heuristic_func(geocoder_model, dataframe)``; must return a
        mapping from entity text to its coordinates.
    n_jobs : optional
        Accepted for API compatibility; currently unused.
    """

    # Entity labels kept from the spaCy output.
    _SPACY_GEO_LABELS = ("LOC", "GPE")

    def __init__(self, geocoder_model, ner_name, lang, heuristic_func, n_jobs=None):
        self.geocoder_model = geocoder_model
        self.ner_name = ner_name
        if self.ner_name == "stanza":
            self.ner_model = stanza.Pipeline(lang)
        else:
            self.ner_model = spacy.load(lang)
        self.heuristic_func = heuristic_func

    def __call__(self, a):
        """Placeholder: not implemented, returns None."""
        pass

    def extract_geo_entities(self, text):
        """
        Run the NER model on ``text`` and keep the geographic entities.

        Returns
        -------
        list of dict
            One dict per entity with keys ``text``, ``type``, ``start`` and
            ``end`` (character offsets).
        """
        doc = self.ner_model(text)
        # Dispatch on the backend *name*: the previous test compared the
        # pipeline object itself with "stanza", which is never true, so stanza
        # results were read through the spaCy attribute names.
        if self.ner_name == "stanza":
            # NOTE(review): only "LOC" is kept for stanza while spaCy also
            # keeps "GPE" -- confirm whether this asymmetry is intended.
            return [
                {"text": en.text, "type": en.type, "start": en.start_char, "end": en.end_char}
                for en in doc.entities
                if en.type == "LOC"
            ]
        return [
            {"text": en.text, "type": en.label_, "start": en.start_char, "end": en.end_char}
            for en in doc.ents
            if en.label_ in self._SPACY_GEO_LABELS
        ]

    def geocode(self, entities):
        """
        Attach coordinates to each entity under the ``"coord"`` key.

        The entity dicts are updated in place and the same list is returned.
        An empty list is returned unchanged, without calling the heuristic.
        """
        if not entities:
            # Nothing to geocode; an empty DataFrame would have no `text`
            # column for the heuristic to read.
            return entities
        df = pd.DataFrame(entities)
        heuristic_results = self.heuristic_func(self.geocoder_model, df)
        for entity in entities:
            entity["coord"] = heuristic_results[entity["text"]]
        return entities