Commit b00c4eb4 authored by Fize Jacques

Update

parent b7df4531
from glob import glob
import json
import argparse
import logging

import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument("eval_dataset")
parser.add_argument("models_directory")
parser.add_argument("-g", "--gpu", action="store_true")
args = parser.parse_args()  # ("-g ../data/geocoding_evaluation/fr_cooc_test.csv outputs/FR_RESULT".split())

if not args.gpu:
    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # No need for GPU

from predict_toponym_coordinates import Geocoder
from lib.utils_geo import haversine_pd

logging.getLogger("tensorflow").setLevel(logging.CRITICAL)
logging.getLogger("tensorflow_hub").setLevel(logging.CRITICAL)

EVAL_DATASET_FN = args.eval_dataset  # "./test_dataset_ambiguity.csv"
def eval_model(eval_dataset_fn, model_fn, model_index_fn):
    print("Dataset -- {0} -- Model -- {1}".format(
        eval_dataset_fn.split("/")[-1],
        model_fn.split("/")[-1]))
    df = pd.read_csv(eval_dataset_fn)
    geocoder = Geocoder(model_fn, model_index_fn)
    lon, lat = geocoder.get_coords(df.name1.values, df.name2.values)
    lon, lat = geocoder.wgs_coord(lon, lat)

    df["p_longitude"] = lon
    df["p_latitude"] = lat

    # accuracy@k: share of predictions whose haversine distance to the gold
    # coordinates is below k kilometers
    df["dist"] = haversine_pd(df.longitude, df.latitude, df.p_longitude, df.p_latitude)
    print("100km", (df.dist < 100).sum() / len(df))
    print("50km", (df.dist < 50).sum() / len(df))
    print("20km", (df.dist < 20).sum() / len(df))
    return df
# str.rstrip(".h5") strips characters, not the suffix, so slice instead
prefixes = [fn[:-len(".h5")] for fn in glob(args.models_directory + "/*.h5")]

final_output = []
for prefix in prefixes:
    try:
        df = eval_model(EVAL_DATASET_FN, prefix + ".h5", prefix + "_index")
        with open(prefix + ".json") as f:
            data = json.load(f)
        data["accuracy@100km"] = (df.dist < 100).sum() / len(df)
        data["accuracy@50km"] = (df.dist < 50).sum() / len(df)
        data["accuracy@25km"] = (df.dist < 25).sum() / len(df)
        final_output.append(data)
    except Exception as e:
        print("Evaluation failed for {0}: {1}".format(prefix, e))

output_prefix = EVAL_DATASET_FN[:-len(".csv")] if EVAL_DATASET_FN.endswith(".csv") else EVAL_DATASET_FN
pd.DataFrame(final_output).to_csv("{0}_RESULT.csv".format(output_prefix))
Example invocations of desamb_eval.py (the -g/--gpu flag keeps GPU inference enabled; without it CUDA_VISIBLE_DEVICES is set to -1):

python3 desamb_eval.py -g ../data/geocoding_evaluation/fr_dataset_ambiguity_sample50percent.csv outputs/FR_RESULT
python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_cooc_test.csv outputs/USFR_WORD
python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_dataset_ambiguity.csv outputs/USFR_WORD
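These evaluation CSVs are expected to carry name1/name2 toponym pairs plus gold longitude/latitude columns; eval_model compares them against the predicted p_longitude/p_latitude (inferred from the column accesses in the script above).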
# predict_toponym_coordinates.py
import os

import numpy as np
import tensorflow as tf
import keras.backend as K
from tensorflow.python.keras.backend import set_session
from tensorflow.python.keras.models import load_model

from lib.ngram_index import NgramIndex
from lib.word_index import WordIndex
from lib.utils_geo import haversine_tf_1circle

sess = None
graph = None
def lat_accuracy(LAT_TOL=1/180.):
    def accuracy_at_k_lat(y_true, y_pred):
        """
        Metric used to measure the accuracy of latitude prediction. Unlike the
        standard accuracy metric, a tolerance threshold is applied, since it is
        (almost) impossible for a neural network to predict the exact coordinate.

        Parameters
        ----------
        y_true : tf.Tensor
            ground-truth data
        y_pred : tf.Tensor
            predicted output
        """
        diff = tf.abs(y_true - y_pred)
        fit = tf.dtypes.cast(tf.less(diff, LAT_TOL), tf.int64)
        return tf.reduce_sum(fit) / tf.size(y_pred, out_type=tf.dtypes.int64)
    return accuracy_at_k_lat


def lon_accuracy(LON_TOL=1/360.):
    def accuracy_at_k_lon(y_true, y_pred):
        """
        Metric used to measure the accuracy of longitude prediction. Unlike the
        standard accuracy metric, a tolerance threshold is applied, since it is
        (almost) impossible for a neural network to predict the exact coordinate.

        Parameters
        ----------
        y_true : tf.Tensor
            ground-truth data
        y_pred : tf.Tensor
            predicted output
        """
        diff = tf.abs(y_true - y_pred)
        fit = tf.dtypes.cast(tf.less(diff, LON_TOL), tf.int64)
        return tf.reduce_sum(fit) / tf.size(y_pred, out_type=tf.dtypes.int64)
    return accuracy_at_k_lon
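# A minimal sketch of how these tolerance metrics could be attached to a
# coordinate-regression model at compile time (`coord_model` is hypothetical;
# the commented-out custom_objects in Geocoder.__init__ below hints at this usage):
#
#   coord_model.compile(optimizer="adam", loss="mean_squared_error",
#                       metrics=[lat_accuracy(), lon_accuracy()])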
class Geocoder(object):
    """
    >>> geocoder = Geocoder("LSTM_FR.txt_20_4_0.002_None_A_I_C.h5", "index_4gram_FR_backup.txt")
    >>> lon, lat = geocoder.get_coord("Paris", "New-York")
    >>> lon, lat = geocoder.wgs_coord(lon, lat)
    >>> geocoder.plot_coord("Paris,New-York", lat, lon)

    If you want an interactive map using leafletJS, set the `interactive_map`
    parameter of `Geocoder.plot_coord()` to True.
    """
    def __init__(self, keras_model_fn, ngram_index_file):
        # global sess
        # global graph
        # sess = tf.compat.v1.Session()
        # graph = tf.compat.v1.get_default_graph()
        # set_session(sess)
        self.keras_model = load_model(keras_model_fn, custom_objects={"loss": haversine_tf_1circle}, compile=False)  # custom_objects={"accuracy_at_k_lat":lat_accuracy(),"accuracy_at_k_lon":lon_accuracy()})
        self.ngram_encoder = NgramIndex.load(ngram_index_file)
    def get_coord(self, toponym, context_toponym):
        global sess
        global graph
        p = self.ngram_encoder.complete(self.ngram_encoder.encode(toponym), self.ngram_encoder.max_len)
        c = self.ngram_encoder.complete(self.ngram_encoder.encode(context_toponym), self.ngram_encoder.max_len)
        p = np.array(p)
        c = np.array(c)
        # with sess.as_default():
        #     with graph.as_default():
        coord = self.keras_model.predict([[p], [c]])
        return coord[0][0], coord[0][1]

    def get_coords(self, list_toponym, list_toponym_context):
        p = [self.ngram_encoder.complete(self.ngram_encoder.encode(toponym), self.ngram_encoder.max_len) for toponym in list_toponym]
        c = [self.ngram_encoder.complete(self.ngram_encoder.encode(toponym), self.ngram_encoder.max_len) for toponym in list_toponym_context]
        p = np.array(p)
        c = np.array(c)
        coords = self.keras_model.predict([p, c])
        return coords[0], coords[1]
    def wgs_coord(self, lon, lat):
        # map normalized [0, 1] model outputs back to WGS84 degrees
        return ((lon * 360) - 180), ((lat * 180) - 90)
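    # A quick worked example for wgs_coord (assuming the model emits
    # coordinates normalized to [0, 1]):
    #   wgs_coord(0.5, 0.5)  -> (0.0, 0.0)     prime meridian / equator
    #   wgs_coord(1.0, 0.75) -> (180.0, 45.0)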
    def plot_coord(self, toponym, lat, lon, interactive_map=False, **kwargs):
        if interactive_map:
            import folium
            import tempfile
            import webbrowser
            fp = tempfile.NamedTemporaryFile(delete=False)
            m = folium.Map()
            folium.Marker([lat, lon], popup=toponym).add_to(m)
            m.save(fp.name)
            webbrowser.open('file://' + fp.name)
        else:
            import matplotlib.pyplot as plt
            import geopandas
            fig, ax = plt.subplots(1, **kwargs)
            world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
            world.plot(color='white', edgecolor='black', ax=ax)
            ax.plot(lon, lat, marker='o', color='red', markersize=5)
            plt.show()
if __name__ == "__main__":
    from flask import Flask, request, render_template
    app = Flask(__name__)

    geocoder = Geocoder("outputs/LSTM_FR.txt_100_4_0.002_None_A_I_C.h5", "./outputs/FR.txt_100_4_0.002_None_A_I_C_index")

    @app.route('/', methods=["GET"])
    def display():
        toponym = request.args.get("top", "Paris")
        c_toponym = request.args.get("c_top", "Cherbourg")
        lon, lat = geocoder.get_coord(toponym, c_toponym)
        lon, lat = geocoder.wgs_coord(lon, lat)
        return render_template("skeleton.html", lat=lat, lon=lon)

    app.run(host='0.0.0.0')
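# Example request (assumed defaults: Flask serves on port 5000):
#   curl 'http://localhost:5000/?top=Paris&c_top=Cherbourg'
# returns templates/skeleton.html rendered with the predicted lat/lon.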
# Base module
import os

# Structure
import pandas as pd

# DEEPL module
from keras.layers import Dense, Input, Embedding, concatenate, Bidirectional, LSTM, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Lambda
import keras.backend as K
import tensorflow as tf
from lib.custom_layer import *

# Custom module
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader, MetaDataSerializer, LabelEncoder
from lib.metrics import lat_accuracy, lon_accuracy
from lib.data_generator import DataGenerator, CoOccurrences, load_embedding, Inclusion, Adjacency
from lib.utils_geo import haversine_tf, accuracy_k, haversine_tf_1circle

# Logging
import logging
logging.getLogger('gensim').setLevel(logging.WARNING)
from helpers import EpochTimer

# LOGGING CONF
logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s ',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
    .parse_args()  # ("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
# .parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())

#################################################
############ MODEL TRAINING PARAMETERS #########
#################################################

NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.k_value
EPOCHS = args.epochs
ADJACENCY_SAMPLING = args.adjacency_sample
COOC_SAMPLING = args.cooc_sample
WORDVEC_ITER = 50
EMBEDDING_DIM = args.dimension
BATCH_SIZE = args.batch_size
#################################################
########## FILENAME VARIABLES ###################
#################################################

# check for output dir
if not os.path.exists("outputs/"):
    os.makedirs("outputs/")

GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.inclusion_fn
ADJACENCY_REL_FILENAME = args.adjacency_fn
COOC_FN = args.wikipedia_cooc_fn

PREFIX_OUTPUT_FN = "REGION_{0}_{1}_{2}_{3}".format(
    GEONAME_FN.split("/")[-1],
    EPOCHS,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE)

# encode which relations are used in the output filenames:
# A = adjacency, I = inclusion, C = Wikipedia co-occurrence
REL_CODE = ""
if args.adjacency:
    PREFIX_OUTPUT_FN += "_A"
    REL_CODE += "A"
if args.inclusion:
    PREFIX_OUTPUT_FN += "_I"
    REL_CODE += "I"
if args.wikipedia_cooc:
    PREFIX_OUTPUT_FN += "_C"
    REL_CODE += "C"

MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
meta_data = MetaDataSerializer(
    DATASET_NAME,
    REL_CODE,
    COOC_SAMPLING,
    ADJACENCY_SAMPLING,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE,
    EPOCHS,
    EMBEDDING_DIM,
    WORDVEC_ITER,
    INDEX_FN,
    MODEL_OUTPUT_FN,
    HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
### DATA SOURCES + GENERATORS
index = NgramIndex.load(args.ngram_index_fn)

train_src = []
test_src = []

class_encoder = LabelEncoder()

if args.wikipedia_cooc:
    train_src.append(CoOccurrences(COOC_FN + "_train.csv", class_encoder, sampling=4))
    test_src.append(CoOccurrences(COOC_FN + "_test.csv", class_encoder, sampling=4))

if args.adjacency:
    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv", GEONAME_FN, sampling=ADJACENCY_SAMPLING, gzip=False)
    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv", GEONAME_FN, sampling=ADJACENCY_SAMPLING, gzip=False)
    train_src.append(a_train)
    test_src.append(a_test)

if args.inclusion:
    i_train = Inclusion(GEONAME_FN, GEONAMES_HIERARCHY_FN + "_train.csv")
    i_test = Inclusion(GEONAME_FN, GEONAMES_HIERARCHY_FN + "_test.csv")
    train_src.append(i_train)
    test_src.append(i_test)

d_train = DataGenerator(train_src, index, class_encoder, batch_size=BATCH_SIZE, only_healpix=True)
d_test = DataGenerator(test_src, index, class_encoder, batch_size=BATCH_SIZE, only_healpix=True)

num_words = len(index.index_ngram)
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
embedding_weights = load_embedding(args.embedding_fn)
#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################

from keras import regularizers

input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))

# initialize with the pretrained n-gram embeddings loaded above, kept frozen
embedding_layer = Embedding(num_words, EMBEDDING_DIM, input_length=index.max_len, weights=[embedding_weights], trainable=False)  # , trainable=True)

x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)

# Each LSTM learns on a permutation of the input toponyms
biLSTM = Bidirectional(LSTM(32, activation="pentanh", recurrent_activation="pentanh"))
x1 = biLSTM(x1)
x2 = biLSTM(x2)
x = concatenate([x1, x2])  # ,x3])

# x = Dense(class_encoder.get_num_classes()*2, activation="relu")(x)
aux_layer = Dense(class_encoder.get_num_classes(), activation="softmax", name="aux_layer")(x)

model = Model(inputs=[input_1, input_2], outputs=aux_layer)  # input_3
model.compile(loss={"aux_layer": "categorical_crossentropy"}, optimizer='adam', metrics={"aux_layer": "accuracy"})
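# Architecture recap (as built above): the two n-gram-encoded toponym inputs
# share one frozen embedding layer and one bidirectional LSTM; their encodings
# are concatenated and fed to a softmax over the HEALPix cell classes managed
# by class_encoder, trained with categorical cross-entropy.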
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################

checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
                             save_best_only=True, mode='auto', period=1)

epoch_timer = EpochTimer("outputs/" + PREFIX_OUTPUT_FN + "_epoch_timer_output.csv")

history = model.fit_generator(generator=d_train,
                              validation_data=d_test,
                              verbose=True,
                              epochs=EPOCHS,
                              callbacks=[checkpoint, epoch_timer])

hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)

model.save(MODEL_OUTPUT_FN)

# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
    os.remove(MODEL_OUTPUT_FN + ".part")
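# Example invocation (hypothetical script name; the flags mirror the commented
# parse_args string above):
#   python3 train_geocoder_v2.py -i --inclusion-fn ../data/geonamesData/hierarchy.txt \
#       ../data/geonamesData/allCountries.txt \
#       ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json \
#       ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin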
import argparse

import pandas as pd
import numpy as np
import geopandas as gpd

import logging
logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s ',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
)

from sklearn.model_selection import train_test_split
from shapely.geometry import Point
from lib.utils_geo import latlon2healpix
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument("cooccurrence_file")
parser.add_argument("-s", action="store_true", help="sample 10% of each cell")
args = parser.parse_args()  # ("data/wikipedia/cooccurrence_FR.txt".split())
# LOAD DATA
COOC_FN = args.cooccurrence_file

logging.info("Loading co-occurrence data...")
cooc_data = pd.read_csv(COOC_FN, sep="\t").fillna("")
logging.info("Co-occurrence data loaded!")

# assign each entry to a HEALPix cell so the split is stratified spatially
cooc_data["cat"] = cooc_data.apply(lambda x: latlon2healpix(x.latitude, x.longitude, 64), axis=1)

# TRAIN AND TEST SPLIT
logging.info("Splitting between train and test")

# a cell can be empty or hold a single entry, which train_test_split cannot
# handle, so seed the split with the first cell that has more than one entry
i = 0
while 1:
    if len(cooc_data[cooc_data.cat == i]) > 1:
        X_train, X_test = train_test_split(cooc_data[cooc_data.cat == i])
        break
    i += 1
seed_cat = i
for i in np.unique(cooc_data.cat.values):
    if i == seed_cat:
        continue  # the seed cell was already split above
    try:
        if not args.s:
            x_train, x_test = train_test_split(cooc_data[cooc_data.cat == i])
        else:
            x_train, x_test = train_test_split(cooc_data[cooc_data.cat == i].sample(frac=0.1))
        X_train, X_test = pd.concat((X_train, x_train)), pd.concat((X_test, x_test))
    except Exception as e:
        print(e)  # cells with fewer than 2 entries cannot be split

del X_train["cat"]
del X_test["cat"]

# SAVING THE DATA
logging.info("Saving output!")
suffix = ""
if args.s:
    suffix = "10per"
X_train.to_csv(COOC_FN + suffix + "_train.csv")
X_test.to_csv(COOC_FN + suffix + "_test.csv")
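# Example invocation (path taken from the commented parse_args above; the
# script name is hypothetical):
#   python3 split_cooc.py data/wikipedia/cooccurrence_FR.txt -s
# With -s, a 10% sample per cell is used and the outputs are suffixed "10per".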
import argparse

import numpy as np
import pandas as pd
import geopandas as gpd

import logging
logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s ',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
)

from sklearn.model_selection import train_test_split
from lib.utils_geo import latlon2healpix
from helpers import read_geonames
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument("geoname_file")
parser.add_argument("--feature_classes", help="List of feature classes to keep", default="A P")
args = parser.parse_args()  # ("data/geonamesData/FR.txt".split())
# LOAD DATA
GEONAME_FN = args.geoname_file
FEATURE_CLASSES = args.feature_classes

logging.info("Loading Geonames data...")
geoname_data = read_geonames(GEONAME_FN).fillna("")
logging.info("Geonames data loaded!")

# keep only entries with class A or P (areas and populated places)
filtered = geoname_data[geoname_data.feature_class.isin(FEATURE_CLASSES.split())].copy()

# assign each entry to a HEALPix cell so the split is stratified spatially
filtered["cat"] = filtered.apply(lambda x: latlon2healpix(x.latitude, x.longitude, 64), axis=1)

# TRAIN AND TEST SPLIT
logging.info("Splitting between train and test")

# a cell can be empty or hold a single entry, which train_test_split cannot
# handle, so seed the split with the first cell that has more than one entry
cat_unique = filtered.cat.unique()
ci = 0
while 1:
    if len(filtered[filtered.cat == cat_unique[ci]]) > 1:
        X_train, X_test = train_test_split(filtered[filtered.cat == cat_unique[ci]])
        break
    ci += 1
for i in cat_unique[ci + 1:]:  # skip the seed cell already split above
    try:
        x_train, x_test = train_test_split(filtered[filtered.cat == i])
        X_train, X_test = pd.concat((X_train, x_train)), pd.concat((X_test, x_test))
    except:
        pass  # cells with fewer than 2 entries cannot be split

del X_train["cat"]
del X_test["cat"]

# SAVING THE DATA
logging.info("Saving output!")
X_train.to_csv(GEONAME_FN + "_train.csv")
X_test.to_csv(GEONAME_FN + "_test.csv")
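# Example invocation (path taken from the commented parse_args above; the
# script name is hypothetical):
#   python3 split_geonames.py data/geonamesData/FR.txt
# Outputs FR.txt_train.csv and FR.txt_test.csv next to the input file.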