diff --git a/desamb_eval.py b/desamb_eval.py
deleted file mode 100644
index 8fcd5febb04f7df4938dd5634949f15fc6ac3ce0..0000000000000000000000000000000000000000
--- a/desamb_eval.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from glob import glob
-import json
-
-import argparse
-import logging
-
-import pandas as pd
-
-
-parser = argparse.ArgumentParser()
-parser.add_argument("eval_dataset")
-parser.add_argument("models_directory")
-parser.add_argument("-g", "--gpu", action="store_true")
-args = parser.parse_args()
-
-if not args.gpu:
-    import os
-    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # no need for a GPU
-
-from predict_toponym_coordinates import Geocoder
-from lib.utils_geo import haversine_pd
-
-logging.getLogger("tensorflow").setLevel(logging.CRITICAL)
-logging.getLogger("tensorflow_hub").setLevel(logging.CRITICAL)
-
-EVAL_DATASET_FN = args.eval_dataset
-
-
-def eval_model(eval_dataset_fn, model_fn, model_index_fn):
-    print("Dataset -- {0} -- Model -- {1}".format(
-        eval_dataset_fn.split("/")[-1],
-        model_fn.split("/")[-1]))
-    df = pd.read_csv(eval_dataset_fn)
-    geocoder = Geocoder(model_fn, model_index_fn)
-    lon, lat = geocoder.get_coords(df.name1.values, df.name2.values)
-    lon, lat = geocoder.wgs_coord(lon, lat)
-
-    df["p_longitude"] = lon
-    df["p_latitude"] = lat
-
-    # great-circle error (in km) between ground truth and prediction
-    df["dist"] = haversine_pd(df.longitude, df.latitude, df.p_longitude, df.p_latitude)
-
-    print("100km", (df.dist < 100).sum()/len(df))
-    print("50km", (df.dist < 50).sum()/len(df))
-    print("20km", (df.dist < 20).sum()/len(df))
-    return df
-
-# str.rstrip strips a *character set*, not a suffix, so slice the extension off instead
-prefixes = [x[:-len(".h5")] for x in glob(args.models_directory + "/*.h5")]
-
-final_output = []
-for prefix in prefixes:
-    try:
-        df = eval_model(EVAL_DATASET_FN, prefix + ".h5", prefix + "_index")
-        data = json.load(open(prefix + ".json"))
-        data["accuracy@100km"] = (df.dist < 100).sum()/len(df)
-        data["accuracy@50km"] = (df.dist < 50).sum()/len(df)
-        data["accuracy@25km"] = (df.dist < 25).sum()/len(df)
-        final_output.append(data)
-    except Exception as e:
-        print("Evaluation failed for {0}: {1}".format(prefix, e))
-
-pd.DataFrame(final_output).to_csv("{0}_RESULT.csv".format(EVAL_DATASET_FN[:-len(".csv")]))
\ No newline at end of file
diff --git a/desamb_eval_runs.sh b/desamb_eval_runs.sh
deleted file mode 100644
index 20eb682c6a8e00856a6d20d05100452d913b13d0..0000000000000000000000000000000000000000
--- a/desamb_eval_runs.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-python3 desamb_eval.py -g ../data/geocoding_evaluation/fr_dataset_ambiguity_sample50percent.csv outputs/FR_RESULT
-python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_cooc_test.csv outputs/USFR_WORD
-python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_dataset_ambiguity.csv outputs/USFR_WORD
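Note: the accuracy@k figures above come from `haversine_pd`, which is not part of this diff. For reference, a vectorised great-circle distance over pandas Series usually looks like the sketch below; this is an assumed implementation, not necessarily the one in lib/utils_geo.py, and the 6371 km Earth radius is an approximation.

import numpy as np

def haversine_pd(lon1, lat1, lon2, lat2, earth_radius_km=6371):
    # vectorised haversine distance in km; inputs are pandas Series or numpy arrays
    lon1, lat1, lon2, lat2 = map(np.radians, (lon1, lat1, lon2, lat2))
    dlon, dlat = lon2 - lon1, lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(a))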
diff --git a/predict_toponym_coordinates.py b/predict_toponym_coordinates.py
deleted file mode 100644
index ec5d967b0a4f4c5f473147aa96bb66c7bc3737cf..0000000000000000000000000000000000000000
--- a/predict_toponym_coordinates.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import numpy as np
-import tensorflow as tf
-from tensorflow.python.keras.models import load_model
-
-from lib.ngram_index import NgramIndex
-from lib.utils_geo import haversine_tf_1circle
-
-
-def lat_accuracy(LAT_TOL=1/180.):
-    def accuracy_at_k_lat(y_true, y_pred):
-        """
-        Metric used to measure the accuracy of the coordinate prediction.
-        Unlike a standard accuracy metric, we add a tolerance threshold,
-        since it is (quasi) impossible for a neural network to predict the
-        exact coordinate.
-
-        Parameters
-        ----------
-        y_true : tf.Tensor
-            ground-truth data
-        y_pred : tf.Tensor
-            predicted output
-        """
-        diff = tf.abs(y_true - y_pred)
-        fit = tf.dtypes.cast(tf.less(diff, LAT_TOL), tf.int64)
-        return tf.reduce_sum(fit)/tf.size(y_pred, out_type=tf.dtypes.int64)
-    return accuracy_at_k_lat
-
-def lon_accuracy(LON_TOL=1/360.):
-    def accuracy_at_k_lon(y_true, y_pred):
-        """
-        Same tolerance-based accuracy as `lat_accuracy`, with the threshold
-        scaled to the longitude range.
-
-        Parameters
-        ----------
-        y_true : tf.Tensor
-            ground-truth data
-        y_pred : tf.Tensor
-            predicted output
-        """
-        diff = tf.abs(y_true - y_pred)
-        fit = tf.dtypes.cast(tf.less(diff, LON_TOL), tf.int64)
-        return tf.reduce_sum(fit)/tf.size(y_pred, out_type=tf.dtypes.int64)
-    return accuracy_at_k_lon
-
-class Geocoder(object):
-    """
-    >>> geocoder = Geocoder("LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","index_4gram_FR_backup.txt")
-    >>> lon, lat = geocoder.get_coord("Paris","New-York")
-    >>> lon, lat = geocoder.wgs_coord(lon, lat)
-    >>> geocoder.plot_coord("Paris,New-York", lat, lon)
-
-    If you want an interactive map using leafletJS, set the `interactive_map`
-    parameter of `Geocoder.plot_coord()` to True.
-    """
-    def __init__(self, keras_model_fn, ngram_index_file):
-        self.keras_model = load_model(keras_model_fn, custom_objects={"loss": haversine_tf_1circle}, compile=False)
-        self.ngram_encoder = NgramIndex.load(ngram_index_file)
-
-    def get_coord(self, toponym, context_toponym):
-        # encode both toponyms as padded n-gram index sequences
-        p = self.ngram_encoder.complete(self.ngram_encoder.encode(toponym), self.ngram_encoder.max_len)
-        c = self.ngram_encoder.complete(self.ngram_encoder.encode(context_toponym), self.ngram_encoder.max_len)
-        p = np.array(p)
-        c = np.array(c)
-        coord = self.keras_model.predict([[p], [c]])
-        return coord[0][0], coord[0][1]
-
-    def get_coords(self, list_toponym, list_toponym_context):
-        p = [self.ngram_encoder.complete(self.ngram_encoder.encode(toponym), self.ngram_encoder.max_len) for toponym in list_toponym]
-        c = [self.ngram_encoder.complete(self.ngram_encoder.encode(toponym), self.ngram_encoder.max_len) for toponym in list_toponym_context]
-
-        p = np.array(p)
-        c = np.array(c)
-
-        coords = self.keras_model.predict([p, c])
-        return coords[0], coords[1]
-
-    def wgs_coord(self, lon, lat):
-        # the model predicts normalised coordinates in [0,1]; map them back to WGS84
-        return ((lon*360)-180), ((lat*180)-90)
-
-    def plot_coord(self, toponym, lat, lon, interactive_map=False, **kwargs):
-        if interactive_map:
-            import folium
-            import tempfile
-            import webbrowser
-            fp = tempfile.NamedTemporaryFile(delete=False)
-            m = folium.Map()
-            folium.Marker([lat, lon], popup=toponym).add_to(m)
-            m.save(fp.name)
-            webbrowser.open('file://' + fp.name)
-        else:
-            import matplotlib.pyplot as plt
-            import geopandas
-            fig, ax = plt.subplots(1, **kwargs)
-            world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
-            world.plot(color='white', edgecolor='black', ax=ax)
-            ax.plot(lon, lat, marker='o', color='red', markersize=5)
-            plt.show()
-
-
-if __name__ == "__main__":
-    from flask import Flask, request, render_template
-
-    app = Flask(__name__)
-
-    geocoder = Geocoder("outputs/LSTM_FR.txt_100_4_0.002_None_A_I_C.h5", "./outputs/FR.txt_100_4_0.002_None_A_I_C_index")
-
-    @app.route('/', methods=["GET"])
-    def display():
-        toponym = request.args.get("top", "Paris")
-        c_toponym = request.args.get("c_top", "Cherbourg")
-        lon, lat = geocoder.get_coord(toponym, c_toponym)
-        lon, lat = geocoder.wgs_coord(lon, lat)
-        return render_template("skeleton.html", lat=lat, lon=lon)
-
-    app.run(host='0.0.0.0')
\ No newline at end of file
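Note: as the comment in `wgs_coord` says, the network works in normalised coordinate space. A quick round-trip makes the mapping explicit; `to_normalised` is a hypothetical inverse added here for illustration only, it does not exist in the repository.

import math

def to_normalised(lon, lat):
    # hypothetical inverse of Geocoder.wgs_coord: WGS84 -> [0,1] x [0,1]
    return (lon + 180) / 360, (lat + 90) / 180

def wgs_coord(lon, lat):
    # same mapping as Geocoder.wgs_coord: [0,1] x [0,1] -> WGS84
    return (lon * 360) - 180, (lat * 180) - 90

lon, lat = wgs_coord(*to_normalised(2.35, 48.85))  # Paris
assert math.isclose(lon, 2.35) and math.isclose(lat, 48.85)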
diff --git a/region_model.py b/region_model.py
deleted file mode 100644
index b7a66738ccc07b3403da4da5772b55afa7c816c5..0000000000000000000000000000000000000000
--- a/region_model.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Base module
-import os
-
-# Structure
-import pandas as pd
-
-# Deep learning modules
-from keras.layers import Dense, Input, Embedding, concatenate, Bidirectional, LSTM
-from keras.models import Model
-from keras.callbacks import ModelCheckpoint
-from lib.custom_layer import *
-
-# Custom modules
-from lib.ngram_index import NgramIndex
-from lib.utils import ConfigurationReader, MetaDataSerializer, LabelEncoder
-from lib.data_generator import DataGenerator, CoOccurrences, load_embedding, Inclusion, Adjacency
-from lib.utils_geo import haversine_tf, accuracy_k, haversine_tf_1circle
-
-# Logging
-import logging
-
-logging.getLogger('gensim').setLevel(logging.WARNING)
-
-from helpers import EpochTimer
-
-# LOGGING CONF
-logging.basicConfig(
-    format='[%(asctime)s][%(levelname)s] %(message)s ',
-    datefmt='%m/%d/%Y %I:%M:%S %p',
-    level=logging.INFO
-    )
-
-args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
-    .parse_args()
-
-#################################################
-############# MODEL TRAINING PARAMETER ##########
-#################################################
-NGRAM_SIZE = args.ngram_size
-ACCURACY_TOLERANCE = args.k_value
-EPOCHS = args.epochs
-ADJACENCY_SAMPLING = args.adjacency_sample
-COOC_SAMPLING = args.cooc_sample
-WORDVEC_ITER = 50
-EMBEDDING_DIM = args.dimension
-BATCH_SIZE = args.batch_size
-#################################################
-########## FILENAME VARIABLE ####################
-#################################################
-# check for output dir
-if not os.path.exists("outputs/"):
-    os.makedirs("outputs/")
-
-GEONAME_FN = args.geoname_input
-DATASET_NAME = args.geoname_input.split("/")[-1]
-GEONAMES_HIERARCHY_FN = args.inclusion_fn
-ADJACENCY_REL_FILENAME = args.adjacency_fn
-COOC_FN = args.wikipedia_cooc_fn
-
-PREFIX_OUTPUT_FN = "REGION_{0}_{1}_{2}_{3}".format(
-    GEONAME_FN.split("/")[-1],
-    EPOCHS,
-    NGRAM_SIZE,
-    ACCURACY_TOLERANCE)
-
-REL_CODE = ""
-if args.adjacency:
-    PREFIX_OUTPUT_FN += "_A"
-    REL_CODE += "A"
-if args.inclusion:
-    PREFIX_OUTPUT_FN += "_I"
-    REL_CODE += "I"
-if args.wikipedia_cooc:
-    PREFIX_OUTPUT_FN += "_C"
-    REL_CODE += "C"
-
-MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
-INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
-HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
-
-meta_data = MetaDataSerializer(
-    DATASET_NAME,
-    REL_CODE,
-    COOC_SAMPLING,
-    ADJACENCY_SAMPLING,
-    NGRAM_SIZE,
-    ACCURACY_TOLERANCE,
-    EPOCHS,
-    EMBEDDING_DIM,
-    WORDVEC_ITER,
-    INDEX_FN,
-    MODEL_OUTPUT_FN,
-    HISTORY_FN
-)
-meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
-
-### DATA SOURCES + GENERATORS
-
-index = NgramIndex.load(args.ngram_index_fn)
-
-train_src = []
-test_src = []
-
-class_encoder = LabelEncoder()
-if args.wikipedia_cooc:
-    train_src.append(CoOccurrences(COOC_FN + "_train.csv", class_encoder, sampling=4))
-    test_src.append(CoOccurrences(COOC_FN + "_test.csv", class_encoder, sampling=4))
-
-if args.adjacency:
-    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv", GEONAME_FN, sampling=ADJACENCY_SAMPLING, gzip=False)
-    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv", GEONAME_FN, sampling=ADJACENCY_SAMPLING, gzip=False)
-    train_src.append(a_train)
-    test_src.append(a_test)
-
-if args.inclusion:
-    i_train = Inclusion(GEONAME_FN, GEONAMES_HIERARCHY_FN + "_train.csv")
-    i_test = Inclusion(GEONAME_FN, GEONAMES_HIERARCHY_FN + "_test.csv")
-    train_src.append(i_train)
-    test_src.append(i_test)
-
-d_train = DataGenerator(train_src, index, class_encoder, batch_size=BATCH_SIZE, only_healpix=True)
-d_test = DataGenerator(test_src, index, class_encoder, batch_size=BATCH_SIZE, only_healpix=True)
-
-num_words = len(index.index_ngram)
-
-#############################################################################################
-################################# NGRAM EMBEDDINGS ##########################################
-#############################################################################################
-
-embedding_weights = load_embedding(args.embedding_fn)
-
-#############################################################################################
-################################# MODEL DEFINITION ##########################################
-#############################################################################################
-
-input_1 = Input(shape=(index.max_len,))
-input_2 = Input(shape=(index.max_len,))
-
-# initialise with the pretrained n-gram embeddings loaded above and keep them frozen
-embedding_layer = Embedding(num_words, EMBEDDING_DIM, input_length=index.max_len,
-    weights=[embedding_weights], trainable=False)
-
-x1 = embedding_layer(input_1)
-x2 = embedding_layer(input_2)
-
-# The same bidirectional LSTM reads both input toponyms
-biLSTM = Bidirectional(LSTM(32, activation="pentanh", recurrent_activation="pentanh"))
-x1 = biLSTM(x1)
-x2 = biLSTM(x2)
-x = concatenate([x1, x2])
-
-# softmax over the HEALPix cell ids used as region classes
-aux_layer = Dense(class_encoder.get_num_classes(), activation="softmax", name="aux_layer")(x)
-
-model = Model(inputs=[input_1, input_2], outputs=aux_layer)
-
-model.compile(loss={"aux_layer": "categorical_crossentropy"}, optimizer='adam', metrics={"aux_layer": "accuracy"})
-
-#############################################################################################
-################################# TRAINING LAUNCH ###########################################
-#############################################################################################
-
-checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
-    save_best_only=True, mode='auto', period=1)
-
-epoch_timer = EpochTimer("outputs/" + PREFIX_OUTPUT_FN + "_epoch_timer_output.csv")
-
-history = model.fit_generator(generator=d_train,
-    validation_data=d_test,
-    verbose=True,
-    epochs=EPOCHS,
-    callbacks=[checkpoint, epoch_timer])
-
-hist_df = pd.DataFrame(history.history)
-hist_df.to_csv(HISTORY_FN)
-
-model.save(MODEL_OUTPUT_FN)
-
-# Erase model checkpoint file
-if os.path.exists(MODEL_OUTPUT_FN + ".part"):
-    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
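Note: the "pentanh" activation passed to the LSTMs is a custom penalised tanh registered by lib.custom_layer, which this diff does not show. Assuming it follows the usual penalised tanh definition (tanh(x) for positive inputs, 0.25*tanh(x) otherwise), a minimal Keras sketch would be:

import keras.backend as K
from keras.utils.generic_utils import get_custom_objects

def pentanh(x):
    # penalised tanh: tanh(x) if x > 0, else 0.25 * tanh(x)
    pos = K.cast(K.greater(x, 0), K.floatx())
    return K.tanh(x) * (pos + 0.25 * (1 - pos))

# make activation="pentanh" resolvable by name in layer definitions
get_custom_objects().update({"pentanh": pentanh})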
"outputs/{0}.h5".format(PREFIX_OUTPUT_FN) -INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN) -HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) - - -meta_data = MetaDataSerializer( - DATASET_NAME, - REL_CODE, - COOC_SAMPLING, - ADJACENCY_SAMPLING, - NGRAM_SIZE, - ACCURACY_TOLERANCE, - EPOCHS, - EMBEDDING_DIM, - WORDVEC_ITER, - INDEX_FN, - MODEL_OUTPUT_FN, - HISTORY_FN -) -meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN)) - - -###Â PUT DATASRC + GENERATOR - -index = NgramIndex.load(args.ngram_index_fn) - -train_src = [] -test_src = [] - -class_encoder = LabelEncoder() -if args.wikipedia_cooc: - train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4)) - test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4)) - -if args.adjacency: - a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False) - a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False) - train_src.append(a_train) - test_src.append(a_test) - -if args.inclusion: - i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv") - i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv") - train_src.append(i_train) - test_src.append(i_test) -#Adjacency - - - -d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE,only_healpix=True) -d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE,only_healpix=True) - -num_words = len(index.index_ngram) - -############################################################################################# -################################# NGRAM EMBEDDINGS ########################################## -############################################################################################# - -embedding_weights = load_embedding(args.embedding_fn) - - -############################################################################################# -################################# MODEL DEFINITION ########################################## -############################################################################################# - -from keras import regularizers - -input_1 = Input(shape=(index.max_len,)) -input_2 = Input(shape=(index.max_len,)) - -embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True) - -x1 = embedding_layer(input_1) -x2 = embedding_layer(input_2) - -#Â Each LSTM learn on a permutation of the input toponyms -biLSTM = Bidirectional(LSTM(32,activation="pentanh", recurrent_activation="pentanh")) -x1 = biLSTM(x1) -x2 = biLSTM(x2) -x = concatenate([x1,x2])#,x3]) - -#x = Dense(class_encoder.get_num_classes()*2,activation="relu")(x) - - -aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(x) - -model = Model(inputs = [input_1,input_2], outputs = aux_layer)#input_3 - -model.compile(loss={"aux_layer":"categorical_crossentropy"}, optimizer='adam',metrics={"aux_layer":"accuracy"}) - - -############################################################################################# -################################# TRAINING LAUNCH ########################################### -############################################################################################# - -checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1, - save_best_only=True, mode='auto', period=1) - -epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv") - - 
diff --git a/train_test_split_geonames.py b/train_test_split_geonames.py
deleted file mode 100644
index 9aaf44907cbe713af2570c4256ad27d447c6ad97..0000000000000000000000000000000000000000
--- a/train_test_split_geonames.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import argparse
-import logging
-
-import pandas as pd
-
-logging.basicConfig(
-    format='[%(asctime)s][%(levelname)s] %(message)s ',
-    datefmt='%m/%d/%Y %I:%M:%S %p',
-    level=logging.INFO
-    )
-
-from sklearn.model_selection import train_test_split
-
-from lib.utils_geo import latlon2healpix
-from helpers import read_geonames
-
-parser = argparse.ArgumentParser()
-parser.add_argument("geoname_file")
-parser.add_argument("--feature_classes", help="List of feature classes to keep", default="A P")
-
-args = parser.parse_args()
-
-# LOAD DATA
-GEONAME_FN = args.geoname_file
-FEATURE_CLASSES = args.feature_classes
-
-logging.info("Load Geonames data...")
-geoname_data = read_geonames(GEONAME_FN).fillna("")
-logging.info("Geonames data loaded!")
loaded!") - -# SELECT ENTRY with class == to A and P (Areas and Populated Places) -filtered = geoname_data[geoname_data.feature_class.isin(FEATURE_CLASSES.split())].copy() # Only take area and populated places - -filtered["cat"] = filtered.apply(lambda x:latlon2healpix(x.latitude,x.longitude,64),axis=1) -# TRAIN AND TEST SPLIT -logging.info("Split Between Train and Test") - -# Cell can be empty -cat_unique = filtered.cat.unique() -ci=0 -while 1: - if len(filtered[filtered.cat == cat_unique[ci]])> 1: - X_train,X_test = train_test_split(filtered[filtered.cat == cat_unique[ci]]) - break - ci+=1 - -for i in cat_unique[ci:] : - try: - x_train,x_test = train_test_split(filtered[filtered.cat == i]) - X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test)) - except: - pass #print("Error",len(filtered[filtered.cat == i])) - - -del X_train["cat"] -del X_test["cat"] - -# SAVING THE DATA -logging.info("Saving Output !") -X_train.to_csv(GEONAME_FN+"_train.csv") -X_test.to_csv(GEONAME_FN+"_test.csv") \ No newline at end of file