diff --git a/combination_embeddings.py b/combination_embeddings.py
index 24712bae251b5673bdcd07ada9dfb9a3b07b55a0..599ad1a8993547b7b17b88e4d4348aafbb728c2d 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -70,7 +70,7 @@ logging.basicConfig(
 )

 args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
-    .parse_args()#("-i -e 5 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
+    .parse_args()#("-w --wikipedia-cooc-fn subsetCoocALL.csv ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split())

 #
 #################################################
@@ -205,11 +205,10 @@ if args.wikipedia_cooc:
     cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
     id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max())
     wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
-    title_coord = {row.title: (row.longitude,row.latitude) for _,row in cooc_data.iterrows()}
+    title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))}
     cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
     filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
-
-    train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
+    train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t")
     train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
     test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))

diff --git a/combination_embeddingsv2.py b/combination_embeddingsv2.py
index 6bec212c0d54eabba808c0fe2c0a9f7783659e42..dcde16a21e388f460a4f91d31cd08c7c62c97353 100644
--- a/combination_embeddingsv2.py
+++ b/combination_embeddingsv2.py
@@ -3,16 +3,21 @@ import os

 # Structure
 import pandas as pd
+
 # DEEPL module
 from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM
 from keras.models import Model
 from keras.callbacks import ModelCheckpoint
+from tensorflow.keras.layers import Lambda
+import keras.backend as K
+import tensorflow as tf

 # Custom module
 from lib.ngram_index import NgramIndex
 from lib.utils import ConfigurationReader, MetaDataSerializer
 from lib.metrics import lat_accuracy,lon_accuracy
 from data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
+from lib.geo import haversine_tf,accuracy_k

 # Logging
 import logging
@@ -29,19 +34,22 @@ logging.basicConfig(
 )

 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
-    .parse_args()#("-w -e 100 ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split())
+    .parse_args("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
+
+#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())

 #
 #################################################
 ############# MODEL TRAINING PARAMETER ##########
 #################################################
 NGRAM_SIZE = args.ngram_size
-ACCURACY_TOLERANCE = args.tolerance_value
+ACCURACY_TOLERANCE = args.k_value
 EPOCHS = args.epochs
-ITER_ADJACENCY = args.adjacency_iteration
-COOC_SAMPLING_NUMBER = args.cooc_sample_size
-WORDVEC_ITER = args.ngram_word2vec_iter
-EMBEDDING_DIM = 100
+ADJACENCY_SAMPLING = args.adjacency_sample
+COOC_SAMPLING = args.cooc_sample
+WORDVEC_ITER = 50
+EMBEDDING_DIM = args.dimension
+BATCH_SIZE = args.batch_size
 #################################################
 ########## FILENAME VARIABLE ####################
 #################################################
@@ -49,22 +57,17 @@ EMBEDDING_DIM = 100

 if not os.path.exists("outputs/"):
     os.makedirs("outputs/")

-GEONAME_FN = "ALL"#args.geoname_input
-DATASET_NAME = "ALL"#args.geoname_input.split("/")[-1]
-GEONAMES_HIERARCHY_FN = ""#args.geoname_hierachy_input
-REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
-ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(
-        GEONAME_FN,
-        ITER_ADJACENCY,
-        REGION_SUFFIX_FN)
-
+GEONAME_FN = args.geoname_input
+DATASET_NAME = args.geoname_input.split("/")[-1]
+GEONAMES_HIERARCHY_FN = args.inclusion_fn
+ADJACENCY_REL_FILENAME = args.adjacency_fn
 COOC_FN = args.wikipedia_cooc_fn
-PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
+
+PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
     GEONAME_FN.split("/")[-1],
     EPOCHS,
     NGRAM_SIZE,
-    ACCURACY_TOLERANCE,
-    REGION_SUFFIX_FN)
+    ACCURACY_TOLERANCE)
 REL_CODE=""
 if args.adjacency:
@@ -85,8 +88,8 @@ HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
 meta_data = MetaDataSerializer(
     DATASET_NAME,
     REL_CODE,
-    COOC_SAMPLING_NUMBER,
-    ITER_ADJACENCY,
+    COOC_SAMPLING,
+    ADJACENCY_SAMPLING,
     NGRAM_SIZE,
     ACCURACY_TOLERANCE,
     EPOCHS,
@@ -103,16 +106,30 @@ meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))

 index = NgramIndex.load(args.ngram_index_fn)

-#c_train = CoOccurrences(COOC_FN + "_train.csv",sampling=1)
-#c_test = CoOccurrences(COOC_FN + "_test.csv",sampling=1)
+train_src = []
+test_src = []
+
+if args.wikipedia_cooc:
+    train_src.append(CoOccurrences(COOC_FN + "_train.csv",sampling=4))
+    test_src.append(CoOccurrences(COOC_FN + "_test.csv",sampling=4))
+
+if args.adjacency:
+    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
+    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
+    train_src.append(a_train)
+    test_src.append(a_test)
+
+if args.inclusion:
+    i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv")
+    i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv")
+    train_src.append(i_train)
+    test_src.append(i_test)

 #Adjacency
-a_train = Adjacency(COOC_FN + "_train.csv","../data/geonamesData/allCountries.txt",sampling=1,gzip=False)
-a_test = Adjacency(COOC_FN + "_test.csv","../data/geonamesData/allCountries.txt",sampling=1,gzip=False)
-BATCH_SIZE = 100
-d_train = DataGenerator([a_train],index,batch_size=BATCH_SIZE)
-d_test = DataGenerator([a_test],index,batch_size=BATCH_SIZE)
+
+d_train = DataGenerator(train_src,index,batch_size=BATCH_SIZE)
+d_test = DataGenerator(test_src,index,batch_size=BATCH_SIZE)

 num_words = len(index.index_ngram)
@@ -147,30 +164,43 @@ x1 = Dense(500,
     activation="relu",
     kernel_regularizer=regularizers.l2(0.01)
     )(x)
-# x1 = Dropout(0.3)(x1)
+x1 = Dropout(0.3)(x1)
 x1 = Dense(500,
     activation="relu",
     kernel_regularizer=regularizers.l2(0.01)
     )(x1)
-# x1 = Dropout(0.3)(x1)
+x1 = Dropout(0.3)(x1)

 x2 = Dense(500,
     activation="relu",
     kernel_regularizer=regularizers.l2(0.01)
     )(x)
-# x2 = Dropout(0.3)(x2)
+x2 = Dropout(0.3)(x2)
 x2 = Dense(500,
     activation="relu",
     kernel_regularizer=regularizers.l2(0.01)
     )(x2)
-# x2 = Dropout(0.3)(x2)
+x2 = Dropout(0.3)(x2)

 output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
 output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)

+from keras.layers import Lambda
+
+def to_wgs84_lat(lat):
+    return ((lat*180)-90)
+def to_wgs84_lon(lon):
+    return ((lon*360)-180)
+
+#output_lon = Lambda(to_wgs84_lon)(output_lon)
+#output_lat = Lambda(to_wgs84_lat)(output_lat) # still between 0 and 1 to avoid loss value explosion
+
+
+
+output = concatenate([output_lon,output_lat],name="output_layer")

-model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
+model = Model(inputs = [input_1,input_2], outputs = output)#input_3

-model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='rmsprop',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
+model.compile(loss=haversine_tf, optimizer='adam',metrics=[accuracy_k(ACCURACY_TOLERANCE)])

 #############################################################################################
diff --git a/data_generator.py b/data_generator.py
index 02e246117e02f2870f09189e435e91987e43888d..12d38bb17c42c3c6e469359018bc355fb11d17cb 100644
--- a/data_generator.py
+++ b/data_generator.py
@@ -295,7 +295,8 @@ class DataGenerator(keras.utils.Sequence):
             X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context)]
             y[i] = [*zero_one_encoding(longitude,latitude)]

-        return [X[:,0],X[:,1]], [y[:,0],y[:,1]]
+        #y[i] = [longitude,latitude]
+        return [X[:,0],X[:,1]], y#[y[:,0],y[:,1]]

     def on_epoch_end(self):
         'Updates indexes after each epoch'
diff --git a/helpers.py b/helpers.py
index d665f8f8f7419d9dbb02557b8dc360bf6dcf86da..b093f62f25a9dce7913cca5373141f13f1753554 100644
--- a/helpers.py
+++ b/helpers.py
@@ -90,7 +90,7 @@ def parse_title_wiki(title_wiki):
     str
         parsed wikipedia title
     """
-    return re.sub("\(.*\)", "", title_wiki).strip().lower()
+    return re.sub("\(.*\)", "", str(title_wiki)).strip().lower()


 def _split(lst, n, complete_chunk_value):
diff --git a/lib/geo.py b/lib/geo.py
index 5b809bb8708ed2f6dc942c14b594f5abf243fe1a..6841247eaf0b4d1137d73538dbf0359ff1f9dcc1 100644
--- a/lib/geo.py
+++ b/lib/geo.py
@@ -14,7 +14,54 @@ from helpers import read_geonames
 from tqdm import tqdm
 from joblib import Parallel,delayed

+import tensorflow as tf
+import keras.backend as K
+
+def tf_deg2rad(deg):
+    pi_on_180 = 0.017453292519943295
+    return deg * pi_on_180
+
+def haversine_tf(y_true,y_pred):
+    """
+    Return the haversine distance between true and predicted (lon,lat) coordinates.
+
+    Parameters
+    ----------
+    y_true : tensor
+        ground-truth coordinates, shape (batch_size,2), ordered (longitude,latitude)
+    y_pred : tensor
+        predicted coordinates, shape (batch_size,2), ordered (longitude,latitude)
+
+    Returns
+    -------
+    tensor
+        distance value(s), in kilometers when the inputs are expressed in degrees
+    """
+    lon1, lat1, lon2, lat2 = map(tf_deg2rad, [y_true[:,0], y_true[:,1], y_pred[:,0], y_pred[:,1]])
+
+    dlon = lon2 - lon1
+    dlat = lat2 - lat1
+    a = K.sin(dlat/2.0)**2 + K.cos(lat1) * K.cos(lat2) * K.sin(dlon/2.0)**2
+
+    return 6367 * 2 * tf.math.asin(K.sqrt(a))
+
+def to_wgs84_lat(lat):
+    return ((lat*180)-90)
+def to_wgs84_lon(lon):
+    return ((lon*360)-180)
+
+def to_wgs84(x):
+    lon=to_wgs84_lon(x[:,0])
+    lat=to_wgs84_lat(x[:,1])
+    return tf.stack([lon,lat],axis=1)
+
+def accuracy_k(k=100): #km
+    def compute_metric(y_true,y_pred):
+        return K.less_equal(haversine_tf(to_wgs84(y_true),to_wgs84(y_pred)),k)
+    return compute_metric

 def haversine_pd(lon1, lat1, lon2, lat2):
     """
diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json
index 13e622b173e2018ab4f7c8f5df1994b8295ffe8d..a7dc96c78f74d703cad17f6c64f4e4a90c97dfa9 100644
--- a/parser_config/toponym_combination_embedding.json
+++ b/parser_config/toponym_combination_embedding.json
@@ -8,7 +8,7 @@
     { "short": "-a", "long": "--adjacency", "action": "store_true" },
     { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" },
     { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"},
-    { "long": "--cooc-sample-size", "type": "int", "default": 3 },
+    { "long": "--cooc-sample-size", "type": "int", "default": 1 },
     {"long": "--adjacency-iteration", "type":"int","default":1},
     { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
     { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
diff --git a/parser_config/toponym_combination_embedding_v2.json b/parser_config/toponym_combination_embedding_v2.json
index 050c8446e7d885c60706f05bfdbfe4a19029c0a3..f0fb1fd6ceee135e98990ab6c71662e892f1dc06 100644
--- a/parser_config/toponym_combination_embedding_v2.json
+++ b/parser_config/toponym_combination_embedding_v2.json
@@ -1,21 +1,22 @@
 {
     "description": "Toponym Combination",
     "args": [
+        { "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." },
         { "short": "ngram_index_fn", "help": "Filepath of the NgramIndex file you want to use." },
         { "short": "embedding_fn", "help": "Filepath of the Embedding file you want to use." },
+        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 },
+        { "short": "-d", "long": "--dimension", "type": "int", "default": 100 },
         { "short": "-v", "long": "--verbose", "action": "store_true" },
         { "short": "-i", "long": "--inclusion", "action": "store_true" },
         { "short": "-a", "long": "--adjacency", "action": "store_true" },
         { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" },
+        { "long": "--inclusion-fn","help":"Inclusion (hierarchy) data filename"},
         { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"},
         { "long": "--adjacency-fn","help":"Adjacency data filename"},
-        { "long": "--cooc-sample-size", "type": "int", "default": 3 },
-        {"long": "--adjacency-iteration", "type":"int","default":1},
-        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
-        { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
-        { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
+        { "long": "--cooc-sample", "type": "int", "default": 3 },
+        {"long": "--adjacency-sample", "type":"int","default":1},
         { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
-        { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
-        { "long": "--admin_code_1", "default": "None" }
+        { "short": "-b", "long": "--batch-size", "type": "int", "default": 100 },
+        { "short": "-k", "long": "--k-value", "type": "float", "default": 100 ,"help":"Used for the accuracy@k metrics. Given in kilometers"}
     ]
 }
\ No newline at end of file
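
For reference, the sketch below mirrors in plain NumPy what the new haversine_tf loss and accuracy_k metric in lib/geo.py compute: decode the [0,1]-encoded (lon,lat) pairs back to WGS84, take the haversine distance with the same 6367 km radius, and count a prediction as correct when it lands within k kilometres of the target. It is a standalone sanity check, not part of the patch; the _np-suffixed names and the sample coordinates are illustrative only.

# sanity_check_accuracy_k.py -- illustrative sketch, not part of the patch
import numpy as np

def deg2rad(deg):
    # same constant as tf_deg2rad in lib/geo.py
    return deg * 0.017453292519943295

def haversine_np(y_true, y_pred):
    # y_true / y_pred: arrays of shape (n, 2) holding (lon, lat) in degrees
    lon1, lat1, lon2, lat2 = map(deg2rad, [y_true[:, 0], y_true[:, 1], y_pred[:, 0], y_pred[:, 1]])
    dlon, dlat = lon2 - lon1, lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    return 6367 * 2 * np.arcsin(np.sqrt(a))

def to_wgs84_np(x):
    # inverse of the zero_one_encoding used by the data generator
    return np.stack([x[:, 0] * 360 - 180, x[:, 1] * 180 - 90], axis=1)

def accuracy_k_np(y_true, y_pred, k=100):
    # fraction of predictions within k kilometres of the target
    return np.mean(haversine_np(to_wgs84_np(y_true), to_wgs84_np(y_pred)) <= k)

# a prediction ~50 km off should count as correct for k=100 but not for k=10
y_true = np.array([[0.5, 0.5]])                # decodes to (0.0, 0.0) in WGS84
y_pred = np.array([[0.5 + 0.45 / 360, 0.5]])   # ~0.45 deg of longitude east, ~50 km at the equator
print(accuracy_k_np(y_true, y_pred, k=100))    # 1.0
print(accuracy_k_np(y_true, y_pred, k=10))     # 0.0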