diff --git a/combination_embeddingsv3.py b/combination_embeddingsv3.py index 86e4fcb0593c3f5c239ef404681c56f2da756817..b1d3745490405c7bf491cc574c0fa7a5935278a3 100644 --- a/combination_embeddingsv3.py +++ b/combination_embeddingsv3.py @@ -72,7 +72,7 @@ logging.basicConfig( ) args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\ - .parse_args()#("-w --wikipedia-cooc-fn subsetCoocALL.csv ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split()) + .parse_args()#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split()) # ################################################# diff --git a/combination_embeddingsv3inverse.py b/combination_embeddingsv3inverse.py new file mode 100644 index 0000000000000000000000000000000000000000..fe9940fa145eb65f4d1f7f7981c96c6a6bda48b3 --- /dev/null +++ b/combination_embeddingsv3inverse.py @@ -0,0 +1,401 @@ +# Base module +import re +import os +import json + +# Structure +import pandas as pd +import numpy as np +import geopandas as gpd + +# DEEPL module +from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout +from keras.models import Model +from keras import backend as K +from keras.callbacks import ModelCheckpoint + +import tensorflow as tf + +# Geometry +from shapely.geometry import Point + +# Custom module +from helpers import read_geonames +from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds +from lib.ngram_index import NgramIndex +from lib.utils import ConfigurationReader +from lib.metrics import lat_accuracy,lon_accuracy +from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle + + +# Logging +from tqdm import tqdm +import logging +from helpers import parse_title_wiki,EpochTimer + +logging.getLogger('gensim').setLevel(logging.WARNING) + +def get_new_ids(cooc_data,id_first_value): + """ + Return new ids from cooccurrence data + + Parameters + ---------- + cooc_data : pd.DataFrame + cooccurrence da + id_first_value : int + id beginning value + + Returns + ------- + dict + new ids for each toponyms + """ + topo_id = {} + id_ = id_first_value + for title in cooc_data.title.values: + if not title in topo_id: + id_+=1 + topo_id[id_]=title + for interlinks in cooc_data.interlinks.values: + for interlink in interlinks.split("|"): + if not interlink in topo_id: + id_+=1 + topo_id[id_]=interlink + return topo_id + +# LOGGING CONF +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) + +args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\ + .parse_args()#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split()) + +# +################################################# +############# MODEL TRAINING PARAMETER ########## +################################################# +MODEL_NAME = "Bi-LSTM_NGRAM" +NGRAM_SIZE = args.ngram_size +ACCURACY_TOLERANCE = 50#args.tolerance_value +EPOCHS = args.epochs +ITER_ADJACENCY = args.adjacency_iteration +COOC_SAMPLING_NUMBER = args.cooc_sample_size +WORDVEC_ITER = args.ngram_word2vec_iter +EMBEDDING_DIM = 256 +################################################# +########## FILENAME VARIABLE #################### 
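Note on `get_new_ids` above: the membership test `if not title in topo_id` checks the dictionary keys, which are the integer ids, so a toponym that occurs several times (as a title and again as an interlink) receives a fresh id on every occurrence, and only the last assignment survives once the mapping is inverted into `wikipediatitle_id` further down. A minimal deduplicating sketch, keyed on the titles themselves (hypothetical helper, not part of this patch):

    def get_new_ids_dedup(cooc_data, id_first_value):
        title_to_id = {}
        id_ = id_first_value
        for title in cooc_data.title.values:
            if title not in title_to_id:
                id_ += 1
                title_to_id[title] = id_
        for interlinks in cooc_data.interlinks.values:
            for interlink in interlinks.split("|"):
                if interlink not in title_to_id:
                    id_ += 1
                    title_to_id[interlink] = id_
        # invert to keep the original id -> title orientation
        return {v: k for k, v in title_to_id.items()}
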
+################################################# +GEONAME_FN = args.geoname_input +DATASET_NAME = args.geoname_input.split("/")[-1] +GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input +REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1 +ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format( + GEONAME_FN, + ITER_ADJACENCY, + REGION_SUFFIX_FN) + +COOC_FN = args.wikipedia_cooc_fn +PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format( + GEONAME_FN.split("/")[-1], + EPOCHS, + NGRAM_SIZE, + ACCURACY_TOLERANCE, + REGION_SUFFIX_FN) + +REL_CODE="" +if args.adjacency: + PREFIX_OUTPUT_FN += "_A" + REL_CODE+= "A" +if args.inclusion: + PREFIX_OUTPUT_FN += "_I" + REL_CODE+= "I" +if args.wikipedia_cooc: + PREFIX_OUTPUT_FN += "_C" + REL_CODE+= "C" + +MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN) +INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN) +HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) + +from lib.utils import MetaDataSerializer + +meta_data = MetaDataSerializer( + MODEL_NAME, + DATASET_NAME, + REL_CODE, + COOC_SAMPLING_NUMBER, + ITER_ADJACENCY, + NGRAM_SIZE, + ACCURACY_TOLERANCE, + EPOCHS, + EMBEDDING_DIM, + WORDVEC_ITER, + INDEX_FN, + MODEL_OUTPUT_FN, + HISTORY_FN +) +meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN)) + +############################################################################################# +################################# LOAD DATA ################################################# +############################################################################################# + +# LOAD Geonames DATA +logging.info("Load Geonames data...") +geoname_data = read_geonames(GEONAME_FN).fillna("") + +train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values) +test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values) + +logging.info("Geonames data loaded!") + +# SELECT ENTRY with class == to A and P (Areas and Populated Places) +filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places +#CLEAR RAM +del geoname_data + + +# IF REGION +if args.admin_code_1 != "None": + filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() + +# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS +filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD + + + +############################################################################################# +################################# RETRIEVE RELATIONSHIPS #################################### +############################################################################################# + + +# INITIALIZE RELATION STORE +rel_store = [] + +# Retrieve adjacency relationships +if args.adjacency: + logging.info("Retrieve adjacency relationships ! ") + + if not os.path.exists(ADJACENCY_REL_FILENAME): + bounds = get_bounds(filtered) # Required to get adjacency relationships + rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY)) + json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w')) + else: + logging.info("Open and load data from previous computation!") + rel_store=json.load(open(ADJACENCY_REL_FILENAME)) + + logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) + +# Retrieve inclusion relationships +if args.inclusion: + logging.info("Retrieve inclusion relationships ! 
") + + cpt_rel = len(rel_store) + rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN)) + + logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel)) + + + +if args.wikipedia_cooc: + logging.info("Load Wikipedia Cooccurrence data and merge with geonames") + + cooc_data = pd.read_csv(COOC_FN,sep="\t") + cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) + cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) + id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max()) + wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} + title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))} + cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) + filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) + train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t") + if not "title" in train_cooc_indices: + train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") + train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) + test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) + + logging.info("Merged with Geonames data !") + + # EXTRACT rel + logging.info("Extracting cooccurrence relationships") + cpt=0 + for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): + for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER): + cpt+=1 + rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) + logging.info("Extract {0} cooccurrence relationships !".format(cpt)) + + +# STORE ID to name +geoname2name = dict(filtered["geonameid name".split()].values) + +# ENCODING NAME USING N-GRAM SPLITTING +logging.info("Encoding toponyms to ngram...") +index = NgramIndex(NGRAM_SIZE) + + # Identify all ngram available +filtered.name.apply(lambda x : index.split_and_add(x)) +if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id] + +geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association + +if args.wikipedia_cooc: + geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()}) + +# SAVE THE INDEX TO REUSE THE MODEL +index.save(INDEX_FN) + +logging.info("Done !") + + +############################################################################################# +################################# ENCODE COORDINATES ######################################## +############################################################################################# + + + +# Encode each geonames entry coordinates +geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()} +# CLEAR RAM +del filtered + + +EMBEDDING_DIM = 256 +num_words = len(index.index_ngram) # necessary for the embedding matrix + +logging.info("Preparing Input and Output data...") + + +############################################################################################# +################################# BUILD TRAIN/TEST DATASETS ################################# 
+############################################################################################# + +X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] +X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] +y_train,y_test = [],[] + +for couple in rel_store: + geonameId_1,geonameId_2 = couple[0],couple[1] + if not geonameId_1 in geoname2encodedname: + continue + top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] + if geonameId_1 in train_indices: #and geonameId_2 in train_indices: + + X_1_train.append(top1) + X_2_train.append(top2) + + y_train.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]]) + #y_lon_train.append(geoname_vec[geonameId_1][0]) + #y_lat_train.append(geoname_vec[geonameId_1][1]) + + else: + X_1_test.append(top2) + X_2_test.append(top1) + + y_test.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]]) + #y_lon_test.append(geoname_vec[geonameId_1][0]) + #y_lat_test.append(geoname_vec[geonameId_1][1]) + +# NUMPYZE inputs and output lists +X_1_train = np.array(X_1_train) +X_2_train = np.array(X_2_train) +y_lat_train = np.array(y_lat_train) +y_lon_train = np.array(y_lon_train) +y_train = np.array(y_train) + +X_1_test = np.array(X_1_test) +X_2_test = np.array(X_2_test) +y_lat_test = np.array(y_lat_test) +y_lon_test = np.array(y_lon_test) +y_test = np.array(y_test) + +logging.info("Data prepared !") + + +# check for output dir +if not os.path.exists("outputs/"): + os.makedirs("outputs/") + +############################################################################################# +################################# NGRAM EMBEDDINGS ########################################## +############################################################################################# + + +logging.info("Generating N-GRAM Embedding...") +embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER) +logging.info("Embedding generated !") + +############################################################################################# +################################# MODEL DEFINITION ########################################## +############################################################################################# + + +input_1 = Input(shape=(index.max_len,)) +input_2 = Input(shape=(index.max_len,)) + +embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True) + +x1 = embedding_layer(input_1) +x2 = embedding_layer(input_2) + +# Each LSTM learn on a permutation of the input toponyms +x1 = Bidirectional(LSTM(98))(x1) +x2 = Bidirectional(LSTM(98))(x2) + +x = concatenate([x1,x2])#,x3]) + +x1 = Dense(500,activation="relu")(x) +# x1 = Dropout(0.3)(x1) +x1 = Dense(500,activation="relu")(x1) +# x1 = Dropout(0.3)(x1) + +x2 = Dense(500,activation="relu")(x) +# x2 = Dropout(0.3)(x2) +x2 = Dense(500,activation="relu")(x2) +# x2 = Dropout(0.3)(x2) + +output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) +output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) + +output_coord = concatenate([output_lon,output_lat],name="output_coord") + +model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3 + +model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)}) + +# model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 + +# model.compile(loss=['mean_squared_error','mean_squared_error'], 
optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()}) + + +############################################################################################# +################################# TRAINING LAUNCH ########################################### +############################################################################################# + +checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1, + save_best_only=True, mode='auto', period=1) + +epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv") + + +history = model.fit(x=[X_1_train,X_2_train], + y=y_train,#[y_lon_train,y_lat_train], + verbose=True, batch_size=100, + epochs=EPOCHS, + validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]), + callbacks=[checkpoint,epoch_timer]) + + +hist_df = pd.DataFrame(history.history) +hist_df.to_csv(HISTORY_FN) + +model.save(MODEL_OUTPUT_FN) + +# Erase Model Checkpoint file +if os.path.exists(MODEL_OUTPUT_FN + ".part"): + import shutil + shutil.rmtree(MODEL_OUTPUT_FN + ".part") \ No newline at end of file diff --git a/combination_embeddingsv4.py b/combination_embeddingsv4.py new file mode 100644 index 0000000000000000000000000000000000000000..b1d3745490405c7bf491cc574c0fa7a5935278a3 --- /dev/null +++ b/combination_embeddingsv4.py @@ -0,0 +1,401 @@ +# Base module +import re +import os +import json + +# Structure +import pandas as pd +import numpy as np +import geopandas as gpd + +# DEEPL module +from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout +from keras.models import Model +from keras import backend as K +from keras.callbacks import ModelCheckpoint + +import tensorflow as tf + +# Geometry +from shapely.geometry import Point + +# Custom module +from helpers import read_geonames +from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds +from lib.ngram_index import NgramIndex +from lib.utils import ConfigurationReader +from lib.metrics import lat_accuracy,lon_accuracy +from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle + + +# Logging +from tqdm import tqdm +import logging +from helpers import parse_title_wiki,EpochTimer + +logging.getLogger('gensim').setLevel(logging.WARNING) + +def get_new_ids(cooc_data,id_first_value): + """ + Return new ids from cooccurrence data + + Parameters + ---------- + cooc_data : pd.DataFrame + cooccurrence da + id_first_value : int + id beginning value + + Returns + ------- + dict + new ids for each toponyms + """ + topo_id = {} + id_ = id_first_value + for title in cooc_data.title.values: + if not title in topo_id: + id_+=1 + topo_id[id_]=title + for interlinks in cooc_data.interlinks.values: + for interlink in interlinks.split("|"): + if not interlink in topo_id: + id_+=1 + topo_id[id_]=interlink + return topo_id + +# LOGGING CONF +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) + +args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\ + .parse_args()#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split()) + +# +################################################# +############# MODEL TRAINING PARAMETER ########## +################################################# +MODEL_NAME = "Bi-LSTM_NGRAM" +NGRAM_SIZE = args.ngram_size +ACCURACY_TOLERANCE = 
args.tolerance_value +EPOCHS = args.epochs +ITER_ADJACENCY = args.adjacency_iteration +COOC_SAMPLING_NUMBER = args.cooc_sample_size +WORDVEC_ITER = args.ngram_word2vec_iter +EMBEDDING_DIM = 256 +################################################# +########## FILENAME VARIABLE #################### +################################################# +GEONAME_FN = args.geoname_input +DATASET_NAME = args.geoname_input.split("/")[-1] +GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input +REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1 +ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format( + GEONAME_FN, + ITER_ADJACENCY, + REGION_SUFFIX_FN) + +COOC_FN = args.wikipedia_cooc_fn +PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format( + GEONAME_FN.split("/")[-1], + EPOCHS, + NGRAM_SIZE, + ACCURACY_TOLERANCE, + REGION_SUFFIX_FN) + +REL_CODE="" +if args.adjacency: + PREFIX_OUTPUT_FN += "_A" + REL_CODE+= "A" +if args.inclusion: + PREFIX_OUTPUT_FN += "_I" + REL_CODE+= "I" +if args.wikipedia_cooc: + PREFIX_OUTPUT_FN += "_C" + REL_CODE+= "C" + +MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN) +INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN) +HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) + +from lib.utils import MetaDataSerializer + +meta_data = MetaDataSerializer( + MODEL_NAME, + DATASET_NAME, + REL_CODE, + COOC_SAMPLING_NUMBER, + ITER_ADJACENCY, + NGRAM_SIZE, + ACCURACY_TOLERANCE, + EPOCHS, + EMBEDDING_DIM, + WORDVEC_ITER, + INDEX_FN, + MODEL_OUTPUT_FN, + HISTORY_FN +) +meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN)) + +############################################################################################# +################################# LOAD DATA ################################################# +############################################################################################# + +# LOAD Geonames DATA +logging.info("Load Geonames data...") +geoname_data = read_geonames(GEONAME_FN).fillna("") + +train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values) +test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values) + +logging.info("Geonames data loaded!") + +# SELECT ENTRY with class == to A and P (Areas and Populated Places) +filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places +#CLEAR RAM +del geoname_data + + +# IF REGION +if args.admin_code_1 != "None": + filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() + +# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS +filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD + + + +############################################################################################# +################################# RETRIEVE RELATIONSHIPS #################################### +############################################################################################# + + +# INITIALIZE RELATION STORE +rel_store = [] + +# Retrieve adjacency relationships +if args.adjacency: + logging.info("Retrieve adjacency relationships ! 
") + + if not os.path.exists(ADJACENCY_REL_FILENAME): + bounds = get_bounds(filtered) # Required to get adjacency relationships + rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY)) + json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w')) + else: + logging.info("Open and load data from previous computation!") + rel_store=json.load(open(ADJACENCY_REL_FILENAME)) + + logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) + +# Retrieve inclusion relationships +if args.inclusion: + logging.info("Retrieve inclusion relationships ! ") + + cpt_rel = len(rel_store) + rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN)) + + logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel)) + + + +if args.wikipedia_cooc: + logging.info("Load Wikipedia Cooccurrence data and merge with geonames") + + cooc_data = pd.read_csv(COOC_FN,sep="\t") + cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) + cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) + id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max()) + wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} + title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))} + cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) + filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) + train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t") + if not "title" in train_cooc_indices: + train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") + train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) + test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) + + logging.info("Merged with Geonames data !") + + # EXTRACT rel + logging.info("Extracting cooccurrence relationships") + cpt=0 + for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): + for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER): + cpt+=1 + rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) + logging.info("Extract {0} cooccurrence relationships !".format(cpt)) + + +# STORE ID to name +geoname2name = dict(filtered["geonameid name".split()].values) + +# ENCODING NAME USING N-GRAM SPLITTING +logging.info("Encoding toponyms to ngram...") +index = NgramIndex(NGRAM_SIZE) + + # Identify all ngram available +filtered.name.apply(lambda x : index.split_and_add(x)) +if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id] + +geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association + +if args.wikipedia_cooc: + geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()}) + +# SAVE THE INDEX TO REUSE THE MODEL +index.save(INDEX_FN) + +logging.info("Done !") + + +############################################################################################# +################################# ENCODE COORDINATES ######################################## 
+############################################################################################# + + + +# Encode each geonames entry coordinates +geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()} +# CLEAR RAM +del filtered + + +EMBEDDING_DIM = 256 +num_words = len(index.index_ngram) # necessary for the embedding matrix + +logging.info("Preparing Input and Output data...") + + +############################################################################################# +################################# BUILD TRAIN/TEST DATASETS ################################# +############################################################################################# + +X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] +X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] +y_train,y_test = [],[] + +for couple in rel_store: + geonameId_1,geonameId_2 = couple[0],couple[1] + if not geonameId_1 in geoname2encodedname: + continue + top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] + if geonameId_1 in train_indices: #and geonameId_2 in train_indices: + + X_1_train.append(top1) + X_2_train.append(top2) + + y_train.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]]) + #y_lon_train.append(geoname_vec[geonameId_1][0]) + #y_lat_train.append(geoname_vec[geonameId_1][1]) + + else: + X_1_test.append(top1) + X_2_test.append(top2) + + y_test.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]]) + #y_lon_test.append(geoname_vec[geonameId_1][0]) + #y_lat_test.append(geoname_vec[geonameId_1][1]) + +# NUMPYZE inputs and output lists +X_1_train = np.array(X_1_train) +X_2_train = np.array(X_2_train) +y_lat_train = np.array(y_lat_train) +y_lon_train = np.array(y_lon_train) +y_train = np.array(y_train) + +X_1_test = np.array(X_1_test) +X_2_test = np.array(X_2_test) +y_lat_test = np.array(y_lat_test) +y_lon_test = np.array(y_lon_test) +y_test = np.array(y_test) + +logging.info("Data prepared !") + + +# check for output dir +if not os.path.exists("outputs/"): + os.makedirs("outputs/") + +############################################################################################# +################################# NGRAM EMBEDDINGS ########################################## +############################################################################################# + + +logging.info("Generating N-GRAM Embedding...") +embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER) +logging.info("Embedding generated !") + +############################################################################################# +################################# MODEL DEFINITION ########################################## +############################################################################################# + + +input_1 = Input(shape=(index.max_len,)) +input_2 = Input(shape=(index.max_len,)) + +embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True) + +x1 = embedding_layer(input_1) +x2 = embedding_layer(input_2) + +# Each LSTM learn on a permutation of the input toponyms +x1 = Bidirectional(LSTM(98))(x1) +x2 = Bidirectional(LSTM(98))(x2) + +x = concatenate([x1,x2])#,x3]) + +x1 = Dense(500,activation="relu")(x) +# x1 = Dropout(0.3)(x1) +x1 = Dense(500,activation="relu")(x1) +# x1 = Dropout(0.3)(x1) + +x2 = Dense(500,activation="relu")(x) +# x2 = Dropout(0.3)(x2) +x2 = 
Dense(500,activation="relu")(x2) +# x2 = Dropout(0.3)(x2) + +output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) +output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) + +output_coord = concatenate([output_lon,output_lat],name="output_coord") + +model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3 + +model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)}) + +# model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 + +# model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()}) + + +############################################################################################# +################################# TRAINING LAUNCH ########################################### +############################################################################################# + +checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1, + save_best_only=True, mode='auto', period=1) + +epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv") + + +history = model.fit(x=[X_1_train,X_2_train], + y=y_train,#[y_lon_train,y_lat_train], + verbose=True, batch_size=100, + epochs=EPOCHS, + validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]), + callbacks=[checkpoint,epoch_timer]) + + +hist_df = pd.DataFrame(history.history) +hist_df.to_csv(HISTORY_FN) + +model.save(MODEL_OUTPUT_FN) + +# Erase Model Checkpoint file +if os.path.exists(MODEL_OUTPUT_FN + ".part"): + import shutil + shutil.rmtree(MODEL_OUTPUT_FN + ".part") \ No newline at end of file diff --git a/run_train.py b/run_train.py index 057b91df9ab8162bf47f19e377e2678a9d3e795b..49753c5308bb6519a631a3fc7d578480571bf45b 100644 --- a/run_train.py +++ b/run_train.py @@ -4,7 +4,7 @@ c_f = "--wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt" # Init GridsearchModel grid = GridSearchModel(\ - "python3 combination_embeddingsv3.py", + "python3 combination_embeddingsv3inverse.py", **OrderedDict({ # necessary because some args have to be given in a certain order "rel":["-w "+c_f,("-i -w "+c_f),"-a -w "+c_f,"-a -i -w "+c_f], # ,"-a -i -w "+c_f ,"-i -a" "-n":[4], diff --git a/scripts/randoludo.py b/scripts/randoludo.py index 7428edeaf51ef4c409a0da3a8a994f20e420b3f6..f1826b534c6d246042dbf0b04442e369746c9b77 100644 --- a/scripts/randoludo.py +++ b/scripts/randoludo.py @@ -1,6 +1,10 @@ +import pandas as pd +import numpy as np + from lib.geocoder import Geocoder + geocoder = Geocoder("./outputs/FR_MODEL_2/FR.txt_100_4_100__A_I_C.h5","./outputs/FR_MODEL_2/FR.txt_100_4_100__A_I_C_index") -import pandas as pd + df = pd.read_csv("data/rando_toponymes.tsv",sep="\t") df["name"]=df.name.apply(lambda x:x.split("¦")[0]) diff --git a/templates/pair_topo.html b/templates/pair_topo.html index 0a03639deeb0cd2178ddd16239a923e880ee4e2d..3366643bf04e1fe1f04333ab99749d021094ac6a 100644 --- a/templates/pair_topo.html +++ b/templates/pair_topo.html @@ -43,6 +43,12 @@ }).addTo(mymap); {% if lat and lon %} var marker = L.marker([{{lat}}, {{lon}}]).addTo(mymap); + var circle = L.circle([{{lat}}, {{lon}}], { + color: "red", + fillColor: "#f03", + fillOpacity: 0.5, + radius: 100000.0 + }).addTo(mymap); {% endif %} </script> diff --git a/templates/text.html b/templates/text.html index ef3ba60bf78b920bc4d519ca01c6abc441eeae9c..677b26e94220b349f834c331c5a92a5ac6dee561 100644 --- 
a/templates/text.html +++ b/templates/text.html @@ -46,6 +46,12 @@ var mark = L.marker([{{coords["lat"]}}, {{coords["lon"]}}],); mark.bindPopup("{{place}}") mark.addTo(mymap); + var circle = L.circle([{{coords["lat"]}}, {{coords["lon"]}}], { + color: "red", + fillColor: "#f03", + fillOpacity: 0.5, + radius: 100000.0 + }).addTo(mymap); {% endfor %} </script>
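
Two remarks on this patch. In the templates, `L.circle` takes its radius in metres, so 100000.0 draws a 100 km uncertainty circle around each predicted point. In both new training scripts, `ModelCheckpoint` with an `.h5`-style path writes the `.part` checkpoint as a single file, and `shutil.rmtree` raises `NotADirectoryError` on a regular file; assuming the checkpoint really is a plain file, a safer cleanup would look like this sketch:

    import os
    import shutil

    part_fn = MODEL_OUTPUT_FN + ".part"
    if os.path.isfile(part_fn):
        os.remove(part_fn)        # single HDF5 checkpoint file
    elif os.path.isdir(part_fn):
        shutil.rmtree(part_fn)    # only if the checkpoint were a directory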