diff --git a/combination_embeddings.py b/combination_embeddings.py deleted file mode 100644 index dd66e51c23453c8cd1acab2d60e7de5913d4e92c..0000000000000000000000000000000000000000 --- a/combination_embeddings.py +++ /dev/null @@ -1,392 +0,0 @@ -# Base module -import re -import os -import json - -#Â Structure -import pandas as pd -import numpy as np -import geopandas as gpd - -#Â DEEPL module -from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout -from keras.models import Model -from keras import backend as K -from keras.callbacks import ModelCheckpoint - -import tensorflow as tf - -#Â Geometry -from shapely.geometry import Point - -#Â Custom module -from helpers import read_geonames -from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds -from lib.ngram_index import NgramIndex -from lib.utils import ConfigurationReader -from lib.metrics import lat_accuracy,lon_accuracy -from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle - - -# Logging -from tqdm import tqdm -import logging -from helpers import parse_title_wiki,EpochTimer - -logging.getLogger('gensim').setLevel(logging.WARNING) - -def get_new_ids(cooc_data,id_first_value): - """ - Return new ids from cooccurrence data - - Parameters - ---------- - cooc_data : pd.DataFrame - cooccurrence da - id_first_value : int - id beginning value - - Returns - ------- - dict - new ids for each toponyms - """ - topo_id = {} - id_ = id_first_value - for title in cooc_data.title.values: - if not title in topo_id: - id_+=1 - topo_id[id_]=title - for interlinks in cooc_data.interlinks.values: - for interlink in interlinks.split("|"): - if not interlink in topo_id: - id_+=1 - topo_id[id_]=interlink - return topo_id - -# LOGGING CONF -logging.basicConfig( - format='[%(asctime)s][%(levelname)s] %(message)s ', - datefmt='%m/%d/%Y %I:%M:%S %p', - level=logging.INFO - ) - -args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\ - .parse_args()#("-w --wikipedia-cooc-fn subsetCoocALL.csv ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split()) - -# -################################################# -############# MODEL TRAINING PARAMETER ########## -################################################# -MODEL_NAME = "Bi-LSTM_NGRAM" -NGRAM_SIZE = args.ngram_size -ACCURACY_TOLERANCE = args.tolerance_value -EPOCHS = args.epochs -ITER_ADJACENCY = args.adjacency_iteration -COOC_SAMPLING_NUMBER = args.cooc_sample_size -WORDVEC_ITER = args.ngram_word2vec_iter -EMBEDDING_DIM = 256 -################################################# -########## FILENAME VARIABLE #################### -################################################# -GEONAME_FN = args.geoname_input -DATASET_NAME = args.geoname_input.split("/")[-1] -GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input -REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1 -ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format( - GEONAME_FN, - ITER_ADJACENCY, - REGION_SUFFIX_FN) - -COOC_FN = args.wikipedia_cooc_fn -PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format( - GEONAME_FN.split("/")[-1], - EPOCHS, - NGRAM_SIZE, - ACCURACY_TOLERANCE, - REGION_SUFFIX_FN) - -REL_CODE="" -if args.adjacency: - PREFIX_OUTPUT_FN += "_A" - REL_CODE+= "A" -if args.inclusion: - PREFIX_OUTPUT_FN += "_I" - REL_CODE+= "I" -if args.wikipedia_cooc: - PREFIX_OUTPUT_FN += "_C" - REL_CODE+= "C" - -MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN) -INDEX_FN = 
"outputs/{0}_index".format(PREFIX_OUTPUT_FN) -HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) - -from lib.utils import MetaDataSerializer - -meta_data = MetaDataSerializer( - MODEL_NAME, - DATASET_NAME, - REL_CODE, - COOC_SAMPLING_NUMBER, - ITER_ADJACENCY, - NGRAM_SIZE, - ACCURACY_TOLERANCE, - EPOCHS, - EMBEDDING_DIM, - WORDVEC_ITER, - INDEX_FN, - MODEL_OUTPUT_FN, - HISTORY_FN -) -meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN)) - -############################################################################################# -################################# LOAD DATA ################################################# -############################################################################################# - -# LOAD Geonames DATA -logging.info("Load Geonames data...") -geoname_data = read_geonames(GEONAME_FN).fillna("") - -train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values) -test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values) - -logging.info("Geonames data loaded!") - -# SELECT ENTRY with class == to A and P (Areas and Populated Places) -filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() #Â Only take area and populated places -#CLEAR RAM -del geoname_data - - -# IF REGION -if args.admin_code_1 != "None": - filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() - -# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS -filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD - - - -############################################################################################# -################################# RETRIEVE RELATIONSHIPS #################################### -############################################################################################# - - -# INITIALIZE RELATION STORE -rel_store = [] - -# Retrieve adjacency relationships -if args.adjacency: - logging.info("Retrieve adjacency relationships ! ") - - if not os.path.exists(ADJACENCY_REL_FILENAME): - bounds = get_bounds(filtered) # Required to get adjacency relationships - rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY)) - json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w')) - else: - logging.info("Open and load data from previous computation!") - rel_store=json.load(open(ADJACENCY_REL_FILENAME)) - - logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) - -# Retrieve inclusion relationships -if args.inclusion: - logging.info("Retrieve inclusion relationships ! ") - - cpt_rel = len(rel_store) - rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN)) - - logging.info("{0} inclusion relationships retrieved ! 
".format(len(rel_store)-cpt_rel)) - - - -if args.wikipedia_cooc: - logging.info("Load Wikipedia Cooccurrence data and merge with geonames") - - cooc_data = pd.read_csv(COOC_FN,sep="\t") - cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) - cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) - id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max()) - wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} - title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))} - cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) - filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) - train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t") - if not "title" in train_cooc_indices: - train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") - train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) - test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) - - logging.info("Merged with Geonames data !") - - # EXTRACT rel - logging.info("Extracting cooccurrence relationships") - cpt=0 - for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): - for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER): - cpt+=1 - rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) - logging.info("Extract {0} cooccurrence relationships !".format(cpt)) - - -# STORE ID to name -geoname2name = dict(filtered["geonameid name".split()].values) - -# ENCODING NAME USING N-GRAM SPLITTING -logging.info("Encoding toponyms to ngram...") -index = NgramIndex(NGRAM_SIZE) - - # Identify all ngram available -filtered.name.apply(lambda x : index.split_and_add(x)) -if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id] - -geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association - -if args.wikipedia_cooc: - geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()}) - -# SAVE THE INDEX TO REUSE THE MODEL -index.save(INDEX_FN) - -logging.info("Done !") - - -############################################################################################# -################################# ENCODE COORDINATES ######################################## -############################################################################################# - - - -# Encode each geonames entry coordinates -geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()} -# CLEAR RAM -del filtered - - -EMBEDDING_DIM = 256 -num_words = len(index.index_ngram) # necessary for the embedding matrix - -logging.info("Preparing Input and Output data...") - - -############################################################################################# -################################# BUILD TRAIN/TEST DATASETS ################################# -############################################################################################# - -X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] 
-X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] - -for couple in rel_store: - geonameId_1,geonameId_2 = couple[0],couple[1] - if not geonameId_1 in geoname2encodedname: - continue - top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] - if geonameId_1 in train_indices: #and geonameId_2 in train_indices: - - X_1_train.append(top1) - X_2_train.append(top2) - - y_lon_train.append(geoname_vec[geonameId_1][0]) - y_lat_train.append(geoname_vec[geonameId_1][1]) - - else: - X_1_test.append(top1) - X_2_test.append(top2) - - y_lon_test.append(geoname_vec[geonameId_1][0]) - y_lat_test.append(geoname_vec[geonameId_1][1]) - -# NUMPYZE inputs and output lists -X_1_train = np.array(X_1_train) -X_2_train = np.array(X_2_train) -y_lat_train = np.array(y_lat_train) -y_lon_train = np.array(y_lon_train) - -X_1_test = np.array(X_1_test) -X_2_test = np.array(X_2_test) -y_lat_test = np.array(y_lat_test) -y_lon_test = np.array(y_lon_test) - -logging.info("Data prepared !") - - -# check for output dir -if not os.path.exists("outputs/"): - os.makedirs("outputs/") - -############################################################################################# -################################# NGRAM EMBEDDINGS ########################################## -############################################################################################# - - -logging.info("Generating N-GRAM Embedding...") -embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER) -logging.info("Embedding generated !") - -############################################################################################# -################################# MODEL DEFINITION ########################################## -############################################################################################# - - -input_1 = Input(shape=(index.max_len,)) -input_2 = Input(shape=(index.max_len,)) - -embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True) - -x1 = embedding_layer(input_1) -x2 = embedding_layer(input_2) - -#Â Each LSTM learn on a permutation of the input toponyms -x1 = Bidirectional(LSTM(98))(x1) -x2 = Bidirectional(LSTM(98))(x2) - -x = concatenate([x1,x2])#,x3]) - -x1 = Dense(500,activation="relu")(x) -# x1 = Dropout(0.3)(x1) -x1 = Dense(500,activation="relu")(x1) -# x1 = Dropout(0.3)(x1) - -x2 = Dense(500,activation="relu")(x) -# x2 = Dropout(0.3)(x2) -x2 = Dense(500,activation="relu")(x2) -# x2 = Dropout(0.3)(x2) - -output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) -output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) - - - -model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 - -model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()}) - - -############################################################################################# -################################# TRAINING LAUNCH ########################################### -############################################################################################# - -checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1, - save_best_only=True, mode='auto', period=1) - -epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv") - - -history = model.fit(x=[X_1_train,X_2_train], - y=[y_lon_train,y_lat_train], - 
verbose=True, batch_size=100, - epochs=EPOCHS, - validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]), - callbacks=[checkpoint,epoch_timer]) - - -hist_df = pd.DataFrame(history.history) -hist_df.to_csv(HISTORY_FN) - -model.save(MODEL_OUTPUT_FN) - -#Â Erase Model Checkpoint file -if os.path.exists(MODEL_OUTPUT_FN + ".part"): - import shutil - shutil.rmtree(MODEL_OUTPUT_FN + ".part") \ No newline at end of file diff --git a/combination_embeddings_baselines.py b/combination_embeddings_baselines.py deleted file mode 100644 index b4c495b280153a6b3b2c8e93c6e2509cd24504b1..0000000000000000000000000000000000000000 --- a/combination_embeddings_baselines.py +++ /dev/null @@ -1,360 +0,0 @@ -# Base module -import re -import os -import json - -#Â Structure -import pandas as pd -import numpy as np -import geopandas as gpd - -import tensorflow as tf - -#Â Geometry -from shapely.geometry import Point - -#Â Custom module -from helpers import read_geonames -from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds -from lib.ngram_index import NgramIndex -from lib.utils import ConfigurationReader -from lib.metrics import lat_accuracy,lon_accuracy - -# Logging -from tqdm import tqdm -import logging -from helpers import parse_title_wiki,EpochTimer - -logging.getLogger('gensim').setLevel(logging.WARNING) - -def get_new_ids(cooc_data,id_first_value): - """ - Return new ids from cooccurrence data - - Parameters - ---------- - cooc_data : pd.DataFrame - cooccurrence da - id_first_value : int - id beginning value - - Returns - ------- - dict - new ids for each toponyms - """ - topo_id = {} - id_ = id_first_value - for title in cooc_data.title.values: - if not title in topo_id: - id_+=1 - topo_id[id_]=title - for interlinks in cooc_data.interlinks.values: - for interlink in interlinks.split("|"): - if not interlink in topo_id: - id_+=1 - topo_id[id_]=interlink - return topo_id - -# LOGGING CONF -logging.basicConfig( - format='[%(asctime)s][%(levelname)s] %(message)s ', - datefmt='%m/%d/%Y %I:%M:%S %p', - level=logging.INFO - ) - -args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\ - .parse_args()#("-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_US_FR.txt -n 4 --ngram-word2vec-iter 10 -e 100 ../data/geonamesData/US_FR.txt ../data/geonamesData/hierarchy.txt".split()) - -# -################################################# -############# MODEL TRAINING PARAMETER ########## -################################################# -MODEL_NAME = "BASELINE" -NGRAM_SIZE = args.ngram_size -ACCURACY_TOLERANCE = args.tolerance_value -EPOCHS = args.epochs -ITER_ADJACENCY = args.adjacency_iteration -COOC_SAMPLING_NUMBER = args.cooc_sample_size -WORDVEC_ITER = args.ngram_word2vec_iter -EMBEDDING_DIM = 256 -################################################# -########## FILENAME VARIABLE #################### -################################################# -GEONAME_FN = args.geoname_input -DATASET_NAME = args.geoname_input.split("/")[-1] -GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input -REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1 -ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format( - GEONAME_FN, - ITER_ADJACENCY, - REGION_SUFFIX_FN) - -COOC_FN = args.wikipedia_cooc_fn -PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format( - GEONAME_FN.split("/")[-1], - EPOCHS, - NGRAM_SIZE, - ACCURACY_TOLERANCE, - REGION_SUFFIX_FN) - -REL_CODE="" -if args.adjacency: - PREFIX_OUTPUT_FN += "_A" - REL_CODE+= "A" -if 
args.inclusion: - PREFIX_OUTPUT_FN += "_I" - REL_CODE+= "I" -if args.wikipedia_cooc: - PREFIX_OUTPUT_FN += "_C" - REL_CODE+= "C" - -MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN) -INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN) -HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) - -from lib.utils import MetaDataSerializer - -meta_data = MetaDataSerializer( - MODEL_NAME, - DATASET_NAME, - REL_CODE, - COOC_SAMPLING_NUMBER, - ITER_ADJACENCY, - NGRAM_SIZE, - ACCURACY_TOLERANCE, - EPOCHS, - EMBEDDING_DIM, - WORDVEC_ITER, - INDEX_FN, - MODEL_OUTPUT_FN, - HISTORY_FN -) -meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN)) -print(REL_CODE) - -############################################################################################# -################################# LOAD DATA ################################################# -############################################################################################# - -# LOAD Geonames DATA -logging.info("Load Geonames data...") -geoname_data = read_geonames(GEONAME_FN).fillna("") - -train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values) -test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values) - -logging.info("Geonames data loaded!") - -# SELECT ENTRY with class == to A and P (Areas and Populated Places) -filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() #Â Only take area and populated places -#CLEAR RAM -del geoname_data - - -# IF REGION -if args.admin_code_1 != "None": - filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() - -# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS -filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD - - - -############################################################################################# -################################# RETRIEVE RELATIONSHIPS #################################### -############################################################################################# - - -# INITIALIZE RELATION STORE -rel_store = [] - -# Retrieve adjacency relationships -if args.adjacency: - logging.info("Retrieve adjacency relationships ! ") - - if not os.path.exists(ADJACENCY_REL_FILENAME): - bounds = get_bounds(filtered) # Required to get adjacency relationships - rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY)) - json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w')) - else: - logging.info("Open and load data from previous computation!") - rel_store=json.load(open(ADJACENCY_REL_FILENAME)) - - logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) - -# Retrieve inclusion relationships -if args.inclusion: - logging.info("Retrieve inclusion relationships ! ") - - cpt_rel = len(rel_store) - rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN)) - - logging.info("{0} inclusion relationships retrieved ! 
".format(len(rel_store)-cpt_rel)) - - - -if args.wikipedia_cooc: - logging.info("Load Wikipedia Cooccurrence data and merge with geonames") - - cooc_data = pd.read_csv(COOC_FN,sep="\t") - cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) - cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) - id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max()) - wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} - title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))} - cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) - filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) - train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t") - if not "title" in train_cooc_indices: - train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") - train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) - test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) - - logging.info("Merged with Geonames data !") - - # EXTRACT rel - logging.info("Extracting cooccurrence relationships") - cpt=0 - for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): - for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER): - cpt+=1 - rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) - logging.info("Extract {0} cooccurrence relationships !".format(cpt)) - - -# STORE ID to name -geoname2name = dict(filtered["geonameid name".split()].values) - -# ENCODING NAME USING N-GRAM SPLITTING -logging.info("Encoding toponyms to ngram...") -index = NgramIndex(NGRAM_SIZE) - - # Identify all ngram available -filtered.name.apply(lambda x : index.split_and_add(x)) -if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id] - -geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association - -if args.wikipedia_cooc: - geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()}) - -# SAVE THE INDEX TO REUSE THE MODEL -index.save(INDEX_FN) - -logging.info("Done !") - - -############################################################################################# -################################# ENCODE COORDINATES ######################################## -############################################################################################# - -from lib.geo import latlon2healpix - -# Encode each geonames entry coordinates -geoname_vec = {row.geonameid : latlon2healpix(row.latitude,row.longitude,128) for row in filtered.itertuples()} -# CLEAR RAM -del filtered - - -EMBEDDING_DIM = 256 -num_words = len(index.index_ngram) # necessary for the embedding matrix - -logging.info("Preparing Input and Output data...") - - -############################################################################################# -################################# BUILD TRAIN/TEST DATASETS ################################# -############################################################################################# - -X_train,y_train = [],[] -X_test,y_test = [],[] - -from joblib import 
Parallel,delayed -from tensorflow.keras.utils import to_categorical - -def parse_bow(x): - return np.sum(to_categorical(x,num_classes=index.cpt+1),axis=0) - -for couple in rel_store: - geonameId_1,geonameId_2 = couple[0],couple[1] - if not geonameId_1 in geoname2encodedname: - continue - top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] - if geonameId_1 in train_indices: #and geonameId_2 in train_indices: - - X_train.append(top1 + top2) - y_train.append(geoname_vec[geonameId_1]) - - else: - X_test.append(top1 + top2) - y_test.append(geoname_vec[geonameId_1]) - -# NUMPYZE inputs and output lists -X_train = Parallel(n_jobs=4,backend="multiprocessing")(delayed(parse_bow)(x) for x in tqdm(X_train)) -X_train = np.array(X_train) -y_train = np.array(y_train) - -X_test = Parallel(n_jobs=4,backend="multiprocessing")(delayed(parse_bow)(x) for x in tqdm(X_test)) -X_test = np.array(X_test) -y_test = np.array(y_test) - -logging.info("Data prepared !") - - -# check for output dir -if not os.path.exists("outputs/"): - os.makedirs("outputs/") - -from scipy.sparse import csr_matrix -from sklearn import svm -from sklearn.naive_bayes import GaussianNB,MultinomialNB -from sklearn.metrics import classification_report -from sklearn import tree -from sklearn.ensemble import RandomForestClassifier - -X_train = csr_matrix(X_train) -X_test = csr_matrix(X_test) - -print(REL_CODE) -oupt = open("log_baseline_US_FR_{0}.txt".format(REL_CODE),'a') -oupt.write("------") - -from joblib import dump -import sys -f=True - -for kernel in ["rbf","linear","poly"]: - clf = svm.SVC(kernel=kernel) - clf.fit(X_train,y_train) - if kernel =="linear" and f: - dump(clf,"SVMLINEAR_US_FR_{0}.bin".format(REL_CODE)) - sys.exit() - y_pred = clf.predict(X_test) - oupt.write("Results for : "+"SVM with the kernel "+kernel) - oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"])) - oupt.flush() - -for alg in (GaussianNB,MultinomialNB): - clf = alg() - clf.fit(X_train.toarray(),y_train) - y_pred = clf.predict(X_test.toarray()) - oupt.write("Results for : "+"NaiveBayes with the alg "+alg.__name__) - oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"])+"\n") - oupt.flush() - -clf = tree.DecisionTreeClassifier() -clf.fit(X_train,y_train) -y_pred = clf.predict(X_test) -oupt.write("Results for : "+"Decision Tree classifier") -oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"])) -oupt.flush() - -clf = RandomForestClassifier(max_depth=8, random_state=0) -clf.fit(X_train,y_train) -y_pred = clf.predict(X_test) -oupt.write("Results for : "+"Random Forest classifier") -oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"])) -oupt.flush() - -oupt.close() \ No newline at end of file diff --git a/combination_embeddings_word.py b/combination_embeddings_word.py deleted file mode 100644 index 762780d19ef97e260ead434196fad1e302787a70..0000000000000000000000000000000000000000 --- a/combination_embeddings_word.py +++ /dev/null @@ -1,390 +0,0 @@ -# Base module -import re -import os -import json - -#Â Structure -import pandas as pd -import numpy as np -import geopandas as gpd - -#Â DEEPL module -from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout -from keras.models import Model -from keras import backend as K -from keras.callbacks import ModelCheckpoint - -import tensorflow as tf - -#Â Geometry -from shapely.geometry import Point - -#Â Custom module -from helpers import read_geonames 
-from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds -from lib.ngram_index import NgramIndex -from lib.word_index import WordIndex -from lib.utils import ConfigurationReader -from lib.metrics import lat_accuracy,lon_accuracy - -# Logging -from tqdm import tqdm -import logging -from helpers import parse_title_wiki,EpochTimer - -logging.getLogger('gensim').setLevel(logging.WARNING) - -def get_new_ids(cooc_data,id_first_value): - """ - Return new ids from cooccurrence data - - Parameters - ---------- - cooc_data : pd.DataFrame - cooccurrence da - id_first_value : int - id beginning value - - Returns - ------- - dict - new ids for each toponyms - """ - topo_id = {} - id_ = id_first_value - for title in cooc_data.title.values: - if not title in topo_id: - id_+=1 - topo_id[id_]=title - for interlinks in cooc_data.interlinks.values: - for interlink in interlinks.split("|"): - if not interlink in topo_id: - id_+=1 - topo_id[id_]=interlink - return topo_id - -# LOGGING CONF -logging.basicConfig( - format='[%(asctime)s][%(levelname)s] %(message)s ', - datefmt='%m/%d/%Y %I:%M:%S %p', - level=logging.INFO - ) - -args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\ - .parse_args()#("-w --wikipedia-cooc-fn subsetCoocALL.csv ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split()) - -# -################################################# -############# MODEL TRAINING PARAMETER ########## -################################################# -MODEL_NAME = "Bi-LSTM_WORD" -NGRAM_SIZE = args.ngram_size -ACCURACY_TOLERANCE = args.tolerance_value -EPOCHS = args.epochs -ITER_ADJACENCY = args.adjacency_iteration -COOC_SAMPLING_NUMBER = args.cooc_sample_size -WORDVEC_ITER = args.ngram_word2vec_iter -EMBEDDING_DIM = 256 - - -################################################# -########## FILENAME VARIABLE #################### -################################################# - -GEONAME_FN = args.geoname_input -DATASET_NAME = args.geoname_input.split("/")[-1] -GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input -REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1 -ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format( - GEONAME_FN, - ITER_ADJACENCY, - REGION_SUFFIX_FN) - -COOC_FN = args.wikipedia_cooc_fn -PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}_{5}".format(MODEL_NAME, - GEONAME_FN.split("/")[-1], - EPOCHS, - NGRAM_SIZE, - ACCURACY_TOLERANCE, - REGION_SUFFIX_FN) - -REL_CODE="" -if args.adjacency: - PREFIX_OUTPUT_FN += "_A" - REL_CODE+= "A" -if args.inclusion: - PREFIX_OUTPUT_FN += "_I" - REL_CODE+= "I" -if args.wikipedia_cooc: - PREFIX_OUTPUT_FN += "_C" - REL_CODE+= "C" - -MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN) -INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN) -HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) - -from lib.utils import MetaDataSerializer - -meta_data = MetaDataSerializer( - MODEL_NAME, - DATASET_NAME, - REL_CODE, - COOC_SAMPLING_NUMBER, - ITER_ADJACENCY, - NGRAM_SIZE, - ACCURACY_TOLERANCE, - EPOCHS, - EMBEDDING_DIM, - WORDVEC_ITER, - INDEX_FN, - MODEL_OUTPUT_FN, - HISTORY_FN -) -meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN)) - -############################################################################################# -################################# LOAD DATA ################################################# -############################################################################################# - -# LOAD 
Geonames DATA -logging.info("Load Geonames data...") -geoname_data = read_geonames(GEONAME_FN).fillna("") - -train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values) -test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values) - -logging.info("Geonames data loaded!") - -# SELECT ENTRY with class == to A and P (Areas and Populated Places) -filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() #Â Only take area and populated places -#CLEAR RAM -del geoname_data - - -# IF REGION -if args.admin_code_1 != "None": - filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() - -# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS -filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD - - - -############################################################################################# -################################# RETRIEVE RELATIONSHIPS #################################### -############################################################################################# - - -# INITIALIZE RELATION STORE -rel_store = [] - -# Retrieve adjacency relationships -if args.adjacency: - logging.info("Retrieve adjacency relationships ! ") - - if not os.path.exists(ADJACENCY_REL_FILENAME): - bounds = get_bounds(filtered) # Required to get adjacency relationships - rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY)) - json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w')) - else: - logging.info("Open and load data from previous computation!") - rel_store=json.load(open(ADJACENCY_REL_FILENAME)) - - logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) - -# Retrieve inclusion relationships -if args.inclusion: - logging.info("Retrieve inclusion relationships ! ") - - cpt_rel = len(rel_store) - rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN)) - - logging.info("{0} inclusion relationships retrieved ! 
".format(len(rel_store)-cpt_rel)) - - - -if args.wikipedia_cooc: - logging.info("Load Wikipedia Cooccurrence data and merge with geonames") - - cooc_data = pd.read_csv(COOC_FN,sep="\t") - cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) - cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) - id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max()) - wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} - title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))} - cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) - filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) - train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t") - if not "title" in train_cooc_indices: - train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") - train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) - test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) - - logging.info("Merged with Geonames data !") - - # EXTRACT rel - logging.info("Extracting cooccurrence relationships") - cpt=0 - for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): - for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER): - cpt+=1 - rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) - logging.info("Extract {0} cooccurrence relationships !".format(cpt)) - - -# STORE ID to name -geoname2name = dict(filtered["geonameid name".split()].values) - -# ENCODING NAME USING N-GRAM SPLITTING -logging.info("Encoding toponyms to ngram...") -index = WordIndex() - - # Identify all ngram available -filtered.name.apply(lambda x : index.split_and_add(x)) -if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id] - -geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association - -if args.wikipedia_cooc: - geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()}) - -# SAVE THE INDEX TO REUSE THE MODEL -index.save(INDEX_FN) - -logging.info("Done !") - - -############################################################################################# -################################# ENCODE COORDINATES ######################################## -############################################################################################# - - - -# Encode each geonames entry coordinates -geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()} -# CLEAR RAM -del filtered - - -EMBEDDING_DIM = 256 -num_words = len(index.index_word) # necessary for the embedding matrix - -logging.info("Preparing Input and Output data...") - - -############################################################################################# -################################# BUILD TRAIN/TEST DATASETS ################################# -############################################################################################# - -X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] -X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] - -for 
couple in rel_store: - geonameId_1,geonameId_2 = couple[0],couple[1] - if not geonameId_1 in geoname2encodedname: - continue - top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] - if geonameId_1 in train_indices: #and geonameId_2 in train_indices: - - X_1_train.append(top1) - X_2_train.append(top2) - - y_lon_train.append(geoname_vec[geonameId_1][0]) - y_lat_train.append(geoname_vec[geonameId_1][1]) - - else: - X_1_test.append(top1) - X_2_test.append(top2) - - y_lon_test.append(geoname_vec[geonameId_1][0]) - y_lat_test.append(geoname_vec[geonameId_1][1]) - -# NUMPYZE inputs and output lists -X_1_train = np.array(X_1_train) -X_2_train = np.array(X_2_train) -y_lat_train = np.array(y_lat_train) -y_lon_train = np.array(y_lon_train) - -X_1_test = np.array(X_1_test) -X_2_test = np.array(X_2_test) -y_lat_test = np.array(y_lat_test) -y_lon_test = np.array(y_lon_test) - -logging.info("Data prepared !") - - -# check for output dir -if not os.path.exists("outputs/"): - os.makedirs("outputs/") - -############################################################################################# -################################# NGRAM EMBEDDINGS ########################################## -############################################################################################# - - -logging.info("Generating N-GRAM Embedding...") -embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER) -logging.info("Embedding generated !") - -############################################################################################# -################################# MODEL DEFINITION ########################################## -############################################################################################# - - -input_1 = Input(shape=(index.max_len,)) -input_2 = Input(shape=(index.max_len,)) - -embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True) - -x1 = embedding_layer(input_1) -x2 = embedding_layer(input_2) - -#Â Each LSTM learn on a permutation of the input toponyms -x1 = Bidirectional(LSTM(98))(x1) -x2 = Bidirectional(LSTM(98))(x2) - -x = concatenate([x1,x2])#,x3]) - -x1 = Dense(500,activation="relu")(x) -# x1 = Dropout(0.3)(x1) -x1 = Dense(500,activation="relu")(x1) -# x1 = Dropout(0.3)(x1) - -x2 = Dense(500,activation="relu")(x) -# x2 = Dropout(0.3)(x2) -x2 = Dense(500,activation="relu")(x2) -# x2 = Dropout(0.3)(x2) - -output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) -output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) - -model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 - -model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()}) - - -############################################################################################# -################################# TRAINING LAUNCH ########################################### -############################################################################################# - -checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1, - save_best_only=True, mode='auto', period=1) - -epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv") - -history = model.fit(x=[X_1_train,X_2_train], - y=[y_lon_train,y_lat_train], - verbose=True, batch_size=100, - epochs=EPOCHS, - 
validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]), - callbacks=[checkpoint,epoch_timer]) - - -hist_df = pd.DataFrame(history.history) -hist_df.to_csv(HISTORY_FN) - -model.save(MODEL_OUTPUT_FN) - -#Â Erase Model Checkpoint file -if os.path.exists(MODEL_OUTPUT_FN + ".part"): - os.remove(MODEL_OUTPUT_FN + ".part") \ No newline at end of file diff --git a/combination_embeddingsv3inverse.py b/combination_embeddingsv3inverse.py deleted file mode 100644 index fe9940fa145eb65f4d1f7f7981c96c6a6bda48b3..0000000000000000000000000000000000000000 --- a/combination_embeddingsv3inverse.py +++ /dev/null @@ -1,401 +0,0 @@ -# Base module -import re -import os -import json - -#Â Structure -import pandas as pd -import numpy as np -import geopandas as gpd - -#Â DEEPL module -from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout -from keras.models import Model -from keras import backend as K -from keras.callbacks import ModelCheckpoint - -import tensorflow as tf - -#Â Geometry -from shapely.geometry import Point - -#Â Custom module -from helpers import read_geonames -from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds -from lib.ngram_index import NgramIndex -from lib.utils import ConfigurationReader -from lib.metrics import lat_accuracy,lon_accuracy -from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle - - -# Logging -from tqdm import tqdm -import logging -from helpers import parse_title_wiki,EpochTimer - -logging.getLogger('gensim').setLevel(logging.WARNING) - -def get_new_ids(cooc_data,id_first_value): - """ - Return new ids from cooccurrence data - - Parameters - ---------- - cooc_data : pd.DataFrame - cooccurrence da - id_first_value : int - id beginning value - - Returns - ------- - dict - new ids for each toponyms - """ - topo_id = {} - id_ = id_first_value - for title in cooc_data.title.values: - if not title in topo_id: - id_+=1 - topo_id[id_]=title - for interlinks in cooc_data.interlinks.values: - for interlink in interlinks.split("|"): - if not interlink in topo_id: - id_+=1 - topo_id[id_]=interlink - return topo_id - -# LOGGING CONF -logging.basicConfig( - format='[%(asctime)s][%(levelname)s] %(message)s ', - datefmt='%m/%d/%Y %I:%M:%S %p', - level=logging.INFO - ) - -args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\ - .parse_args()#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split()) - -# -################################################# -############# MODEL TRAINING PARAMETER ########## -################################################# -MODEL_NAME = "Bi-LSTM_NGRAM" -NGRAM_SIZE = args.ngram_size -ACCURACY_TOLERANCE = 50#args.tolerance_value -EPOCHS = args.epochs -ITER_ADJACENCY = args.adjacency_iteration -COOC_SAMPLING_NUMBER = args.cooc_sample_size -WORDVEC_ITER = args.ngram_word2vec_iter -EMBEDDING_DIM = 256 -################################################# -########## FILENAME VARIABLE #################### -################################################# -GEONAME_FN = args.geoname_input -DATASET_NAME = args.geoname_input.split("/")[-1] -GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input -REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1 -ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format( - GEONAME_FN, - ITER_ADJACENCY, - REGION_SUFFIX_FN) - -COOC_FN = args.wikipedia_cooc_fn 
-PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format( - GEONAME_FN.split("/")[-1], - EPOCHS, - NGRAM_SIZE, - ACCURACY_TOLERANCE, - REGION_SUFFIX_FN) - -REL_CODE="" -if args.adjacency: - PREFIX_OUTPUT_FN += "_A" - REL_CODE+= "A" -if args.inclusion: - PREFIX_OUTPUT_FN += "_I" - REL_CODE+= "I" -if args.wikipedia_cooc: - PREFIX_OUTPUT_FN += "_C" - REL_CODE+= "C" - -MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN) -INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN) -HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) - -from lib.utils import MetaDataSerializer - -meta_data = MetaDataSerializer( - MODEL_NAME, - DATASET_NAME, - REL_CODE, - COOC_SAMPLING_NUMBER, - ITER_ADJACENCY, - NGRAM_SIZE, - ACCURACY_TOLERANCE, - EPOCHS, - EMBEDDING_DIM, - WORDVEC_ITER, - INDEX_FN, - MODEL_OUTPUT_FN, - HISTORY_FN -) -meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN)) - -############################################################################################# -################################# LOAD DATA ################################################# -############################################################################################# - -# LOAD Geonames DATA -logging.info("Load Geonames data...") -geoname_data = read_geonames(GEONAME_FN).fillna("") - -train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values) -test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values) - -logging.info("Geonames data loaded!") - -# SELECT ENTRY with class == to A and P (Areas and Populated Places) -filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() #Â Only take area and populated places -#CLEAR RAM -del geoname_data - - -# IF REGION -if args.admin_code_1 != "None": - filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() - -# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS -filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD - - - -############################################################################################# -################################# RETRIEVE RELATIONSHIPS #################################### -############################################################################################# - - -# INITIALIZE RELATION STORE -rel_store = [] - -# Retrieve adjacency relationships -if args.adjacency: - logging.info("Retrieve adjacency relationships ! ") - - if not os.path.exists(ADJACENCY_REL_FILENAME): - bounds = get_bounds(filtered) # Required to get adjacency relationships - rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY)) - json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w')) - else: - logging.info("Open and load data from previous computation!") - rel_store=json.load(open(ADJACENCY_REL_FILENAME)) - - logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) - -# Retrieve inclusion relationships -if args.inclusion: - logging.info("Retrieve inclusion relationships ! ") - - cpt_rel = len(rel_store) - rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN)) - - logging.info("{0} inclusion relationships retrieved ! 
".format(len(rel_store)-cpt_rel)) - - - -if args.wikipedia_cooc: - logging.info("Load Wikipedia Cooccurrence data and merge with geonames") - - cooc_data = pd.read_csv(COOC_FN,sep="\t") - cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) - cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) - id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max()) - wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} - title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))} - cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) - filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) - train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t") - if not "title" in train_cooc_indices: - train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") - train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) - test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) - - logging.info("Merged with Geonames data !") - - # EXTRACT rel - logging.info("Extracting cooccurrence relationships") - cpt=0 - for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): - for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER): - cpt+=1 - rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) - logging.info("Extract {0} cooccurrence relationships !".format(cpt)) - - -# STORE ID to name -geoname2name = dict(filtered["geonameid name".split()].values) - -# ENCODING NAME USING N-GRAM SPLITTING -logging.info("Encoding toponyms to ngram...") -index = NgramIndex(NGRAM_SIZE) - - # Identify all ngram available -filtered.name.apply(lambda x : index.split_and_add(x)) -if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id] - -geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association - -if args.wikipedia_cooc: - geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()}) - -# SAVE THE INDEX TO REUSE THE MODEL -index.save(INDEX_FN) - -logging.info("Done !") - - -############################################################################################# -################################# ENCODE COORDINATES ######################################## -############################################################################################# - - - -# Encode each geonames entry coordinates -geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()} -# CLEAR RAM -del filtered - - -EMBEDDING_DIM = 256 -num_words = len(index.index_ngram) # necessary for the embedding matrix - -logging.info("Preparing Input and Output data...") - - -############################################################################################# -################################# BUILD TRAIN/TEST DATASETS ################################# -############################################################################################# - -X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] 
-X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] -y_train,y_test = [],[] - -for couple in rel_store: - geonameId_1,geonameId_2 = couple[0],couple[1] - if not geonameId_1 in geoname2encodedname: - continue - top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] - if geonameId_1 in train_indices: #and geonameId_2 in train_indices: - - X_1_train.append(top1) - X_2_train.append(top2) - - y_train.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]]) - #y_lon_train.append(geoname_vec[geonameId_1][0]) - #y_lat_train.append(geoname_vec[geonameId_1][1]) - - else: - X_1_test.append(top2) - X_2_test.append(top1) - - y_test.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]]) - #y_lon_test.append(geoname_vec[geonameId_1][0]) - #y_lat_test.append(geoname_vec[geonameId_1][1]) - -# NUMPYZE inputs and output lists -X_1_train = np.array(X_1_train) -X_2_train = np.array(X_2_train) -y_lat_train = np.array(y_lat_train) -y_lon_train = np.array(y_lon_train) -y_train = np.array(y_train) - -X_1_test = np.array(X_1_test) -X_2_test = np.array(X_2_test) -y_lat_test = np.array(y_lat_test) -y_lon_test = np.array(y_lon_test) -y_test = np.array(y_test) - -logging.info("Data prepared !") - - -# check for output dir -if not os.path.exists("outputs/"): - os.makedirs("outputs/") - -############################################################################################# -################################# NGRAM EMBEDDINGS ########################################## -############################################################################################# - - -logging.info("Generating N-GRAM Embedding...") -embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER) -logging.info("Embedding generated !") - -############################################################################################# -################################# MODEL DEFINITION ########################################## -############################################################################################# - - -input_1 = Input(shape=(index.max_len,)) -input_2 = Input(shape=(index.max_len,)) - -embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True) - -x1 = embedding_layer(input_1) -x2 = embedding_layer(input_2) - -#Â Each LSTM learn on a permutation of the input toponyms -x1 = Bidirectional(LSTM(98))(x1) -x2 = Bidirectional(LSTM(98))(x2) - -x = concatenate([x1,x2])#,x3]) - -x1 = Dense(500,activation="relu")(x) -# x1 = Dropout(0.3)(x1) -x1 = Dense(500,activation="relu")(x1) -# x1 = Dropout(0.3)(x1) - -x2 = Dense(500,activation="relu")(x) -# x2 = Dropout(0.3)(x2) -x2 = Dense(500,activation="relu")(x2) -# x2 = Dropout(0.3)(x2) - -output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) -output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) - -output_coord = concatenate([output_lon,output_lat],name="output_coord") - -model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3 - -model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)}) - -# model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 - -# model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()}) - - 
-############################################################################################# -################################# TRAINING LAUNCH ########################################### -############################################################################################# - -checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1, - save_best_only=True, mode='auto', period=1) - -epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv") - - -history = model.fit(x=[X_1_train,X_2_train], - y=y_train,#[y_lon_train,y_lat_train], - verbose=True, batch_size=100, - epochs=EPOCHS, - validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]), - callbacks=[checkpoint,epoch_timer]) - - -hist_df = pd.DataFrame(history.history) -hist_df.to_csv(HISTORY_FN) - -model.save(MODEL_OUTPUT_FN) - -#Â Erase Model Checkpoint file -if os.path.exists(MODEL_OUTPUT_FN + ".part"): - import shutil - shutil.rmtree(MODEL_OUTPUT_FN + ".part") \ No newline at end of file diff --git a/desamb_eval.py b/desamb_eval.py index 34f04ad63ff99ef6ec0a9c9611fd1368d77a670e..8fcd5febb04f7df4938dd5634949f15fc6ac3ce0 100644 --- a/desamb_eval.py +++ b/desamb_eval.py @@ -21,7 +21,7 @@ if not args.gpu: from predict_toponym_coordinates import Geocoder -from lib.geo import haversine_pd +from lib.utils_geo import haversine_pd logging.getLogger("tensorflow").setLevel(logging.CRITICAL) logging.getLogger("tensorflow_hub").setLevel(logging.CRITICAL) diff --git a/generate_dataset.py b/generate_dataset.py index 4e873df5bede9c49f60f427fa1f681af8f0885ab..75713f0bf799bfbe2c69c44b4cda603258ba0c86 100644 --- a/generate_dataset.py +++ b/generate_dataset.py @@ -4,7 +4,7 @@ import pandas as pd import numpy as np from helpers import read_geonames -from lib.geo import latlon2healpix +from lib.utils_geo import latlon2healpix from tqdm import tqdm from sklearn.model_selection import train_test_split diff --git a/server.py b/geocoder_app.py similarity index 100% rename from server.py rename to geocoder_app.py diff --git a/lib/data_generator.py b/lib/data_generator.py index 58cbad4a3bf095e52772cbcea3168e5860d8489d..7eae387e651be140976239112403b9b1a47e855c 100644 --- a/lib/data_generator.py +++ b/lib/data_generator.py @@ -6,7 +6,7 @@ from keras.utils import to_categorical import numpy as np import pandas as pd -from .geo import zero_one_encoding +from .utils_geo import zero_one_encoding from helpers import parse_title_wiki,read_geonames from gensim.models.keyedvectors import KeyedVectors diff --git a/lib/data_generatorv3.py b/lib/data_generatorv3.py index cd17cdba4fafea2ab43f49fa778d88374b66dbe9..9bd88db97206808f1bc10db7b5c9a3e182de194b 100644 --- a/lib/data_generatorv3.py +++ b/lib/data_generatorv3.py @@ -6,7 +6,7 @@ from keras.utils import to_categorical import numpy as np import pandas as pd -from .geo import zero_one_encoding +from .utils_geo import zero_one_encoding from helpers import parse_title_wiki,read_geonames from gensim.models.keyedvectors import KeyedVectors diff --git a/lib/datageneratorv4.py b/lib/datageneratorv4.py index f413a63b57783bc8dcd01beffac72870c94fd320..c1093b20209f1315143d29b574fe8008f058e68b 100644 --- a/lib/datageneratorv4.py +++ b/lib/datageneratorv4.py @@ -6,7 +6,7 @@ from keras.utils import to_categorical import numpy as np import pandas as pd -from .geo import zero_one_encoding +from .utils_geo import zero_one_encoding from helpers import parse_title_wiki,read_geonames from gensim.models.keyedvectors import KeyedVectors diff --git a/lib/geocoder/__init__.py 
b/lib/geocoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lib/bert_geocoder.py b/lib/geocoder/bert_geocoder.py similarity index 95% rename from lib/bert_geocoder.py rename to lib/geocoder/bert_geocoder.py index d2305a52e8c74ca57d86ed2db37a418165f8176e..e74b7b929b4afd984a2bee931e7ed3c7685c7a26 100644 --- a/lib/bert_geocoder.py +++ b/lib/geocoder/bert_geocoder.py @@ -20,8 +20,8 @@ from transformers import BertTokenizer from transformers import BertForSequenceClassification, AdamW, BertConfig from transformers import get_linear_schedule_with_warmup -from lib.torch_generator import SentenceDataset -from lib.geo import latlon2healpix,healpix2latlon +from ..torch_generator import SentenceDataset +from ..utils_geo import latlon2healpix,healpix2latlon import pickle diff --git a/lib/geocoder.py b/lib/geocoder/our_geocoder.py similarity index 99% rename from lib/geocoder.py rename to lib/geocoder/our_geocoder.py index 99f6a0ab7da1b25f8ff88a187555e5e72100b5f3..0cd85204968f0848febb662cdb32691bc70ac5d1 100644 --- a/lib/geocoder.py +++ b/lib/geocoder/our_geocoder.py @@ -15,7 +15,7 @@ from tensorflow.python.keras.models import load_model # CUSTOM LIB from lib.word_index import WordIndex from lib.ngram_index import NgramIndex -from lib.geo import haversine_tf_1circle +from lib.utils_geo import haversine_tf_1circle import stanza import spacy diff --git a/svm_predict_hp.py b/lib/geocoder/svm_geocoder.py similarity index 88% rename from svm_predict_hp.py rename to lib/geocoder/svm_geocoder.py index c836859e186a468189b297a1a185e2f1e387880f..2997b2966ccaec79b53849e2a4254f8625c7d873 100644 --- a/svm_predict_hp.py +++ b/lib/geocoder/svm_geocoder.py @@ -3,7 +3,7 @@ import numpy as np from joblib import dump, load from tensorflow.keras.utils import to_categorical -from lib.geo import latlon2healpix +from lib.utils_geo import latlon2healpix from lib.ngram_index import NgramIndex @@ -14,7 +14,7 @@ def is_in(lat,lon,hp_predicted,hp_nside): hp_truth = latlon2healpix(lat,lon,hp_nside) return hp_truth == hp_predicted -class HealpixGeocoder(object): +class SVMGeocoder(object): def __init__(self,model_fn,ngram_index_filename): self.model = load(model_fn) @@ -32,5 +32,5 @@ class HealpixGeocoder(object): vecs += np.array([ parse_bow(np.array(self.ng_index.encode(ph)),self.ng_index) for ph in phrases2 if ph]) return self.model.predict(vecs) -hp = HealpixGeocoder("SVMLINEAR_US_FR_AC.bin","outputs/US_FR.txt_100_4_0.002__A_C_index") +hp = SVMGeocoder("SVMLINEAR_US_FR_AC.bin", "../../outputs/US_FR.txt_100_4_0.002__A_C_index") hp.geocode("paris","montpellier") diff --git a/lib/geo.py b/lib/utils_geo.py similarity index 100% rename from lib/geo.py rename to lib/utils_geo.py diff --git a/predict_toponym_coordinates.py b/predict_toponym_coordinates.py index 6ed64582e6aec435c9c06bbcbfdcf37d12fa2959..ec5d967b0a4f4c5f473147aa96bb66c7bc3737cf 100644 --- a/predict_toponym_coordinates.py +++ b/predict_toponym_coordinates.py @@ -10,7 +10,7 @@ from tensorflow.python.keras.backend import set_session from tensorflow.python.keras.models import load_model -from lib.geo import haversine_tf_1circle +from lib.utils_geo import haversine_tf_1circle sess = None graph = None diff --git a/region_model.py b/region_model.py index 906a422ba6ac0d8ecd4003325f8462f6fb70ec44..b7a66738ccc07b3403da4da5772b55afa7c816c5 100644 --- a/region_model.py +++ b/region_model.py @@ -18,7 +18,7 @@ from lib.ngram_index import NgramIndex from lib.utils import ConfigurationReader, 
 from lib.metrics import lat_accuracy,lon_accuracy
 from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
-from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
+from lib.utils_geo import haversine_tf,accuracy_k,haversine_tf_1circle
 
 # Logging
 import logging
diff --git a/run_train.py b/run_multiple_configurations.py
similarity index 100%
rename from run_train.py
rename to run_multiple_configurations.py
diff --git a/scripts/embeddingngram.py b/scripts/embeddingngram.py
index ac5ec02210db62d51d351c071c158671ad935703..a9773eb58578780cef70eb28dd2315f9f61fbd97 100644
--- a/scripts/embeddingngram.py
+++ b/scripts/embeddingngram.py
@@ -3,7 +3,7 @@
 
 
 from lib.ngram_index import NgramIndex
-from lib.geo import read_geonames
+from lib.utils_geo import read_geonames
 
 
diff --git a/scripts/get_all_adjacency_rel.py b/scripts/get_all_adjacency_rel.py
index 66fad5af9867cd1fb73404b38f93471a3cf9a622..23382a6a79b5f75594ca640169afeaa4fcedee9a 100644
--- a/scripts/get_all_adjacency_rel.py
+++ b/scripts/get_all_adjacency_rel.py
@@ -4,7 +4,7 @@ from helpers import read_geonames
 from tqdm import tqdm
 from joblib import Parallel,delayed
 import geopandas as gpd
-from lib.geo import Grid,haversine_pd
+from lib.utils_geo import Grid,haversine_pd
 import matplotlib.pyplot as plt
 import argparse
 
diff --git a/scripts/randoludo.py b/scripts/randoludo.py
index b933c9c5136bf6eed2c6497df5d175fcc6f6c767..3862d526b15f9d337f791cefa4e81e037c64682f 100644
--- a/scripts/randoludo.py
+++ b/scripts/randoludo.py
@@ -45,6 +45,6 @@ for ix,group in df.groupby("filename"):
 dd = pd.DataFrame(results_fin).rename(columns={"lat":"lat_pred","lon":"lon_pred"})
 df2 = pd.concat((df,dd),axis=1)
 
-from lib.geo import haversine_pd
+from lib.utils_geo import haversine_pd
 df2["dist_error"] = haversine_pd(df2.longitude,df2.latitude,df2.lon_pred,df2.lat_pred)
 print(df2.dist_error.mean())
diff --git a/bert.py b/train_bert_geocoder.py
similarity index 87%
rename from bert.py
rename to train_bert_geocoder.py
index df3e1349a918ba82583202d6429fb27815b9b301..7dfa289e8cc410d55d722b1016f99f7248a1cd3c 100644
--- a/bert.py
+++ b/train_bert_geocoder.py
@@ -102,62 +102,17 @@ else:
     print('Loading {0} tokenizer...'.format("bert-base-multilingual-cased"))
     tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',do_lower_case=False)
 
-"""
-print("Tokenize Input Data")
-df_train["input_ids"] = df_train.sentence.progress_apply(lambda x: tokenizer.encode(x,add_special_tokens = True))
-df_test["input_ids"] = df_test.sentence.progress_apply(lambda x: tokenizer.encode(x,add_special_tokens = True))
-
-
-# Set the maximum sequence length.
-# took the size of the largest sentence
-MAX_LEN = df_train.input_ids.apply(len).max()+2
-
-print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
-print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))
-
-
-df_train["input_ids"] = pad_sequences(df_train.input_ids.values, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post").tolist()
-df_test["input_ids"] = pad_sequences(df_test.input_ids.values, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post").tolist()
-
-df_train["attention_mask"] = df_train.input_ids.apply(lambda x: [int(token_id > 0) for token_id in x] )
-df_test["attention_mask"] = df_test.input_ids.apply(lambda x: [int(token_id > 0) for token_id in x])
-
-train_inputs = torch.tensor(np.array(df_train.input_ids.values.tolist()))
-del df_train["input_ids"]
-validation_inputs = torch.tensor(np.array(df_test.input_ids.values.tolist()))
-del df_test["input_ids"]
-
-train_labels = torch.tensor(np.array(df_train.label.values.tolist()))
-del df_train["label"]
-validation_labels = torch.tensor(np.array(df_test.label.values.tolist()))
-del df_test["label"]
-
-train_masks = torch.tensor(np.array(df_train.attention_mask.values.tolist()))
-del df_train["attention_mask"]
-validation_masks = torch.tensor(np.array(df_test.attention_mask.values.tolist()))
-del df_test["attention_mask"]
-"""
-
 from lib.torch_generator import SentenceDataset
 # Create the DataLoader for training set.
 train_data = SentenceDataset(df_train,tokenizer,batch_size=batch_size)
-#train_sampler = RandomSampler(train_data)
 train_dataloader = DataLoader(train_data, batch_size=batch_size)#,sampler=train_sampler,)
-"""
-del train_inputs
-del train_masks
-del train_labels
-"""
+
 # Create the DataLoader for validation set.
 validation_data = SentenceDataset(df_test,tokenizer,batch_size=batch_size)
 #validation_sampler = SequentialSampler(validation_data)
 validation_dataloader = DataLoader(validation_data, batch_size=batch_size)#, sampler=validation_sampler)
-"""
-del validation_inputs
-del validation_masks
-del validation_labels
-"""
-# Load BertForSequenceClassification, the pretrained BERT model with a single
+
+# Load BertForSequenceClassification, the pretrained BERT model with a single 
 # linear classification layer on top.
 model = BertForSequenceClassification.from_pretrained(
     "bert-base-multilingual-cased", # Use the 12-layer BERT model, with an uncased vocab.
diff --git a/combination_embeddingsv3.py b/train_geocoder.py
similarity index 98%
rename from combination_embeddingsv3.py
rename to train_geocoder.py
index b1d3745490405c7bf491cc574c0fa7a5935278a3..71ff546eba853c4fdbf0c8eeff263021df737c15 100644
--- a/combination_embeddingsv3.py
+++ b/train_geocoder.py
@@ -21,11 +21,11 @@ from shapely.geometry import Point
 
 # Custom module
 from helpers import read_geonames
-from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
+from lib.utils_geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
 from lib.ngram_index import NgramIndex
 from lib.utils import ConfigurationReader
 from lib.metrics import lat_accuracy,lon_accuracy
-from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
+from lib.utils_geo import haversine_tf,accuracy_k,haversine_tf_1circle
 
 
 # Logging
diff --git a/combination_embeddingsv4.py b/train_geocoder_v2.py
similarity index 98%
rename from combination_embeddingsv4.py
rename to train_geocoder_v2.py
index c967f6fdad9ce2534d8255edcf65949affdcdfdc..f1b1ddfd1b21ae9424a6c8dd671446b696d6164d 100644
--- a/combination_embeddingsv4.py
+++ b/train_geocoder_v2.py
@@ -12,10 +12,10 @@ from keras.models import Model
 from keras.callbacks import ModelCheckpoint
 
 # Custom module
-from lib.geo import zero_one_encoding
+from lib.utils_geo import zero_one_encoding
 from lib.ngram_index import NgramIndex
 from lib.utils import ConfigurationReader
-from lib.geo import accuracy_k,haversine_tf_1circle
+from lib.utils_geo import accuracy_k,haversine_tf_1circle
 from helpers import EpochTimer
 
 # Logging
diff --git a/train_test_split_cooccurrence_data.py b/train_test_split_cooccurrence_data.py
index 4438aaf6a210b6b95d442a87e6866e833397a175..47fb607a8b2a64eade2da20d1a3de7159fff5f18 100644
--- a/train_test_split_cooccurrence_data.py
+++ b/train_test_split_cooccurrence_data.py
@@ -14,7 +14,7 @@ logging.basicConfig(
 from sklearn.model_selection import train_test_split
 from shapely.geometry import Point
 
-from lib.geo import latlon2healpix
+from lib.utils_geo import latlon2healpix
 
 from tqdm import tqdm
 
diff --git a/train_test_split_geonames.py b/train_test_split_geonames.py
index 6098c50cbf8f56f2f123c8cc68aaecbe2b105e49..9aaf44907cbe713af2570c4256ad27d447c6ad97 100644
--- a/train_test_split_geonames.py
+++ b/train_test_split_geonames.py
@@ -13,7 +13,7 @@ logging.basicConfig(
 from sklearn.model_selection import train_test_split
 
-from lib.geo import latlon2healpix
+from lib.utils_geo import latlon2healpix
 from helpers import read_geonames
 from tqdm import tqdm