From e7c14b5d417c5782c93f88d8e33c964a5bf9b24c Mon Sep 17 00:00:00 2001
From: Jacques Fize <jacques.fize@insa-lyon.fr>
Date: Wed, 23 Sep 2020 16:50:48 +0200
Subject: [PATCH] UPD

---
 combination_embeddingsv4.py                    | 309 ++++--------------
 generate_dataset.py                            | 148 +++++++++
 lib/datageneratorv4.py                         |  24 ++
 .../toponym_combination_embedding_v3.json      |  18 +
 4 files changed, 251 insertions(+), 248 deletions(-)
 create mode 100644 generate_dataset.py
 create mode 100644 lib/datageneratorv4.py
 create mode 100644 parser_config/toponym_combination_embedding_v3.json

diff --git a/combination_embeddingsv4.py b/combination_embeddingsv4.py
index b1d3745..cfbd49d 100644
--- a/combination_embeddingsv4.py
+++ b/combination_embeddingsv4.py
@@ -1,78 +1,35 @@
 # Base module
-import re
 import os
-import json
+import sys

 # Structure
 import pandas as pd
 import numpy as np
-import geopandas as gpd

 # DEEPL module
 from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout
 from keras.models import Model
-from keras import backend as K
 from keras.callbacks import ModelCheckpoint
-import tensorflow as tf
-
-# Geometry
-from shapely.geometry import Point
-
 # Custom module
-from helpers import read_geonames
-from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
+from lib.geo import zero_one_encoding
 from lib.ngram_index import NgramIndex
 from lib.utils import ConfigurationReader
-from lib.metrics import lat_accuracy,lon_accuracy
-from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
-
+from lib.geo import accuracy_k,haversine_tf_1circle
+from helpers import EpochTimer

 # Logging
-from tqdm import tqdm
 import logging
-from helpers import parse_title_wiki,EpochTimer
-
 logging.getLogger('gensim').setLevel(logging.WARNING)
-
-def get_new_ids(cooc_data,id_first_value):
-    """
-    Return new ids from cooccurrence data
-
-    Parameters
-    ----------
-    cooc_data : pd.DataFrame
-        cooccurrence da
-    id_first_value : int
-        id beginning value
-
-    Returns
-    -------
-    dict
-        new ids for each toponyms
-    """
-    topo_id = {}
-    id_ = id_first_value
-    for title in cooc_data.title.values:
-        if not title in topo_id:
-            id_+=1
-            topo_id[id_]=title
-    for interlinks in cooc_data.interlinks.values:
-        for interlink in interlinks.split("|"):
-            if not interlink in topo_id:
-                id_+=1
-                topo_id[id_]=interlink
-    return topo_id
-
-# LOGGING CONF
-logging.basicConfig(
+logging.basicConfig( # LOGGING CONF
     format='[%(asctime)s][%(levelname)s] %(message)s ',
     datefmt='%m/%d/%Y %I:%M:%S %p',
     level=logging.INFO
     )

-args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
-    .parse_args()#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
+# COMMAND ARGS
+args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
+    .parse_args("FR FR_inclusion.csv FR_adjacent.csv FR_cooc.csv -i -a -w".split())#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())

 #
 #################################################
@@ -82,235 +39,108 @@
 MODEL_NAME = "Bi-LSTM_NGRAM"
 NGRAM_SIZE = args.ngram_size
 ACCURACY_TOLERANCE = args.tolerance_value
 EPOCHS = args.epochs
-ITER_ADJACENCY = args.adjacency_iteration
-COOC_SAMPLING_NUMBER = args.cooc_sample_size
 WORDVEC_ITER = args.ngram_word2vec_iter
-EMBEDDING_DIM = 256
+EMBEDDING_DIM = args.dimension

 #################################################
 ########## FILENAME VARIABLE ####################
 #################################################
-GEONAME_FN = args.geoname_input
-DATASET_NAME = args.geoname_input.split("/")[-1]
-GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
-REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
-ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(
-        GEONAME_FN,
-        ITER_ADJACENCY,
-        REGION_SUFFIX_FN)
-
-COOC_FN = args.wikipedia_cooc_fn
-PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
-    GEONAME_FN.split("/")[-1],
-    EPOCHS,
-    NGRAM_SIZE,
-    ACCURACY_TOLERANCE,
-    REGION_SUFFIX_FN)
-
-REL_CODE=""
+INCLUSION_FN = args.geoname_inclusion
+ADJACENT_FN = args.geonames_adjacent
+COOC_FN = args.wikipedia_cooc
+
+DATASET_NAME = args.dataset_name
+
+PREFIX_OUTPUT_FN = DATASET_NAME
+PREFIX_OUTPUT_FN+="_{0}".format(NGRAM_SIZE)
+PREFIX_OUTPUT_FN+="_{0}".format(EPOCHS)
+
 if args.adjacency:
     PREFIX_OUTPUT_FN += "_A"
-    REL_CODE+= "A"
 if args.inclusion:
     PREFIX_OUTPUT_FN += "_I"
-    REL_CODE+= "I"
 if args.wikipedia_cooc:
     PREFIX_OUTPUT_FN += "_C"
-    REL_CODE+= "C"

 MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
 INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
 HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)

-from lib.utils import MetaDataSerializer
-
-meta_data = MetaDataSerializer(
-    MODEL_NAME,
-    DATASET_NAME,
-    REL_CODE,
-    COOC_SAMPLING_NUMBER,
-    ITER_ADJACENCY,
-    NGRAM_SIZE,
-    ACCURACY_TOLERANCE,
-    EPOCHS,
-    EMBEDDING_DIM,
-    WORDVEC_ITER,
-    INDEX_FN,
-    MODEL_OUTPUT_FN,
-    HISTORY_FN
-)
-meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
-
 #############################################################################################
 ################################# LOAD DATA #################################################
 #############################################################################################

-# LOAD Geonames DATA
-logging.info("Load Geonames data...")
-geoname_data = read_geonames(GEONAME_FN).fillna("")
-
-train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values)
-test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values)
-
-logging.info("Geonames data loaded!")
-
-# SELECT ENTRY with class == to A and P (Areas and Populated Places)
-filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
-#CLEAR RAM
-del geoname_data
+data_used = []
+if args.wikipedia:
+    data_used.append(pd.read_csv(COOC_FN,sep="\t"))

-# IF REGION
-if args.admin_code_1 != "None":
-    filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()
+if args.inclusion:
+    data_used.append(pd.read_csv(INCLUSION_FN,sep="\t"))

-# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
-filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
+if args.adjacency:
+    data_used.append(pd.read_csv(ADJACENT_FN, sep="\t"))

+if len(data_used) < 1:
+    print("No toponym pair type selected (use -i, -a and/or -w). Stopping the program...")
+    sys.exit(1)
+
+pairs_of_toponym = pd.concat(data_used)

 #############################################################################################
 ################################# RETRIEVE RELATIONSHIPS ####################################
 #############################################################################################
-
-# INITIALIZE RELATION STORE
-rel_store = []
-
-# Retrieve adjacency relationships
-if args.adjacency:
-    logging.info("Retrieve adjacency relationships ! ")
-
-    if not os.path.exists(ADJACENCY_REL_FILENAME):
-        bounds = get_bounds(filtered) # Required to get adjacency relationships
-        rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
-        json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
-    else:
-        logging.info("Open and load data from previous computation!")
-        rel_store=json.load(open(ADJACENCY_REL_FILENAME))
-
-    logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store)))
-
-# Retrieve inclusion relationships
-if args.inclusion:
-    logging.info("Retrieve inclusion relationships ! ")
-
-    cpt_rel = len(rel_store)
-    rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN))
-
-    logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel))
-
-
-if args.wikipedia_cooc:
-    logging.info("Load Wikipedia Cooccurrence data and merge with geonames")
-
-    cooc_data = pd.read_csv(COOC_FN,sep="\t")
-    cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
-    cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
-    id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max())
-    wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
-    title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))}
-    cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
-    filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
-    train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t")
-    if not "title" in train_cooc_indices:
-        train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
-    train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
-    test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
-
-    logging.info("Merged with Geonames data !")
-
-    # EXTRACT rel
-    logging.info("Extracting cooccurrence relationships")
-    cpt=0
-    for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"):
-        for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER):
-            cpt+=1
-            rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]])
-    logging.info("Extract {0} cooccurrence relationships !".format(cpt))
-
-
-# STORE ID to name
-geoname2name = dict(filtered["geonameid name".split()].values)
-
 # ENCODING NAME USING N-GRAM SPLITTING
 logging.info("Encoding toponyms to ngram...")
 index = NgramIndex(NGRAM_SIZE)

 # Identify all ngram available
-filtered.name.apply(lambda x : index.split_and_add(x))
-if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id]
+pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
+pairs_of_toponym.toponym_context.apply(lambda x : index.split_and_add(x))
-geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association
-
-if args.wikipedia_cooc:
-    geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()})
+num_words = len(index.index_ngram) # necessary for the embedding matrix

 # SAVE THE INDEX TO REUSE THE MODEL
 index.save(INDEX_FN)
-
 logging.info("Done !")
-
 #############################################################################################
-################################# ENCODE COORDINATES ########################################
+################################# NGRAM EMBEDDINGS ##########################################
 #############################################################################################
-
-
-# Encode each geonames entry coordinates
-geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()}
-# CLEAR RAM
-del filtered
-
-
-EMBEDDING_DIM = 256
-num_words = len(index.index_ngram) # necessary for the embedding matrix
-
-logging.info("Preparing Input and Output data...")
-
+logging.info("Generating N-GRAM Embedding...")
+embedding_weights = index.get_embedding_layer(np.concatenate((pairs_of_toponym.toponym.values,pairs_of_toponym.toponym_context.values)),dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
+logging.info("Embedding generated !")

 #############################################################################################
 ################################# BUILD TRAIN/TEST DATASETS #################################
 #############################################################################################
+logging.info("Preparing Input and Output data...")

-X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[]
-X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[]
+X_1_train,X_2_train=[],[]
+X_1_test,X_2_test=[],[]
 y_train,y_test = [],[]

-for couple in rel_store:
-    geonameId_1,geonameId_2 = couple[0],couple[1]
-    if not geonameId_1 in geoname2encodedname:
-        continue
-    top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
-    if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
-
-        X_1_train.append(top1)
-        X_2_train.append(top2)
-
-        y_train.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]])
-        #y_lon_train.append(geoname_vec[geonameId_1][0])
-        #y_lat_train.append(geoname_vec[geonameId_1][1])
-
+for couple in pairs_of_toponym["toponym toponym_context split longitude latitude".split()].itertuples():
+    top,top_c,split_ = couple[1], couple[2], couple[3]
+    coord = zero_one_encoding(couple[-2],couple[-1]) # 0 and 1 encoding
+    enc_top, enc_top_c = index.encode(top),index.encode(top_c)
+    if split_ == "train":
+        X_1_train.append(enc_top)
+        X_2_train.append(enc_top_c)
+        y_train.append(list(coord))
     else:
-        X_1_test.append(top1)
-        X_2_test.append(top2)
+        X_1_test.append(enc_top)
+        X_2_test.append(enc_top_c)
+        y_test.append(list(coord))

-        y_test.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]])
-        #y_lon_test.append(geoname_vec[geonameId_1][0])
-        #y_lat_test.append(geoname_vec[geonameId_1][1])
-
-# NUMPYZE inputs and output lists
+# "NUMPYZE" inputs and output lists
 X_1_train = np.array(X_1_train)
 X_2_train = np.array(X_2_train)
-y_lat_train = np.array(y_lat_train)
-y_lon_train = np.array(y_lon_train)
 y_train = np.array(y_train)

 X_1_test = np.array(X_1_test)
 X_2_test = np.array(X_2_test)
-y_lat_test = np.array(y_lat_test)
-y_lon_test = np.array(y_lon_test)
 y_test = np.array(y_test)

 logging.info("Data prepared !")
@@ -320,20 +150,11 @@ logging.info("Data prepared !")
 if not os.path.exists("outputs/"):
     os.makedirs("outputs/")

-#############################################################################################
-################################# NGRAM EMBEDDINGS ##########################################
-#############################################################################################
-
-
-logging.info("Generating N-GRAM Embedding...")
-embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
-logging.info("Embedding generated !")

 #############################################################################################
 ################################# MODEL DEFINITION ##########################################
 #############################################################################################
-
 input_1 = Input(shape=(index.max_len,))
 input_2 = Input(shape=(index.max_len,))
@@ -343,20 +164,15 @@ x1 = embedding_layer(input_1)
 x2 = embedding_layer(input_2)

 # Each LSTM learns on a permutation of the input toponyms
-x1 = Bidirectional(LSTM(98))(x1)
-x2 = Bidirectional(LSTM(98))(x2)
-
-x = concatenate([x1,x2])#,x3])
+x1 = Bidirectional(LSTM(100))(x1)
+x2 = Bidirectional(LSTM(100))(x2)
+x = concatenate([x1,x2])

 x1 = Dense(500,activation="relu")(x)
-# x1 = Dropout(0.3)(x1)
 x1 = Dense(500,activation="relu")(x1)
-# x1 = Dropout(0.3)(x1)

 x2 = Dense(500,activation="relu")(x)
-# x2 = Dropout(0.3)(x2)
 x2 = Dense(500,activation="relu")(x2)
-# x2 = Dropout(0.3)(x2)

 output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
 output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
@@ -364,14 +180,8 @@ output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
 output_coord = concatenate([output_lon,output_lat],name="output_coord")

 model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3
-
 model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)})

-# model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
-
-# model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
-
-
 #############################################################################################
 ################################# TRAINING LAUNCH ###########################################
 #############################################################################################
@@ -383,7 +193,7 @@ epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
 history = model.fit(x=[X_1_train,X_2_train],
-    y=y_train,#[y_lon_train,y_lat_train],
+    y=y_train,
     verbose=True, batch_size=100,
     epochs=EPOCHS,
     validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]),
@@ -397,5 +207,8 @@ model.save(MODEL_OUTPUT_FN)
 # Erase Model Checkpoint file
 if os.path.exists(MODEL_OUTPUT_FN + ".part"):
-    import shutil
-    shutil.rmtree(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
+    try:
+        import shutil
+        shutil.rmtree(MODEL_OUTPUT_FN + ".part")
+    except OSError: # the checkpoint is a file or a directory depending on the Keras version
+        os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
diff --git a/generate_dataset.py b/generate_dataset.py
new file mode 100644
index 0000000..4e873df
--- /dev/null
+++ b/generate_dataset.py
@@ -0,0 +1,148 @@
+import argparse
+
+import pandas as pd
+import numpy as np
+
+from helpers import read_geonames
+from lib.geo import latlon2healpix
+
+from tqdm import tqdm
+from sklearn.model_selection import train_test_split
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("geonames_dataset")
+parser.add_argument("wikipedia_dataset")
+parser.add_argument("geonames_hierarchy_data")
+parser.add_argument("--cooc-sampling", default=4, type=int)
+parser.add_argument("--adj-sampling", default=4, type=int)
+parser.add_argument("--adj-nside", default=128, type=int)
+parser.add_argument("--split-nside", default=128, type=int)
+parser.add_argument("--split-method", default="per_pair", type=str, choices="per_pair per_place".split())
+
+args = parser.parse_args()#("../data/geonamesData/FR.txt ../data/wikipedia/cooccurrence_FR.txt ../data/geonamesData/hierarchy.txt".split())
+
+PREFIX = args.geonames_dataset.split("/")[-1].split(".")[0] # Ouch !
+
+# LOAD DATA
+geonames_data = read_geonames(args.geonames_dataset)
+wikipedia_data = pd.read_csv(args.wikipedia_dataset, sep="\t")
+geonames_hierarchy_data = pd.read_csv(args.geonames_hierarchy_data, sep="\t", header=None,
+                                      names="parentId,childId,type".split(",")).fillna("")
+
+# Add IDs for the Wikipedia Cooc Dataset
+min_id = geonames_data.geonameid.max() + 1
+max_id = min_id + len(wikipedia_data)
+wikipedia_data["geonameid"] = np.arange(min_id, max_id)
+
+# Healpix cell id computation
+geonames_data["adj_split"] = geonames_data.apply(lambda x: latlon2healpix(x.latitude, x.longitude, args.adj_nside),
+                                                 axis=1)
+
+
+def get_adjacent_pairs(dataframe, sampling_nb):
+    """
+    Return pairs of place toponyms that are geographically adjacent.
+
+    Parameters
+    ----------
+    dataframe : pandas.DataFrame
+        geonames data
+    sampling_nb : int
+        number of adjacent places drawn for each place
+
+    Returns
+    -------
+    list of list
+        [[ID, place toponym, adjacent place toponym, latitude, longitude], ...]
+    """
+    new_pairs = []
+    for ix, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc="Get Adjacent Toponym Pairs"):
+        healpix_cell = row.adj_split
+        topo_prin = row["name"]
+        lat, lon = row.latitude, row.longitude
+        within_cell = dataframe[dataframe.adj_split == healpix_cell]["name"].values
+        selected = np.random.choice(within_cell, sampling_nb)
+        new_pairs.extend([[row.geonameid, topo_prin, sel, lat, lon] for sel in selected])
+    return new_pairs
+
+
+def get_cooccurrence_pairs(dataframe, sampling_nb):
+    """
+    Return pairs of place toponyms that appear in the same Wikipedia page.
+
+    Parameters
+    ----------
+    dataframe : pandas.DataFrame
+        wikipedia cooccurrence data
+    sampling_nb : int
+        number of co-occurring places drawn for each place
+
+    Returns
+    -------
+    list of list
+        [[ID, place toponym, co-occurring place toponym, latitude, longitude], ...]
+    """
+    new_pairs = []
+    dataframe["interlinks"] = dataframe.interlinks.apply(lambda x: np.random.choice(x.split("|"), sampling_nb))
+    for ix, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc="Get Cooccurrent Toponym Pairs"):
+        topo_prin = row.title
+        lat, lon = row.latitude, row.longitude
+        new_pairs.extend([[row.geonameid, topo_prin, sel, lat, lon] for sel in row["interlinks"]])
+    return new_pairs
+
+
+def get_inclusion_pairs(geoname_df, hierarchy_df):
+    """
+    Return pairs of place toponyms that share an inclusion relationship. Ex. Paris, France (Paris geometry is included in France geometry)
+
+    Parameters
+    ----------
+    geoname_df : pandas.DataFrame
+        geonames data
+    hierarchy_df : pandas.DataFrame
+        geonames hierarchy data
+
+    Returns
+    -------
+    list of list
+        [[ID, place toponym, parent place toponym, latitude, longitude], ...]
+    """
+    geonamesIDS = set(geoname_df.geonameid.values)
+    id_label = dict(geonames_data["geonameid name".split()].values)
+    id_lat = dict(geonames_data["geonameid latitude".split()].values)
+    id_lon = dict(geonames_data["geonameid longitude".split()].values)
+    filter_mask = (hierarchy_df.childId.isin(geonamesIDS) & hierarchy_df.parentId.isin(geonamesIDS))
+    pairs_id = hierarchy_df[filter_mask]["childId parentId".split()].values.tolist()
+    return [[p[0], id_label[p[0]], id_label[p[1]], id_lat[p[0]], id_lon[p[0]]] for p in pairs_id]
+
+# EXTRACT PAIRS FROM INPUT DATA
+cooc_pairs = pd.DataFrame(get_cooccurrence_pairs(wikipedia_data, args.cooc_sampling),
+                          columns="ID toponym toponym_context latitude longitude".split())
+adjacent_pairs = pd.DataFrame(get_adjacent_pairs(geonames_data, args.adj_sampling),
+                              columns="ID toponym toponym_context latitude longitude".split())
+inclusion_pairs = pd.DataFrame(get_inclusion_pairs(geonames_data, geonames_hierarchy_data),
+                               columns="ID toponym toponym_context latitude longitude".split())
+
+# FOR EACH PAIR, COMPUTE THE HEALPIX CELL ID OF THE ASSOCIATED COORDINATES
+cooc_pairs["hp_split"] = cooc_pairs.apply(lambda x: latlon2healpix(x.latitude, x.longitude, args.split_nside), axis=1)
+adjacent_pairs["hp_split"] = adjacent_pairs.apply(lambda x: latlon2healpix(x.latitude, x.longitude, args.split_nside),
+                                                  axis=1)
+inclusion_pairs["hp_split"] = inclusion_pairs.apply(lambda x: latlon2healpix(x.latitude, x.longitude, args.split_nside),
+                                                    axis=1)
+
+# SPLIT DATASETS BETWEEN TRAIN AND TEST GEOGRAPHICALLY
+field = "hp_split"
+if args.split_method == "per_place":
+    field = "ID"
+
+for df in [cooc_pairs, adjacent_pairs]:
+    df_train, _ = train_test_split(df, stratify=df[field].values, test_size=0.33)
+    df["split"] = "test"
+    df.loc[df_train.index.values, "split"] = "train"
+
+inc_train, _ = train_test_split(inclusion_pairs, test_size=0.33)
+inclusion_pairs["split"] = "test"
+inclusion_pairs.loc[inc_train.index.values, "split"] = "train"
+
+# SAVE DATA
+inclusion_pairs.to_csv("{0}_inclusion.csv".format(PREFIX), sep="\t")
+adjacent_pairs.to_csv("{0}_adjacent.csv".format(PREFIX), sep="\t")
+cooc_pairs.to_csv("{0}_cooc.csv".format(PREFIX), sep="\t")
diff --git a/lib/datageneratorv4.py b/lib/datageneratorv4.py
new file mode 100644
index 0000000..f413a63
--- /dev/null
+++ b/lib/datageneratorv4.py
@@ -0,0 +1,24 @@
+import os
+from gzip import GzipFile
+
+import keras
+from keras.utils import to_categorical
+import numpy as np
+import pandas as pd
+
+from .geo import zero_one_encoding
+
+from helpers import parse_title_wiki,read_geonames
+from gensim.models.keyedvectors import KeyedVectors
+
+from sklearn.preprocessing import LabelEncoder
+
+
+class DataGenerator(keras.utils.Sequence):
+    def __init__(self,*dataset):
+        pass
+    def __next__(self):
+        pass
+    def isOver(self):
+        pass
+
diff --git a/parser_config/toponym_combination_embedding_v3.json b/parser_config/toponym_combination_embedding_v3.json
new file mode 100644
index 0000000..37f9ae5
--- /dev/null
+++ b/parser_config/toponym_combination_embedding_v3.json
@@ -0,0 +1,18 @@
+{
+    "description": "Toponym Combination",
+    "args": [
+        { "short": "dataset_name", "help": "Name of the dataset, used as prefix for the output files." },
+        { "short": "geoname_inclusion", "help": "Filepath of the inclusion toponym pairs file produced by generate_dataset.py." },
+        { "short": "geonames_adjacent", "help": "Filepath of the adjacent toponym pairs file produced by generate_dataset.py." },
+        { "long": "wikipedia_cooc", "help": "Filepath of the co-occurrence toponym pairs file produced by generate_dataset.py." },
+        { "short": "-v", "long": "--verbose", "action": "store_true" },
+        { "short": "-i", "long": "--inclusion", "action": "store_true" },
+        { "short": "-a", "long": "--adjacency", "action": "store_true" },
+        { "short": "-w", "long": "--wikipedia", "action": "store_true" },
+        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+        { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
+        { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
+        { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
+        { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }
+    ]
+}
\ No newline at end of file
-- 
GitLab
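
For reference, given the argparse interface of generate_dataset.py and the v3 parser configuration introduced above, the intended two-step workflow would presumably look as follows. The data paths are taken from the commented-out examples in the diff, and the python3 invocation is an assumption, not part of the patch:

    # 1. Build the toponym pair files; with FR.txt as input this writes FR_inclusion.csv, FR_adjacent.csv and FR_cooc.csv
    python3 generate_dataset.py ../data/geonamesData/FR.txt ../data/wikipedia/cooccurrence_FR.txt ../data/geonamesData/hierarchy.txt --cooc-sampling 4 --adj-sampling 4 --split-method per_pair

    # 2. Train the Bi-LSTM n-gram model on those pairs, with all three relation types enabled
    python3 combination_embeddingsv4.py FR FR_inclusion.csv FR_adjacent.csv FR_cooc.csv -i -a -w

Note that combination_embeddingsv4.py currently hard-codes this argument string in its parse_args() call, so the second command only takes effect once parse_args() is again called without arguments.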