Commit c3cf2f89 authored by Jacques Fize

UPD

parent be55584e
@@ -72,7 +72,7 @@ logging.basicConfig(
 )
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
-    .parse_args()#("-w --wikipedia-cooc-fn subsetCoocALL.csv ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split())
+    .parse_args()#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
 #
 #################################################
# Base module
import re
import os
import json
# Structure
import pandas as pd
import numpy as np
import geopandas as gpd
# DEEPL module
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout
from keras.models import Model
from keras import backend as K
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
# Geometry
from shapely.geometry import Point
# Custom module
from helpers import read_geonames
from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader
from lib.metrics import lat_accuracy,lon_accuracy
from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
# Logging
from tqdm import tqdm
import logging
from helpers import parse_title_wiki,EpochTimer
logging.getLogger('gensim').setLevel(logging.WARNING)
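# NOTE: Wikipedia titles extracted from the cooccurrence file receive synthetic ids that continue
# after the largest geonameid (see the call to get_new_ids further down), so GeoNames entries and
# Wikipedia pages share a single id space.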
def get_new_ids(cooc_data,id_first_value):
"""
Return new ids from cooccurrence data
Parameters
----------
cooc_data : pd.DataFrame
cooccurrence da
id_first_value : int
id beginning value
Returns
-------
dict
new ids for each toponyms
"""
topo_id = {}
id_ = id_first_value
for title in cooc_data.title.values:
if not title in topo_id:
id_+=1
topo_id[id_]=title
for interlinks in cooc_data.interlinks.values:
for interlink in interlinks.split("|"):
if not interlink in topo_id:
id_+=1
topo_id[id_]=interlink
return topo_id
# LOGGING CONF
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
.parse_args()#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
#
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
MODEL_NAME = "Bi-LSTM_NGRAM"
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = 50#args.tolerance_value
EPOCHS = args.epochs
ITER_ADJACENCY = args.adjacency_iteration
COOC_SAMPLING_NUMBER = args.cooc_sample_size
WORDVEC_ITER = args.ngram_word2vec_iter
EMBEDDING_DIM = 256
#################################################
########## FILENAME VARIABLE ####################
#################################################
GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(
GEONAME_FN,
ITER_ADJACENCY,
REGION_SUFFIX_FN)
COOC_FN = args.wikipedia_cooc_fn
PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
GEONAME_FN.split("/")[-1],
EPOCHS,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
REGION_SUFFIX_FN)
REL_CODE=""
if args.adjacency:
PREFIX_OUTPUT_FN += "_A"
REL_CODE+= "A"
if args.inclusion:
PREFIX_OUTPUT_FN += "_I"
REL_CODE+= "I"
if args.wikipedia_cooc:
PREFIX_OUTPUT_FN += "_C"
REL_CODE+= "C"
MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
from lib.utils import MetaDataSerializer
meta_data = MetaDataSerializer(
MODEL_NAME,
DATASET_NAME,
REL_CODE,
COOC_SAMPLING_NUMBER,
ITER_ADJACENCY,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
EPOCHS,
EMBEDDING_DIM,
WORDVEC_ITER,
INDEX_FN,
MODEL_OUTPUT_FN,
HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
#############################################################################################
################################# LOAD DATA #################################################
#############################################################################################
# LOAD Geonames DATA
logging.info("Load Geonames data...")
geoname_data = read_geonames(GEONAME_FN).fillna("")
train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values)
test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values)
logging.info("Geonames data loaded!")
# SELECT ENTRY with class == to A and P (Areas and Populated Places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
#CLEAR RAM
del geoname_data
# IF REGION
if args.admin_code_1 != "None":
filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()
# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
#############################################################################################
################################# RETRIEVE RELATIONSHIPS ####################################
#############################################################################################
# INITIALIZE RELATION STORE
rel_store = []
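# Each relationship is stored as a pair [geonameid_1, geonameid_2]; in the dataset-building loop
# further down, every pair becomes one example whose target is the coordinates of geonameid_1.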
# Retrieve adjacency relationships
if args.adjacency:
logging.info("Retrieve adjacency relationships ! ")
if not os.path.exists(ADJACENCY_REL_FILENAME):
bounds = get_bounds(filtered) # Required to get adjacency relationships
rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
else:
logging.info("Open and load data from previous computation!")
rel_store=json.load(open(ADJACENCY_REL_FILENAME))
logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store)))
# Retrieve inclusion relationships
if args.inclusion:
logging.info("Retrieve inclusion relationships ! ")
cpt_rel = len(rel_store)
rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN))
logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel))
if args.wikipedia_cooc:
logging.info("Load Wikipedia Cooccurrence data and merge with geonames")
cooc_data = pd.read_csv(COOC_FN,sep="\t")
cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max())
wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))}
cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t")
if not "title" in train_cooc_indices:
train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
logging.info("Merged with Geonames data !")
# EXTRACT rel
logging.info("Extracting cooccurrence relationships")
cpt=0
for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"):
for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER):
cpt+=1
rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]])
logging.info("Extract {0} cooccurrence relationships !".format(cpt))
# STORE ID to name
geoname2name = dict(filtered["geonameid name".split()].values)
# ENCODING NAME USING N-GRAM SPLITTING
logging.info("Encoding toponyms to ngram...")
index = NgramIndex(NGRAM_SIZE)
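# NgramIndex (description inferred from how it is used here) splits every toponym into character
# n-grams of size NGRAM_SIZE and assigns each n-gram an integer id; encode() presumably returns a
# fixed-length sequence of those ids (index.max_len), which is what the Input layers below expect.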
# Identify all ngram available
filtered.name.apply(lambda x : index.split_and_add(x))
if args.wikipedia_cooc:
    for title in wikipediatitle_id:
        index.split_and_add(title)
geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association
if args.wikipedia_cooc:
geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()})
# SAVE THE INDEX TO REUSE THE MODEL
index.save(INDEX_FN)
logging.info("Done !")
#############################################################################################
################################# ENCODE COORDINATES ########################################
#############################################################################################
# Encode each geonames entry coordinates
geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()}
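# zero_one_encoding presumably rescales (longitude, latitude) into [0,1]x[0,1]; this matches the
# sigmoid activation of the two coordinate outputs defined in the model below.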
# CLEAR RAM
del filtered
EMBEDDING_DIM = 256
num_words = len(index.index_ngram) # necessary for the embedding matrix
logging.info("Preparing Input and Output data...")
#############################################################################################
################################# BUILD TRAIN/TEST DATASETS #################################
#############################################################################################
X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[]
X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[]
y_train,y_test = [],[]
for couple in rel_store:
geonameId_1,geonameId_2 = couple[0],couple[1]
if not geonameId_1 in geoname2encodedname:
continue
top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
X_1_train.append(top1)
X_2_train.append(top2)
y_train.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]])
#y_lon_train.append(geoname_vec[geonameId_1][0])
#y_lat_train.append(geoname_vec[geonameId_1][1])
else:
X_1_test.append(top2)
X_2_test.append(top1)
y_test.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]])
#y_lon_test.append(geoname_vec[geonameId_1][0])
#y_lat_test.append(geoname_vec[geonameId_1][1])
# NUMPYZE inputs and output lists
X_1_train = np.array(X_1_train)
X_2_train = np.array(X_2_train)
y_lat_train = np.array(y_lat_train)
y_lon_train = np.array(y_lon_train)
y_train = np.array(y_train)
X_1_test = np.array(X_1_test)
X_2_test = np.array(X_2_test)
y_lat_test = np.array(y_lat_test)
y_lon_test = np.array(y_lon_test)
y_test = np.array(y_test)
logging.info("Data prepared !")
# check for output dir
if not os.path.exists("outputs/"):
os.makedirs("outputs/")
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
logging.info("Generating N-GRAM Embedding...")
embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
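# get_embedding_layer presumably trains a gensim word2vec model on the encoded toponyms for
# WORDVEC_ITER iterations (hence the gensim logger silenced at the top of the file) and returns
# an EMBEDDING_DIM-sized weight matrix used to initialise the frozen Embedding layer below.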
logging.info("Embedding generated !")
#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))
embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)
# Each LSTM learn on a permutation of the input toponyms
x1 = Bidirectional(LSTM(98))(x1)
x2 = Bidirectional(LSTM(98))(x2)
x = concatenate([x1,x2])#,x3])
x1 = Dense(500,activation="relu")(x)
# x1 = Dropout(0.3)(x1)
x1 = Dense(500,activation="relu")(x1)
# x1 = Dropout(0.3)(x1)
x2 = Dense(500,activation="relu")(x)
# x2 = Dropout(0.3)(x2)
x2 = Dense(500,activation="relu")(x2)
# x2 = Dropout(0.3)(x2)
output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
output_coord = concatenate([output_lon,output_lat],name="output_coord")
model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3
model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)})
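# The model takes a pair of toponyms and predicts the [0,1]-encoded coordinates of the target
# toponym (geonameid_1 of each pair). haversine_tf_1circle is used as the loss and
# accuracy_k(ACCURACY_TOLERANCE) presumably counts predictions that fall within
# ACCURACY_TOLERANCE kilometres of the true location.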
# model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
# model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
save_best_only=True, mode='auto', period=1)
epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
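# EpochTimer presumably logs the wall-clock duration of each epoch to the given CSV file; it is
# passed to model.fit below as a Keras callback.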
history = model.fit(x=[X_1_train,X_2_train],
y=y_train,#[y_lon_train,y_lat_train],
verbose=True, batch_size=100,
epochs=EPOCHS,
validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]),
callbacks=[checkpoint,epoch_timer])
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)
model.save(MODEL_OUTPUT_FN)
# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
    # The checkpoint is a single .h5 file, so os.remove is used (shutil.rmtree only handles directories)
    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
# Base module
import re
import os
import json
# Structure
import pandas as pd
import numpy as np
import geopandas as gpd
# DEEPL module
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout
from keras.models import Model
from keras import backend as K
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
# Geometry
from shapely.geometry import Point
# Custom module
from helpers import read_geonames
from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader
from lib.metrics import lat_accuracy,lon_accuracy
from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
# Logging
from tqdm import tqdm
import logging
from helpers import parse_title_wiki,EpochTimer
logging.getLogger('gensim').setLevel(logging.WARNING)
def get_new_ids(cooc_data,id_first_value):
"""
Return new ids from cooccurrence data
Parameters
----------
cooc_data : pd.DataFrame
cooccurrence da
id_first_value : int
id beginning value
Returns
-------
dict
new ids for each toponyms
"""
topo_id = {}
id_ = id_first_value
for title in cooc_data.title.values:
if not title in topo_id:
id_+=1
topo_id[id_]=title
for interlinks in cooc_data.interlinks.values:
for interlink in interlinks.split("|"):
if not interlink in topo_id:
id_+=1
topo_id[id_]=interlink
return topo_id
# LOGGING CONF
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
.parse_args()#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
#
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
MODEL_NAME = "Bi-LSTM_NGRAM"
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.tolerance_value
EPOCHS = args.epochs
ITER_ADJACENCY = args.adjacency_iteration
COOC_SAMPLING_NUMBER = args.cooc_sample_size
WORDVEC_ITER = args.ngram_word2vec_iter
EMBEDDING_DIM = 256
#################################################
########## FILENAME VARIABLE ####################
#################################################
GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(
GEONAME_FN,
ITER_ADJACENCY,
REGION_SUFFIX_FN)
COOC_FN = args.wikipedia_cooc_fn
PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
GEONAME_FN.split("/")[-1],
EPOCHS,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
REGION_SUFFIX_FN)
REL_CODE=""
if args.adjacency:
PREFIX_OUTPUT_FN += "_A"
REL_CODE+= "A"
if args.inclusion:
PREFIX_OUTPUT_FN += "_I"
REL_CODE+= "I"
if args.wikipedia_cooc:
PREFIX_OUTPUT_FN += "_C"
REL_CODE+= "C"
MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
from lib.utils import MetaDataSerializer
meta_data = MetaDataSerializer(
MODEL_NAME,
DATASET_NAME,
REL_CODE,
COOC_SAMPLING_NUMBER,
ITER_ADJACENCY,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
EPOCHS,
EMBEDDING_DIM,
WORDVEC_ITER,
INDEX_FN,
MODEL_OUTPUT_FN,
HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
#############################################################################################
################################# LOAD DATA #################################################
#############################################################################################
# LOAD Geonames DATA
logging.info("Load Geonames data...")
geoname_data = read_geonames(GEONAME_FN).fillna("")
train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values)
test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values)
logging.info("Geonames data loaded!")
# SELECT ENTRY with class == to A and P (Areas and Populated Places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
#CLEAR RAM
del geoname_data
# IF REGION
if args.admin_code_1 != "None":
filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()
# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
#############################################################################################
################################# RETRIEVE RELATIONSHIPS ####################################
#############################################################################################
# INITIALIZE RELATION STORE
rel_store = []
# Retrieve adjacency relationships
if args.adjacency:
logging.info("Retrieve adjacency relationships ! ")
if not os.path.exists(ADJACENCY_REL_FILENAME):
bounds = get_bounds(filtered) # Required to get adjacency relationships
rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
else:
logging.info("Open and load data from previous computation!")
rel_store=json.load(open(ADJACENCY_REL_FILENAME))
logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store)))
# Retrieve inclusion relationships
if args.inclusion:
logging.info("Retrieve inclusion relationships ! ")
cpt_rel = len(rel_store)
rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN))
logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel))
if args.wikipedia_cooc:
logging.info("Load Wikipedia Cooccurrence data and merge with geonames")
cooc_data = pd.read_csv(COOC_FN,sep="\t")
cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max())
wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))}
cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t")
if not "title" in train_cooc_indices:
train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
logging.info("Merged with Geonames data !")
# EXTRACT rel
logging.info("Extracting cooccurrence relationships")
cpt=0
for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"):
for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER):
cpt+=1
rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]])
logging.info("Extract {0} cooccurrence relationships !".format(cpt))
# STORE ID to name
geoname2name = dict(filtered["geonameid name".split()].values)
# ENCODING NAME USING N-GRAM SPLITTING
logging.info("Encoding toponyms to ngram...")
index = NgramIndex(NGRAM_SIZE)
# Identify all ngram available
filtered.name.apply(lambda x : index.split_and_add(x))
if args.wikipedia_cooc:
    for title in wikipediatitle_id:
        index.split_and_add(title)
geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association
if args.wikipedia_cooc:
geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()})
# SAVE THE INDEX TO REUSE THE MODEL
index.save(INDEX_FN)
logging.info("Done !")
#############################################################################################
################################# ENCODE COORDINATES ########################################
#############################################################################################
# Encode each geonames entry coordinates
geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()}
# CLEAR RAM
del filtered
EMBEDDING_DIM = 256
num_words = len(index.index_ngram) # necessary for the embedding matrix
logging.info("Preparing Input and Output data...")
#############################################################################################
################################# BUILD TRAIN/TEST DATASETS #################################
#############################################################################################
X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[]
X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[]
y_train,y_test = [],[]
for couple in rel_store:
geonameId_1,geonameId_2 = couple[0],couple[1]
if not geonameId_1 in geoname2encodedname:
continue
top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
X_1_train.append(top1)
X_2_train.append(top2)
y_train.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]])
#y_lon_train.append(geoname_vec[geonameId_1][0])
#y_lat_train.append(geoname_vec[geonameId_1][1])
else:
X_1_test.append(top1)
X_2_test.append(top2)
y_test.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]])
#y_lon_test.append(geoname_vec[geonameId_1][0])
#y_lat_test.append(geoname_vec[geonameId_1][1])
# NUMPYZE inputs and output lists
X_1_train = np.array(X_1_train)
X_2_train = np.array(X_2_train)
y_lat_train = np.array(y_lat_train)
y_lon_train = np.array(y_lon_train)
y_train = np.array(y_train)
X_1_test = np.array(X_1_test)
X_2_test = np.array(X_2_test)
y_lat_test = np.array(y_lat_test)
y_lon_test = np.array(y_lon_test)
y_test = np.array(y_test)
logging.info("Data prepared !")
# check for output dir
if not os.path.exists("outputs/"):
os.makedirs("outputs/")
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
logging.info("Generating N-GRAM Embedding...")
embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
logging.info("Embedding generated !")
#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))
embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)
# Each LSTM learn on a permutation of the input toponyms
x1 = Bidirectional(LSTM(98))(x1)
x2 = Bidirectional(LSTM(98))(x2)
x = concatenate([x1,x2])#,x3])
x1 = Dense(500,activation="relu")(x)
# x1 = Dropout(0.3)(x1)
x1 = Dense(500,activation="relu")(x1)
# x1 = Dropout(0.3)(x1)
x2 = Dense(500,activation="relu")(x)
# x2 = Dropout(0.3)(x2)
x2 = Dense(500,activation="relu")(x2)
# x2 = Dropout(0.3)(x2)
output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
output_coord = concatenate([output_lon,output_lat],name="output_coord")
model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3
model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)})
# model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
# model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
save_best_only=True, mode='auto', period=1)
epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
history = model.fit(x=[X_1_train,X_2_train],
y=y_train,#[y_lon_train,y_lat_train],
verbose=True, batch_size=100,
epochs=EPOCHS,
validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]),
callbacks=[checkpoint,epoch_timer])
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)
model.save(MODEL_OUTPUT_FN)
# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
    # The checkpoint is a single .h5 file, so os.remove is used (shutil.rmtree only handles directories)
    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
@@ -4,7 +4,7 @@ c_f = "--wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt"
 # Init GridsearchModel
 grid = GridSearchModel(\
-    "python3 combination_embeddingsv3.py",
+    "python3 combination_embeddingsv3inverse.py",
     **OrderedDict({ # necessary because some args have to be given in a certain order
         "rel":["-w "+c_f,("-i -w "+c_f),"-a -w "+c_f,"-a -i -w "+c_f], # ,"-a -i -w "+c_f ,"-i -a"
         "-n":[4],
+import pandas as pd
+import numpy as np
 from lib.geocoder import Geocoder
 geocoder = Geocoder("./outputs/FR_MODEL_2/FR.txt_100_4_100__A_I_C.h5","./outputs/FR_MODEL_2/FR.txt_100_4_100__A_I_C_index")
-import pandas as pd
 df = pd.read_csv("data/rando_toponymes.tsv",sep="\t")
 df["name"]=df.name.apply(lambda x:x.split("¦")[0])
@@ -43,6 +43,12 @@
     }).addTo(mymap);
     {% if lat and lon %}
     var marker = L.marker([{{lat}}, {{lon}}]).addTo(mymap);
+    var circle = L.circle([{{lat}}, {{lon}}], {
+        color: "red",
+        fillColor: "#f03",
+        fillOpacity: 0.5,
+        radius: 100000.0
+    }).addTo(mymap);
     {% endif %}
 </script>
@@ -46,6 +46,12 @@
     var mark = L.marker([{{coords["lat"]}}, {{coords["lon"]}}],);
     mark.bindPopup("{{place}}")
     mark.addTo(mymap);
+    var circle = L.circle([{{coords["lat"]}}, {{coords["lon"]}}], {
+        color: "red",
+        fillColor: "#f03",
+        fillOpacity: 0.5,
+        radius: 100000.0
+    }).addTo(mymap);
 {% endfor %}
 </script>