Jacques Fize / Toponym Geocoding / Commits
Commit 5ea67e5e, authored 4 years ago by Jacques Fize
add new version of the model. NEW : loss is based on haversine distance
parent 810b1dcf
Showing 3 changed files with 264 additions and 73 deletions:

- combination_embeddings.py (+6 −0)
- combination_embeddingsv3.py (+257 −72)
- parser_config/toponym_combination_embedding.json (+1 −1)
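Note on the headline change: the v3 script trains a single concatenated coordinate output against haversine_tf_1circle, imported from lib.geo. lib/geo.py is not part of this diff, so the snippet below is only a minimal sketch of what such a great-circle loss can look like, assuming (as elsewhere in the script) that targets are zero-one encoded (longitude, latitude) pairs; all names and the exact scaling here are illustrative assumptions, not the repository's actual implementation.

import tensorflow as tf

EARTH_RADIUS_KM = 6371.0
PI = 3.141592653589793

def haversine_km(y_true, y_pred):
    # (batch, 2) tensors of zero-one encoded (lon, lat):
    # undo the [0,1] encoding, then convert degrees to radians
    lon1 = (y_true[:, 0] * 360.0 - 180.0) * PI / 180.0
    lat1 = (y_true[:, 1] * 180.0 - 90.0) * PI / 180.0
    lon2 = (y_pred[:, 0] * 360.0 - 180.0) * PI / 180.0
    lat2 = (y_pred[:, 1] * 180.0 - 90.0) * PI / 180.0
    a = tf.sin((lat2 - lat1) / 2.0) ** 2 \
        + tf.cos(lat1) * tf.cos(lat2) * tf.sin((lon2 - lon1) / 2.0) ** 2
    # clipping keeps sqrt/asin numerically stable for identical or antipodal points
    return 2.0 * EARTH_RADIUS_KM * tf.asin(tf.sqrt(tf.clip_by_value(a, 0.0, 1.0)))

def haversine_loss(y_true, y_pred):
    # mean great-circle distance in km over the batch, usable as a Keras loss
    return tf.reduce_mean(haversine_km(y_true, y_pred))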
combination_embeddings.py  +6 −0  (view file @ 5ea67e5e)
@@ -25,6 +25,8 @@ from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inc
 from lib.ngram_index import NgramIndex
 from lib.utils import ConfigurationReader
 from lib.metrics import lat_accuracy,lon_accuracy
+from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
 # Logging
 from tqdm import tqdm
@@ -300,11 +302,13 @@ X_1_train = np.array(X_1_train)
 X_2_train = np.array(X_2_train)
 y_lat_train = np.array(y_lat_train)
 y_lon_train = np.array(y_lon_train)
+y_train = np.array(y_train)
 X_1_test = np.array(X_1_test)
 X_2_test = np.array(X_2_test)
 y_lat_test = np.array(y_lat_test)
 y_lon_test = np.array(y_lon_test)
+y_test = np.array(y_test)
 logging.info("Data prepared !")
@@ -354,6 +358,8 @@ x2 = Dense(500,activation="relu")(x2)
 output_lon = Dense(1, activation="sigmoid", name="Output_LON")(x1)
 output_lat = Dense(1, activation="sigmoid", name="Output_LAT")(x2)
 model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
 model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
 ...
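In this commit combination_embeddings.py keeps its mean_squared_error losses; it only gains the haversine imports and the packed y_train/y_test arrays, whose rows are [lon, lat] values produced by zero_one_encoding. zero_one_encoding also lives in lib.geo and is not shown here; a plausible reading, assuming plain min-max scaling so that both coordinates can be regressed by the model's sigmoid outputs:

def zero_one_encoding(lon, lat):
    # hypothetical reconstruction: map lon in [-180,180] and lat in [-90,90] to [0,1]
    return (lon + 180.0) / 360.0, (lat + 90.0) / 180.0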
combination_embeddingsv3.py  +257 −72  (view file @ 5ea67e5e)
@@ -1,34 +1,70 @@
 # Base module
+import re
 import os
+import json
 
 # Structure
 import pandas as pd
 import numpy as np
+import geopandas as gpd
 
 # DEEPL module
-from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout
 from keras.models import Model
-from keras import backend as K
 from keras.callbacks import ModelCheckpoint
+from tensorflow.keras.layers import Lambda
+import keras.backend as K
 import tensorflow as tf
-from lib.custom_layer import *
+
+# Geometry
+from shapely.geometry import Point
 
 # Custom module
+from helpers import read_geonames
+from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
 from lib.ngram_index import NgramIndex
-from lib.utils import ConfigurationReader, MetaDataSerializer, LabelEncoder
+from lib.utils import ConfigurationReader
 from lib.metrics import lat_accuracy,lon_accuracy
-from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
 from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
 
 # Logging
+from tqdm import tqdm
 import logging
+from helpers import parse_title_wiki,EpochTimer
 logging.getLogger('gensim').setLevel(logging.WARNING)
-from helpers import EpochTimer
+
+def get_new_ids(cooc_data,id_first_value):
+    """
+    Return new ids from cooccurrence data
+
+    Parameters
+    ----------
+    cooc_data : pd.DataFrame
+        cooccurrence da
+    id_first_value : int
+        id beginning value
+
+    Returns
+    -------
+    dict
+        new ids for each toponyms
+    """
+    topo_id = {}
+    id_ = id_first_value
+    for title in cooc_data.title.values:
+        if not title in topo_id:
+            id_ += 1
+            topo_id[id_] = title
+    for interlinks in cooc_data.interlinks.values:
+        for interlink in interlinks.split("|"):
+            if not interlink in topo_id:
+                id_ += 1
+                topo_id[id_] = interlink
+    return topo_id
 
 # LOGGING CONF
 logging.basicConfig(
@@ -35,41 +71,40 @@ logging.basicConfig(
     level=logging.INFO
     )
 
-args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
-    .parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
+args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
+    .parse_args()#("-w --wikipedia-cooc-fn subsetCoocALL.csv ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split())
+#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
 
 #
 #################################################
 ############# MODEL TRAINING PARAMETER ##########
 #################################################
+MODEL_NAME = "Bi-LSTM_NGRAM"
 NGRAM_SIZE = args.ngram_size
-ACCURACY_TOLERANCE = args.k_value
+ACCURACY_TOLERANCE = args.tolerance_value
 EPOCHS = args.epochs
-ADJACENCY_SAMPLING = args.adjacency_sample
-COOC_SAMPLING = args.cooc_sample
-WORDVEC_ITER = 50
-EMBEDDING_DIM = args.dimension
-BATCH_SIZE = args.batch_size
+ITER_ADJACENCY = args.adjacency_iteration
+COOC_SAMPLING_NUMBER = args.cooc_sample_size
+WORDVEC_ITER = args.ngram_word2vec_iter
+EMBEDDING_DIM = 256
 #################################################
 ########## FILENAME VARIABLE ####################
 #################################################
+# check for output dir
+if not os.path.exists("outputs/"):
+    os.makedirs("outputs/")
 GEONAME_FN = args.geoname_input
 DATASET_NAME = args.geoname_input.split("/")[-1]
-GEONAMES_HIERARCHY_FN = args.inclusion_fn
-ADJACENCY_REL_FILENAME = args.adjacency_fn
+GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
+REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
+ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(GEONAME_FN,ITER_ADJACENCY,REGION_SUFFIX_FN)
 COOC_FN = args.wikipedia_cooc_fn
-PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
+PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
     GEONAME_FN.split("/")[-1],
     EPOCHS,
     NGRAM_SIZE,
-    ACCURACY_TOLERANCE)
+    ACCURACY_TOLERANCE,
+    REGION_SUFFIX_FN)
 
 REL_CODE = ""
 if args.adjacency:
@@ -86,12 +121,14 @@ MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
 INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
 HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
 
+from lib.utils import MetaDataSerializer
 meta_data = MetaDataSerializer(
+    MODEL_NAME,
     DATASET_NAME,
     REL_CODE,
-    COOC_SAMPLING,
-    ADJACENCY_SAMPLING,
+    COOC_SAMPLING_NUMBER,
+    ITER_ADJACENCY,
     NGRAM_SIZE,
     ACCURACY_TOLERANCE,
     EPOCHS,
@@ -103,92 +140,238 @@ meta_data = MetaDataSerializer(
 )
 meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
 
-index = NgramIndex.load(args.ngram_index_fn)
-
-train_src = []
-test_src = []
-
-class_encoder = LabelEncoder()
-
-if args.wikipedia_cooc:
-    train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4,use_healpix=False))
-    test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4,use_healpix=False))
+#############################################################################################
+################################# LOAD DATA #################################################
+#############################################################################################
+### PUT DATASRC + GENERATOR
+
+# LOAD Geonames DATA
+logging.info("Load Geonames data...")
+geoname_data = read_geonames(GEONAME_FN).fillna("")
+train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values)
+test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values)
+logging.info("Geonames data loaded!")
+
+# SELECT ENTRY with class == to A and P (Areas and Populated Places)
+filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
+#CLEAR RAM
+del geoname_data
+
+# IF REGION
+if args.admin_code_1 != "None":
+    filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()
+
+# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
+filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
+
+#############################################################################################
+################################# RETRIEVE RELATIONSHIPS ####################################
+#############################################################################################
+
+# INITIALIZE RELATION STORE
+rel_store = []
+
+# Retrieve adjacency relationships
 if args.adjacency:
-    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
-    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
-    train_src.append(a_train)
-    test_src.append(a_test)
+    logging.info("Retrieve adjacency relationships !")
+    if not os.path.exists(ADJACENCY_REL_FILENAME):
+        bounds = get_bounds(filtered) # Required to get adjacency relationships
+        rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
+        json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
+    else:
+        logging.info("Open and load data from previous computation!")
+        rel_store = json.load(open(ADJACENCY_REL_FILENAME))
+    logging.info("{0} adjacency relationships retrieved !".format(len(rel_store)))
 
+# Retrieve inclusion relationships
 if args.inclusion:
-    i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN + "_train.csv")
-    i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN + "_test.csv")
-    train_src.append(i_train)
-    test_src.append(i_test)
-
-print("Number of classes: ",class_encoder.get_num_classes())
-
-d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE)
-d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE)
-
-num_words = len(index.index_ngram)
+    logging.info("Retrieve inclusion relationships !")
+    cpt_rel = len(rel_store)
+    rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN))
+    #Adjacency
+    logging.info("{0} inclusion relationships retrieved !".format(len(rel_store)-cpt_rel))
+
+if args.wikipedia_cooc:
+    logging.info("Load Wikipedia Cooccurrence data and merge with geonames")
+    cooc_data = pd.read_csv(COOC_FN,sep="\t")
+    cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
+    cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
+    id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max())
+    wikipediatitle_id = {v: k for k,v in id_wikipediatitle.items()}
+    title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))}
+    cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
+    filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title": "name"}).copy()))
+    train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"),pd.read_csv(COOC_FN+"_test.csv",sep="\t")
+    if not "title" in train_cooc_indices:
+        train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"),pd.read_csv(COOC_FN+"_test.csv")
+    train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
+    test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
+    logging.info("Merged with Geonames data !")
+
+    # EXTRACT rel
+    logging.info("Extracting cooccurrence relationships")
+    cpt = 0
+    for ix,row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"):
+        for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER):
+            cpt += 1
+            rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]])
+    logging.info("Extract {0} cooccurrence relationships !".format(cpt))
+
+# STORE ID to name
+geoname2name = dict(filtered["geonameid name".split()].values)
+
+# ENCODING NAME USING N-GRAM SPLITTING
+logging.info("Encoding toponyms to ngram...")
+index = NgramIndex(NGRAM_SIZE)
+
+# Identify all ngram available
+filtered.name.apply(lambda x: index.split_and_add(x))
+if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id]
+
+geoname2encodedname = {row.geonameid: index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association
+if args.wikipedia_cooc:
+    geoname2encodedname.update({v: index.encode(k) for k,v in wikipediatitle_id.items()})
+
+# SAVE THE INDEX TO REUSE THE MODEL
+index.save(INDEX_FN)
+logging.info("Done !")
+
+#############################################################################################
+################################# ENCODE COORDINATES ########################################
+#############################################################################################
+
+# Encode each geonames entry coordinates
+geoname_vec = {row.geonameid: zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()}
+# CLEAR RAM
+del filtered
+
+EMBEDDING_DIM = 256
+num_words = len(index.index_ngram) # necessary for the embedding matrix
+
+logging.info("Preparing Input and Output data...")
+
+#############################################################################################
+################################# BUILD TRAIN/TEST DATASETS #################################
+#############################################################################################
+X_1_train,X_2_train,y_lat_train,y_lon_train = [],[],[],[]
+X_1_test,X_2_test,y_lat_test,y_lon_test = [],[],[],[]
+y_train,y_test = [],[]
+
+for couple in rel_store:
+    geonameId_1,geonameId_2 = couple[0],couple[1]
+    if not geonameId_1 in geoname2encodedname:
+        continue
+    top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
+    if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
+        X_1_train.append(top1)
+        X_2_train.append(top2)
+        y_train.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]])
+        #y_lon_train.append(geoname_vec[geonameId_1][0])
+        #y_lat_train.append(geoname_vec[geonameId_1][1])
+    else:
+        X_1_test.append(top1)
+        X_2_test.append(top2)
+        y_test.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]])
+        #y_lon_test.append(geoname_vec[geonameId_1][0])
+        #y_lat_test.append(geoname_vec[geonameId_1][1])
+
+# NUMPYZE inputs and output lists
+X_1_train = np.array(X_1_train)
+X_2_train = np.array(X_2_train)
+y_lat_train = np.array(y_lat_train)
+y_lon_train = np.array(y_lon_train)
+y_train = np.array(y_train)
+
+X_1_test = np.array(X_1_test)
+X_2_test = np.array(X_2_test)
+y_lat_test = np.array(y_lat_test)
+y_lon_test = np.array(y_lon_test)
+y_test = np.array(y_test)
+
+logging.info("Data prepared !")
-
-# check for output dir
-if not os.path.exists("outputs/"):
-    os.makedirs("outputs/")
 
 #############################################################################################
 ################################# NGRAM EMBEDDINGS ##########################################
 #############################################################################################
 
-embedding_weights = load_embedding(args.embedding_fn)
-EMBEDDING_DIM = len(embedding_weights[0])
+logging.info("Generating N-GRAM Embedding...")
+embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim=EMBEDDING_DIM,iter=WORDVEC_ITER)
+logging.info("Embedding generated !")
 
 #############################################################################################
 ################################# MODEL DEFINITION ##########################################
 #############################################################################################
 
+from keras import regularizers
+####
 input_1 = Input(shape=(index.max_len,))
 input_2 = Input(shape=(index.max_len,))
 
-embedding_layer = Embedding(num_words, EMBEDDING_DIM, input_length=index.max_len,trainable=False)#, trainable=True)
+embedding_layer = Embedding(num_words, EMBEDDING_DIM, input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
 
 x1 = embedding_layer(input_1)
 x2 = embedding_layer(input_2)
 
 # Each LSTM learn on a permutation of the input toponyms
-biLSTM = Bidirectional(GRU(128,activation="pentanh", recurrent_activation="pentanh"))
-x1 = biLSTM(x1)
-x2 = biLSTM(x2)
+x1 = Bidirectional(LSTM(98))(x1)
+x2 = Bidirectional(LSTM(98))(x2)
 
 x = concatenate([x1,x2])#,x3])
 
 x1 = Dense(500,activation="relu")(x)
-x1 = Dropout(0.3)(x1)
+# x1 = Dropout(0.3)(x1)
 x1 = Dense(500,activation="relu")(x1)
-x1 = Dropout(0.3)(x1)
+# x1 = Dropout(0.3)(x1)
 
 x2 = Dense(500,activation="relu")(x)
-x2 = Dropout(0.3)(x2)
+# x2 = Dropout(0.3)(x2)
 x2 = Dense(500,activation="relu")(x2)
-x2 = Dropout(0.3)(x2)
+# x2 = Dropout(0.3)(x2)
 
+#aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(D)
-output_lon = Dense(1,activation="sigmoid")(x1)
-output_lat = Dense(1,activation="sigmoid")(x2)
+output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
+output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
 
 output_coord = concatenate([output_lon,output_lat],name="output_coord")
+#####
 model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3
 
 model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)})
+model.summary()
+
+# model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
+# model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
 
 #############################################################################################
 ################################# TRAINING LAUNCH ###########################################
 #############################################################################################
@@ -199,10 +382,11 @@ checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=
 epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
 
-history = model.fit_generator(generator=d_train,
-    validation_data=d_test,
+history = model.fit(x=[X_1_train,X_2_train],
+    y=y_train,#[y_lon_train,y_lat_train],
     verbose=True,
+    batch_size=100,
     epochs=EPOCHS,
+    validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]),
     callbacks=[checkpoint,epoch_timer])
@@ -213,4 +397,5 @@ model.save(MODEL_OUTPUT_FN)
 
 # Erase Model Checkpoint file
 if os.path.exists(MODEL_OUTPUT_FN + ".part"):
-    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
+    import shutil
+    shutil.rmtree(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
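The v3 compile step also swaps the per-axis lat/lon accuracies for accuracy_k(ACCURACY_TOLERANCE) on the combined output. accuracy_k is likewise imported from lib.geo and not shown in this commit; a hedged sketch of such a metric factory, reusing the hypothetical haversine_km helper from the sketch near the top of this page:

import tensorflow as tf

def accuracy_k(k_km):
    # fraction of samples whose great-circle error is below k_km kilometres;
    # haversine_km is the assumed per-sample distance sketched earlier
    def accuracy_at_k(y_true, y_pred):
        return tf.reduce_mean(tf.cast(haversine_km(y_true, y_pred) < k_km, tf.float32))
    return accuracy_at_k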
parser_config/toponym_combination_embedding.json  +1 −1  (view file @ 5ea67e5e)
@@ -12,7 +12,7 @@
     {"long": "--adjacency-iteration", "type": "int", "default": 1},
     {"short": "-n", "long": "--ngram-size", "type": "int", "default": 2},
     {"long": "--ngram-word2vec-iter", "type": "int", "default": 50},
-    {"short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002},
+    {"short": "-t", "long": "--tolerance-value", "type": "float", "default": 100},
     {"short": "-e", "long": "--epochs", "type": "int", "default": 100},
     {"short": "-d", "long": "--dimension", "type": "int", "default": 256},
     {"long": "--admin_code_1", "default": "None"}
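Read together with the code change, the new default of 100 makes sense if the tolerance is now interpreted as kilometres for accuracy_k rather than as a delta in the [0,1] coordinate encoding: the old 0.002 corresponds to about 0.7° of longitude (roughly 80 km at the equator), so the two defaults are of comparable magnitude. This reading is an inference from the diff, not something stated in the commit.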