Toponym Geocoding · Commits

Commit 1576e3e5, authored 4 years ago by Jacques Fize
Change files organisation

Parent: 6fe5e31c
Showing 3 changed files with 0 additions and 425 deletions:

combination_embeddingsv2.py  +0 −209
combination_embeddingsv3.py  +0 −216
scripts/get_all_adjacency_rel.py  +0 −0
combination_embeddingsv2.py · deleted (100644 → 0) · +0 −209
# Base module
import os

# Structure
import pandas as pd
import numpy as np

# DEEPL module
from keras.layers import Dense, Input, Embedding, concatenate, Bidirectional, LSTM, Dropout, GRU
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Lambda
import keras.backend as K
import tensorflow as tf
from lib.custom_layer import *

# Custom module
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader, MetaDataSerializer, LabelEncoder
from lib.metrics import lat_accuracy, lon_accuracy
from lib.data_generator import DataGenerator, CoOccurrences, load_embedding, Inclusion, Adjacency
from lib.geo import haversine_tf, accuracy_k, haversine_tf_1circle

# Logging
import logging
logging.getLogger('gensim').setLevel(logging.WARNING)
from helpers import EpochTimer

# LOGGING CONF
logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
)

args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
    .parse_args()
#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#

#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.k_value
EPOCHS = args.epochs
ADJACENCY_SAMPLING = args.adjacency_sample
COOC_SAMPLING = args.cooc_sample
WORDVEC_ITER = 50
EMBEDDING_DIM = args.dimension
BATCH_SIZE = args.batch_size

#################################################
########## FILENAME VARIABLE ####################
#################################################
# check for output dir
if not os.path.exists("outputs/"):
    os.makedirs("outputs/")

GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.inclusion_fn
ADJACENCY_REL_FILENAME = args.adjacency_fn
COOC_FN = args.wikipedia_cooc_fn

PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
    GEONAME_FN.split("/")[-1],
    EPOCHS,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE)

REL_CODE = ""
if args.adjacency:
    PREFIX_OUTPUT_FN += "_A"
    REL_CODE += "A"
if args.inclusion:
    PREFIX_OUTPUT_FN += "_I"
    REL_CODE += "I"
if args.wikipedia_cooc:
    PREFIX_OUTPUT_FN += "_C"
    REL_CODE += "C"

MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)

meta_data = MetaDataSerializer(
    DATASET_NAME,
    REL_CODE,
    COOC_SAMPLING,
    ADJACENCY_SAMPLING,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE,
    EPOCHS,
    EMBEDDING_DIM,
    WORDVEC_ITER,
    INDEX_FN,
    MODEL_OUTPUT_FN,
    HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))

### PUT DATASRC + GENERATOR
index = NgramIndex.load(args.ngram_index_fn)

train_src = []
test_src = []

class_encoder = LabelEncoder()

if args.wikipedia_cooc:
    train_src.append(CoOccurrences(COOC_FN + "_train.csv", class_encoder, sampling=4))
    test_src.append(CoOccurrences(COOC_FN + "_test.csv", class_encoder, sampling=4))

if args.adjacency:
    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv", GEONAME_FN, sampling=ADJACENCY_SAMPLING, gzip=False)
    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv", GEONAME_FN, sampling=ADJACENCY_SAMPLING, gzip=False)
    train_src.append(a_train)
    test_src.append(a_test)

if args.inclusion:
    i_train = Inclusion(GEONAME_FN, GEONAMES_HIERARCHY_FN + "_train.csv")
    i_test = Inclusion(GEONAME_FN, GEONAMES_HIERARCHY_FN + "_test.csv")
    train_src.append(i_train)
    test_src.append(i_test)
#Adjacency

d_train = DataGenerator(train_src, index, class_encoder, batch_size=BATCH_SIZE)
d_test = DataGenerator(test_src, index, class_encoder, batch_size=BATCH_SIZE)

num_words = len(index.index_ngram)

#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
embedding_weights = load_embedding(args.embedding_fn)
EMBEDDING_DIM = len(embedding_weights[0])

#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
from keras import regularizers

input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))

embedding_layer = Embedding(num_words, EMBEDDING_DIM, input_length=index.max_len, trainable=False) #, trainable=True)

x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)

# Each LSTM learn on a permutation of the input toponyms
biLSTM = Bidirectional(LSTM(64, activation="pentanh", recurrent_activation="pentanh"))
x1 = biLSTM(x1)
x2 = biLSTM(x2)

x = concatenate([x2, x1]) #,x3])

x1 = Dense(1000, activation="pentanh")(x)
# x1 = Dropout(0.3)(x1)
x1 = Dense(1000, activation="pentanh")(x1)
# x1 = Dropout(0.3)(x1)

x2 = Dense(1000, activation="pentanh")(x)
# x2 = Dropout(0.3)(x2)
x2 = Dense(1000, activation="pentanh")(x2)
# x2 = Dropout(0.3)(x2)

output_lon = Dense(1, activation="sigmoid", name="Output_LON")(x1)
output_lat = Dense(1, activation="sigmoid", name="Output_LAT")(x2)

output = concatenate([output_lon, output_lat], name="output_layer")

model = Model(inputs=[input_1, input_2], outputs=output) #input_3

model.compile(loss={"output_layer": haversine_tf_1circle}, optimizer='adam', metrics={"output_layer": accuracy_k(ACCURACY_TOLERANCE)})

#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
    save_best_only=True, mode='auto', period=1)

epoch_timer = EpochTimer("outputs/" + PREFIX_OUTPUT_FN + "_epoch_timer_output.csv")

history = model.fit_generator(generator=d_train,
    validation_data=d_test,
    verbose=True,
    epochs=EPOCHS,
    callbacks=[checkpoint, epoch_timer])

hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)

model.save(MODEL_OUTPUT_FN)

# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
    os.remove(MODEL_OUTPUT_FN + ".part")
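Editor's note: the script compiles with haversine_tf_1circle as the loss and accuracy_k as the metric, both imported from lib.geo, which this commit does not touch. As a rough sketch of what such a great-circle loss and within-k-kilometres metric could look like in TensorFlow; the names, the mapping of the sigmoid outputs back to degrees, and the kilometre threshold are assumptions, not the repository's actual implementation:

# Sketch only: lib.geo is not in this diff; the rescaling convention is assumed.
import numpy as np
import tensorflow as tf

EARTH_RADIUS_KM = 6371.0

def _to_radians(y):
    # Assumed convention: sigmoid outputs in [0, 1] mapped back to
    # longitude [-180, 180] and latitude [-90, 90], then to radians.
    lon = (y[:, 0] * 360.0 - 180.0) * np.pi / 180.0
    lat = (y[:, 1] * 180.0 - 90.0) * np.pi / 180.0
    return lon, lat

def haversine_distance(y_true, y_pred):
    # Great-circle distance (km) between true and predicted coordinates.
    lon1, lat1 = _to_radians(y_true)
    lon2, lat2 = _to_radians(y_pred)
    a = tf.sin((lat2 - lat1) / 2.0) ** 2 \
        + tf.cos(lat1) * tf.cos(lat2) * tf.sin((lon2 - lon1) / 2.0) ** 2
    return 2.0 * EARTH_RADIUS_KM * tf.asin(tf.sqrt(a))

def accuracy_k(k_km):
    # Fraction of predictions falling within k_km of the true point.
    def metric(y_true, y_pred):
        dist = haversine_distance(y_true, y_pred)
        return tf.reduce_mean(tf.cast(dist <= k_km, tf.float32))
    metric.__name__ = "accuracy_at_{0}km".format(k_km)
    return metric

Under that convention, minimizing the loss directly minimizes the mean geographic error in kilometres, which explains why both coordinate outputs are squashed through sigmoids rather than left unbounded.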
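EpochTimer likewise comes from helpers, outside this diff. Judging only from how it is constructed here (a single CSV output path), a minimal Keras callback with that shape might log per-epoch wall-clock time like this; the class body and column names are invented:

# Sketch only: helpers.EpochTimer is not shown in this commit.
import csv
import time
from keras.callbacks import Callback

class EpochTimer(Callback):
    def __init__(self, output_fn):
        super(EpochTimer, self).__init__()
        self.output_fn = output_fn
        with open(self.output_fn, "w", newline="") as f:
            csv.writer(f).writerow(["epoch", "duration_s"])

    def on_epoch_begin(self, epoch, logs=None):
        self.start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        # Append one row per completed epoch.
        with open(self.output_fn, "a", newline="") as f:
            csv.writer(f).writerow([epoch, time.time() - self.start])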
combination_embeddingsv3.py · deleted (100644 → 0) · +0 −216
# Base module
import os

# Structure
import pandas as pd
import numpy as np

# DEEPL module
from keras.layers import Dense, Input, Embedding, concatenate, Bidirectional, LSTM, Dropout, GRU
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Lambda
import keras.backend as K
import tensorflow as tf
from lib.custom_layer import *

# Custom module
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader, MetaDataSerializer, LabelEncoder
from lib.metrics import lat_accuracy, lon_accuracy
from lib.data_generator import DataGenerator, CoOccurrences, load_embedding, Inclusion, Adjacency
from lib.geo import haversine_tf, accuracy_k, haversine_tf_1circle

# Logging
import logging
logging.getLogger('gensim').setLevel(logging.WARNING)
from helpers import EpochTimer

# LOGGING CONF
logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
)

args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
    .parse_args()
#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#

#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.k_value
EPOCHS = args.epochs
ADJACENCY_SAMPLING = args.adjacency_sample
COOC_SAMPLING = args.cooc_sample
WORDVEC_ITER = 50
EMBEDDING_DIM = args.dimension
BATCH_SIZE = args.batch_size

#################################################
########## FILENAME VARIABLE ####################
#################################################
# check for output dir
if not os.path.exists("outputs/"):
    os.makedirs("outputs/")

GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.inclusion_fn
ADJACENCY_REL_FILENAME = args.adjacency_fn
COOC_FN = args.wikipedia_cooc_fn

PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
    GEONAME_FN.split("/")[-1],
    EPOCHS,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE)

REL_CODE = ""
if args.adjacency:
    PREFIX_OUTPUT_FN += "_A"
    REL_CODE += "A"
if args.inclusion:
    PREFIX_OUTPUT_FN += "_I"
    REL_CODE += "I"
if args.wikipedia_cooc:
    PREFIX_OUTPUT_FN += "_C"
    REL_CODE += "C"

MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)

meta_data = MetaDataSerializer(
    DATASET_NAME,
    REL_CODE,
    COOC_SAMPLING,
    ADJACENCY_SAMPLING,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE,
    EPOCHS,
    EMBEDDING_DIM,
    WORDVEC_ITER,
    INDEX_FN,
    MODEL_OUTPUT_FN,
    HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))

### PUT DATASRC + GENERATOR
index = NgramIndex.load(args.ngram_index_fn)

train_src = []
test_src = []

class_encoder = LabelEncoder()

if args.wikipedia_cooc:
    train_src.append(CoOccurrences(COOC_FN + "_train.csv", class_encoder, sampling=4, use_healpix=False))
    test_src.append(CoOccurrences(COOC_FN + "_test.csv", class_encoder, sampling=4, use_healpix=False))

if args.adjacency:
    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv", GEONAME_FN, sampling=ADJACENCY_SAMPLING, gzip=False)
    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv", GEONAME_FN, sampling=ADJACENCY_SAMPLING, gzip=False)
    train_src.append(a_train)
    test_src.append(a_test)

if args.inclusion:
    i_train = Inclusion(GEONAME_FN, GEONAMES_HIERARCHY_FN + "_train.csv")
    i_test = Inclusion(GEONAME_FN, GEONAMES_HIERARCHY_FN + "_test.csv")
    train_src.append(i_train)
    test_src.append(i_test)
#Adjacency

print("Number of classes: ", class_encoder.get_num_classes())

d_train = DataGenerator(train_src, index, class_encoder, batch_size=BATCH_SIZE)
d_test = DataGenerator(test_src, index, class_encoder, batch_size=BATCH_SIZE)

num_words = len(index.index_ngram)

#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
embedding_weights = load_embedding(args.embedding_fn)
EMBEDDING_DIM = len(embedding_weights[0])

#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
from keras import regularizers

####

input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))

embedding_layer = Embedding(num_words, EMBEDDING_DIM, input_length=index.max_len, trainable=False) #, trainable=True)

x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)

# Each LSTM learn on a permutation of the input toponyms
biLSTM = Bidirectional(GRU(128, activation="pentanh", recurrent_activation="pentanh"))
x1 = biLSTM(x1)
x2 = biLSTM(x2)

x = concatenate([x1, x2]) #,x3])

x1 = Dense(500, activation="relu")(x)
x1 = Dropout(0.3)(x1)
x1 = Dense(500, activation="relu")(x1)
x1 = Dropout(0.3)(x1)

x2 = Dense(500, activation="relu")(x)
x2 = Dropout(0.3)(x2)
x2 = Dense(500, activation="relu")(x2)
x2 = Dropout(0.3)(x2)

#aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(D)

output_lon = Dense(1, activation="sigmoid")(x1)
output_lat = Dense(1, activation="sigmoid")(x2)

output_coord = concatenate([output_lon, output_lat], name="output_coord")

#####

model = Model(inputs=[input_1, input_2], outputs=output_coord) #input_3

model.compile(loss={"output_coord": haversine_tf_1circle}, optimizer='adam', metrics={"output_coord": accuracy_k(ACCURACY_TOLERANCE)})

model.summary()

#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
    save_best_only=True, mode='auto', period=1)

epoch_timer = EpochTimer("outputs/" + PREFIX_OUTPUT_FN + "_epoch_timer_output.csv")

history = model.fit_generator(generator=d_train,
    validation_data=d_test,
    verbose=True,
    epochs=EPOCHS,
    callbacks=[checkpoint, epoch_timer])

hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)

model.save(MODEL_OUTPUT_FN)

# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
    os.remove(MODEL_OUTPUT_FN + ".part")
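Editor's note: both deleted scripts pass the string "pentanh" as the recurrent layers' activation; it is resolved through the wildcard import from lib.custom_layer, which is also absent from this diff. The name suggests the penalized tanh of Xu et al. (2016), tanh(x) for positive inputs and 0.25 * tanh(x) otherwise. A sketch of how such an activation could be registered so that activation="pentanh" works in Keras 2.x, under that assumption:

# Sketch only: lib.custom_layer is not in this diff; "pentanh" is assumed to
# denote the penalized tanh: tanh(x) for x > 0, 0.25 * tanh(x) otherwise.
import tensorflow as tf
from keras.layers import Activation
from keras.utils.generic_utils import get_custom_objects

def pentanh(x):
    return tf.where(x > 0.0, tf.tanh(x), 0.25 * tf.tanh(x))

# Registering under the string name lets layers accept activation="pentanh".
get_custom_objects().update({"pentanh": Activation(pentanh)})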
get_all_adjacency_rel.py → scripts/get_all_adjacency_rel.py · +0 −0
File moved.