Toponym Geocoding, commit 9d554416
Authored 5 years ago by Jacques Fize
Add a new learning script that enables working with big data
Parent: 12946ad6
Showing 3 changed files:
- combination_embeddingsv2.py: 178 additions, 0 deletions
- data_generator.py: 4 additions, 2 deletions
- parser_config/toponym_combination_embedding_v2.json: 21 additions, 0 deletions

with 203 additions and 2 deletions
combination_embeddingsv2.py (new file, mode 100644): +178, −0
```python
# Base module
import os
# Structure
import pandas as pd
# DEEPL module
from keras.layers import Dense, Input, Embedding, concatenate, Bidirectional, LSTM
from keras.models import Model
from keras.callbacks import ModelCheckpoint
# Custom module
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader, MetaDataSerializer
from lib.metrics import lat_accuracy, lon_accuracy
from data_generator import DataGenerator, CoOccurrences, load_embedding
# Logging
import logging

logging.getLogger('gensim').setLevel(logging.WARNING)

# LOGGING CONF
logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
)

args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
    .parse_args()
# ("-w -e 100 ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split())
#
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.tolerance_value
EPOCHS = args.epochs
ITER_ADJACENCY = args.adjacency_iteration
COOC_SAMPLING_NUMBER = args.cooc_sample_size
WORDVEC_ITER = args.ngram_word2vec_iter
EMBEDDING_DIM = 100
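# NB: EMBEDDING_DIM is hard-coded to 100 here, although the CLI spec below
# also defines -d/--dimension (default 256); this script does not read it.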
#################################################
########## FILENAME VARIABLE ####################
#################################################
# check for output dir
if not os.path.exists("outputs/"):
    os.makedirs("outputs/")

GEONAME_FN = "ALL"  # args.geoname_input
DATASET_NAME = "ALL"  # args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = ""  # args.geoname_hierachy_input
REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(
    GEONAME_FN, ITER_ADJACENCY, REGION_SUFFIX_FN)

COOC_FN = args.wikipedia_cooc_fn

PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
    GEONAME_FN.split("/")[-1], EPOCHS, NGRAM_SIZE, ACCURACY_TOLERANCE, REGION_SUFFIX_FN)

REL_CODE = ""
if args.adjacency:
    PREFIX_OUTPUT_FN += "_A"
    REL_CODE += "A"
if args.inclusion:
    PREFIX_OUTPUT_FN += "_I"
    REL_CODE += "I"
if args.wikipedia_cooc:
    PREFIX_OUTPUT_FN += "_C"
    REL_CODE += "C"

MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)

meta_data = MetaDataSerializer(
    DATASET_NAME,
    REL_CODE,
    COOC_SAMPLING_NUMBER,
    ITER_ADJACENCY,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE,
    EPOCHS,
    EMBEDDING_DIM,
    WORDVEC_ITER,
    INDEX_FN,
    MODEL_OUTPUT_FN,
    HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))

### PUT DATASRC + GENERATOR
index = NgramIndex.load(args.ngram_index_fn)

c_train = CoOccurrences(COOC_FN + "_train.csv", sampling=3)
c_test = CoOccurrences(COOC_FN + "_test.csv", sampling=3)

BATCH_SIZE = 1000
d_train = DataGenerator([c_train], index, batch_size=BATCH_SIZE)
d_test = DataGenerator([c_test], index, batch_size=BATCH_SIZE)

num_words = len(index.index_ngram)
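# DataGenerator subclasses keras.utils.Sequence (see the data_generator.py
# hunk below), so batches are assembled on demand from the co-occurrence CSVs
# rather than materialising the whole dataset in memory; presumably this is
# the "big data" support this commit introduces.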
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
embedding_weights = load_embedding(args.embedding_fn)

#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))

embedding_layer = Embedding(num_words, EMBEDDING_DIM,
                            input_length=index.max_len,
                            weights=[embedding_weights],
                            trainable=False)  # , trainable=True)

x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)

# Each LSTM learns on a permutation of the input toponyms
x1 = Bidirectional(LSTM(98))(x1)
x2 = Bidirectional(LSTM(98))(x2)

x = concatenate([x1, x2])  # , x3])

x1 = Dense(500, activation="relu")(x)
# x1 = Dropout(0.3)(x1)
x1 = Dense(500, activation="relu")(x1)
# x1 = Dropout(0.3)(x1)

x2 = Dense(500, activation="relu")(x)
# x2 = Dropout(0.3)(x2)
x2 = Dense(500, activation="relu")(x2)
# x2 = Dropout(0.3)(x2)

output_lon = Dense(1, activation="sigmoid", name="Output_LON")(x1)
output_lat = Dense(1, activation="sigmoid", name="Output_LAT")(x2)

model = Model(inputs=[input_1, input_2], outputs=[output_lon, output_lat])  # input_3

model.compile(loss=['mean_squared_error', 'mean_squared_error'],
              optimizer='rmsprop',
              metrics={"Output_LON": lon_accuracy(), "Output_LAT": lat_accuracy()})
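# Both outputs are sigmoid-activated, so predictions live in [0, 1]; this
# matches the data_generator.py change below, where targets are rescaled
# with zero_one_encoding.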
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part",
                             monitor='loss', verbose=1,
                             save_best_only=True, mode='auto', period=1)

history = model.fit_generator(generator=d_train,
                              validation_data=d_test,
                              verbose=True,
                              epochs=EPOCHS,
                              callbacks=[checkpoint])

hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)

model.save(MODEL_OUTPUT_FN)

# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
    os.remove(MODEL_OUTPUT_FN + ".part")
```
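The `lat_accuracy` and `lon_accuracy` names imported from `lib.metrics` are called as factories in `model.compile`, but their bodies are not part of this diff. A minimal sketch of what such a metric could look like, assuming it measures the fraction of predictions within a tolerance of the [0, 1]-encoded target (the config below defines `--tolerance-value` with default 0.002); the repository's actual implementation may differ:

```python
import keras.backend as K

def lat_accuracy(tolerance=0.002):
    """Hypothetical sketch of lib.metrics.lat_accuracy; the real code may differ.

    Returns a Keras metric: the share of samples whose predicted
    [0, 1]-encoded latitude lies within `tolerance` of the target.
    """
    def lat_acc(y_true, y_pred):
        # 1.0 where |error| <= tolerance, 0.0 elsewhere, averaged over the batch
        return K.mean(K.cast(K.abs(y_true - y_pred) <= tolerance, K.floatx()))
    return lat_acc
```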
data_generator.py: +4, −2
```diff
@@ -5,6 +5,8 @@ import keras
 import numpy as np
 import pandas as pd
+from lib.geo import zero_one_encoding
 from helpers import parse_title_wiki, read_geonames
 from gensim.models.keyedvectors import KeyedVectors
...
@@ -267,8 +269,8 @@ class DataGenerator(keras.utils.Sequence):
             return X, y
         X[i] = [self.ngram_index.encode(topo), self.ngram_index.encode(topo_context)]
-        y[i] = [longitude, latitude]
+        y[i] = [*zero_one_encoding(longitude, latitude)]
-        return X, y
+        return [X[:,0], X[:,1]], [y[:,0], y[:,1]]

     def on_epoch_end(self):
         'Updates indexes after each epoch'
...
```
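The substantive change routes the regression targets through `zero_one_encoding` and splits `X`/`y` into the two-input, two-output layout the new model expects. `zero_one_encoding` itself is not shown in this diff; given the sigmoid output layers, a plausible sketch (the actual `lib.geo.zero_one_encoding` may differ) is:

```python
def zero_one_encoding(longitude, latitude):
    """Hypothetical sketch of lib.geo.zero_one_encoding; the real code may differ.

    Rescales a WGS84 coordinate pair into [0, 1] x [0, 1], matching the
    sigmoid-activated Output_LON / Output_LAT layers.
    """
    return (longitude + 180.0) / 360.0, (latitude + 90.0) / 180.0
```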
parser_config/toponym_combination_embedding_v2.json (new file, mode 100644): +21, −0
```json
{
    "description": "Toponym Combination",
    "args": [
        { "short": "ngram_index_fn", "help": "Filepath of the NgramIndex file you want to use." },
        { "short": "embedding_fn", "help": "Filepath of the Embedding file you want to use." },
        { "short": "-v", "long": "--verbose", "action": "store_true" },
        { "short": "-i", "long": "--inclusion", "action": "store_true" },
        { "short": "-a", "long": "--adjacency", "action": "store_true" },
        { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" },
        { "long": "--wikipedia-cooc-fn", "help": "Cooccurrence data filename" },
        { "long": "--adjacency-fn", "help": "Adjacency data filename" },
        { "long": "--cooc-sample-size", "type": "int", "default": 3 },
        { "long": "--adjacency-iteration", "type": "int", "default": 1 },
        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
        { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
        { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
        { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
        { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
        { "long": "--admin_code_1", "default": "None" }
    ]
}
```
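`ConfigurationReader` (from `lib.utils`) presumably builds an `argparse` parser from this spec: entries with a bare `short` name become positionals, the rest become options. A minimal sketch of that mapping, with hypothetical file paths in the usage comment (the repository's actual `ConfigurationReader` may differ):

```python
import argparse
import json

def build_parser(config_fn):
    """Hypothetical sketch of what lib.utils.ConfigurationReader could do."""
    with open(config_fn) as f:
        config = json.load(f)
    parser = argparse.ArgumentParser(description=config["description"])
    py_types = {"int": int, "float": float}
    for arg in config["args"]:
        # Bare names ("ngram_index_fn") become positionals; dashed ones, options.
        flags = [arg[key] for key in ("short", "long") if key in arg]
        kwargs = {}
        if "help" in arg:
            kwargs["help"] = arg["help"]
        if "action" in arg:
            kwargs["action"] = arg["action"]
        if "type" in arg:
            kwargs["type"] = py_types[arg["type"]]
        if "default" in arg:
            kwargs["default"] = arg["default"]
        parser.add_argument(*flags, **kwargs)
    return parser

# Usage sketch (paths are placeholders, not from this commit):
# args = build_parser("./parser_config/toponym_combination_embedding_v2.json") \
#     .parse_args("-w --wikipedia-cooc-fn cooc/all -e 100 my_index my_embedding.npy".split())
```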