Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
Toponym Geocoding
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Model registry
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Jacques Fize
Toponym Geocoding
Commits
c1530d9e
Commit
c1530d9e
authored
5 years ago
by
Jacques Fize
Browse files
Options
Downloads
Patches
Plain Diff
DEBUG
parent
b648cf9e
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
.gitignore
+5
-5
5 additions, 5 deletions
.gitignore
combination_embeddings.py
+18
-15
18 additions, 15 deletions
combination_embeddings.py
helpers.py
+67
-51
67 additions, 51 deletions
helpers.py
parser_config/toponym_combination_embedding.json
+1
-1
1 addition, 1 deletion
parser_config/toponym_combination_embedding.json
with
91 additions
and
72 deletions
.gitignore
+
5
−
5
View file @
c1530d9e
...
@@ -134,17 +134,17 @@ data/*
...
@@ -134,17 +134,17 @@ data/*
deprecated/*
deprecated/*
*.ipynb_checkpoints
*.ipynb_checkpoints
notebooks/*
notebooks/*
outputs
/
*
outputs*
temp/*
temp/*
WikipediaExtract/*
WikipediaExtract/*
*.DS_Store
*.DS_Store
test_comb.sh
test_comb.sh
.vscode
/
*
.vscode*
notes.md
notes.md
.idea/*
.idea*
.vscode/*
other/*
other/*
test*
test*
\ No newline at end of file
nohup.out
\ No newline at end of file
This diff is collapsed.
Click to expand it.
combination_embeddings.py
+
18
−
15
View file @
c1530d9e
...
@@ -9,7 +9,7 @@ import numpy as np
...
@@ -9,7 +9,7 @@ import numpy as np
import
geopandas
as
gpd
import
geopandas
as
gpd
# DEEPL module
# DEEPL module
from
keras.layers
import
Dense
,
Input
,
Embedding
,
concatenate
,
Bidirectional
,
LSTM
from
keras.layers
import
Dense
,
Input
,
Embedding
,
concatenate
,
Bidirectional
,
LSTM
,
Dropout
from
keras.models
import
Model
from
keras.models
import
Model
from
keras
import
backend
as
K
from
keras
import
backend
as
K
from
keras.callbacks
import
ModelCheckpoint
from
keras.callbacks
import
ModelCheckpoint
...
@@ -79,7 +79,7 @@ ACCURACY_TOLERANCE = args.tolerance_value
...
@@ -79,7 +79,7 @@ ACCURACY_TOLERANCE = args.tolerance_value
EPOCHS
=
args
.
epochs
EPOCHS
=
args
.
epochs
ITER_ADJACENCY
=
args
.
adjacency_iteration
ITER_ADJACENCY
=
args
.
adjacency_iteration
COOC_SAMPLING_NUMBER
=
args
.
cooc_sample_size
COOC_SAMPLING_NUMBER
=
args
.
cooc_sample_size
WORDVEC_ITER
=
args
.
ngram_word2vec_
dim
WORDVEC_ITER
=
args
.
ngram_word2vec_
iter
#################################################
#################################################
########## FILENAME VARIABLE ####################
########## FILENAME VARIABLE ####################
#################################################
#################################################
...
@@ -220,7 +220,7 @@ logging.info("Done !")
...
@@ -220,7 +220,7 @@ logging.info("Done !")
#############################################################################################
#############################################################################################
################################# ENCODE COORDINATES ########################################
#########
################################# ENCODE COORDINATES ########################################
#############################################################################################
#############################################################################################
...
@@ -301,20 +301,24 @@ input_2 = Input(shape=(index.max_len,))
...
@@ -301,20 +301,24 @@ input_2 = Input(shape=(index.max_len,))
embedding_layer
=
Embedding
(
num_words
,
embedding_dim
,
input_length
=
index
.
max_len
,
weights
=
[
embedding_weights
],
trainable
=
False
)
#, trainable=True)
embedding_layer
=
Embedding
(
num_words
,
embedding_dim
,
input_length
=
index
.
max_len
,
weights
=
[
embedding_weights
],
trainable
=
False
)
#, trainable=True)
x1
=
Bidirectional
(
LSTM
(
98
))(
embedding_layer
(
input_1
))
x1
=
embedding_layer
(
input_1
)
x2
=
Bidirectional
(
LSTM
(
98
))(
embedding_layer
(
input_2
))
x2
=
embedding_layer
(
input_2
)
# Each LSTM learn on a permutation of the input toponyms
x1
=
Bidirectional
(
LSTM
(
98
))(
x1
)
x2
=
Bidirectional
(
LSTM
(
98
))(
x2
)
x
=
concatenate
([
x1
,
x2
])
#,x3])
x
=
concatenate
([
x1
,
x2
])
#,x3])
x1
=
Dense
(
500
,
activation
=
"
relu
"
)(
x
)
x1
=
Dense
(
500
,
activation
=
"
relu
"
)(
x
)
#
x1 = Dropout(0.3)(x1)
x1
=
Dropout
(
0.3
)(
x1
)
x1
=
Dense
(
500
,
activation
=
"
relu
"
)(
x1
)
x1
=
Dense
(
500
,
activation
=
"
relu
"
)(
x1
)
#
x1 = Dropout(0.3)(x1)
x1
=
Dropout
(
0.3
)(
x1
)
x2
=
Dense
(
500
,
activation
=
"
relu
"
)(
x
)
x2
=
Dense
(
500
,
activation
=
"
relu
"
)(
x
)
#
x2 = Dropout(0.3)(x2)
x2
=
Dropout
(
0.3
)(
x2
)
x2
=
Dense
(
500
,
activation
=
"
relu
"
)(
x2
)
x2
=
Dense
(
500
,
activation
=
"
relu
"
)(
x2
)
#
x2 = Dropout(0.3)(x2)
x2
=
Dropout
(
0.3
)(
x2
)
output_lon
=
Dense
(
1
,
activation
=
"
sigmoid
"
,
name
=
"
Output_LON
"
)(
x1
)
output_lon
=
Dense
(
1
,
activation
=
"
sigmoid
"
,
name
=
"
Output_LON
"
)(
x1
)
output_lat
=
Dense
(
1
,
activation
=
"
sigmoid
"
,
name
=
"
Output_LAT
"
)(
x2
)
output_lat
=
Dense
(
1
,
activation
=
"
sigmoid
"
,
name
=
"
Output_LAT
"
)(
x2
)
...
@@ -324,14 +328,13 @@ model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#inp
...
@@ -324,14 +328,13 @@ model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#inp
model
.
compile
(
loss
=
[
'
mean_squared_error
'
,
'
mean_squared_error
'
],
optimizer
=
'
adam
'
,
metrics
=
{
"
Output_LON
"
:
lon_accuracy
(),
"
Output_LAT
"
:
lat_accuracy
()})
model
.
compile
(
loss
=
[
'
mean_squared_error
'
,
'
mean_squared_error
'
],
optimizer
=
'
adam
'
,
metrics
=
{
"
Output_LON
"
:
lon_accuracy
(),
"
Output_LAT
"
:
lat_accuracy
()})
checkpoint
=
ModelCheckpoint
(
MODEL_OUTPUT_FN
+
"
.part
"
,
monitor
=
'
loss
'
,
verbose
=
1
,
save_best_only
=
True
,
mode
=
'
auto
'
,
period
=
1
)
#############################################################################################
#############################################################################################
################################# TRAINING LAUNCH ###########################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
#############################################################################################
checkpoint
=
ModelCheckpoint
(
MODEL_OUTPUT_FN
+
"
.part
"
,
monitor
=
'
loss
'
,
verbose
=
1
,
save_best_only
=
True
,
mode
=
'
auto
'
,
period
=
1
)
history
=
model
.
fit
(
x
=
[
X_1_train
,
X_2_train
],
history
=
model
.
fit
(
x
=
[
X_1_train
,
X_2_train
],
y
=
[
y_lon_train
,
y_lat_train
],
y
=
[
y_lon_train
,
y_lat_train
],
verbose
=
True
,
batch_size
=
100
,
verbose
=
True
,
batch_size
=
100
,
...
@@ -346,5 +349,5 @@ hist_df.to_csv("outputs/{0}.csv".format(PREFIX_OUTPUT_FN))
...
@@ -346,5 +349,5 @@ hist_df.to_csv("outputs/{0}.csv".format(PREFIX_OUTPUT_FN))
model
.
save
(
MODEL_OUTPUT_FN
)
model
.
save
(
MODEL_OUTPUT_FN
)
# Erase Model Checkpoint file
# Erase Model Checkpoint file
if
os
.
path
.
exists
(
output_fn
+
"
.part
"
):
if
os
.
path
.
exists
(
MODEL_OUTPUT_FN
+
"
.part
"
):
os
.
remove
(
output_fn
+
"
.part
"
)
os
.
remove
(
MODEL_OUTPUT_FN
+
"
.part
"
)
\ No newline at end of file
\ No newline at end of file
This diff is collapsed.
Click to expand it.
helpers.py
+
67
−
51
View file @
c1530d9e
...
@@ -21,49 +21,58 @@ def read_geonames(file):
...
@@ -21,49 +21,58 @@ def read_geonames(file):
geonames data
geonames data
"""
"""
dtypes_dict
=
{
dtypes_dict
=
{
0
:
int
,
# geonameid
0
:
int
,
# geonameid
1
:
str
,
# name
1
:
str
,
# name
2
:
str
,
# asciiname
2
:
str
,
# asciiname
3
:
str
,
# alternatenames
3
:
str
,
# alternatenames
4
:
float
,
# latitude
4
:
float
,
# latitude
5
:
float
,
# longitude
5
:
float
,
# longitude
6
:
str
,
# feature class
6
:
str
,
# feature class
7
:
str
,
# feature code
7
:
str
,
# feature code
8
:
str
,
# country code
8
:
str
,
# country code
9
:
str
,
# cc2
9
:
str
,
# cc2
10
:
str
,
# admin1 code
10
:
str
,
# admin1 code
11
:
str
,
# admin2 code
11
:
str
,
# admin2 code
12
:
str
,
# admin3 code
12
:
str
,
# admin3 code
13
:
str
,
# admin4 code
13
:
str
,
# admin4 code
14
:
int
,
# population
14
:
int
,
# population
15
:
str
,
# elevation
15
:
str
,
# elevation
16
:
int
,
# dem (digital elevation model)
16
:
int
,
# dem (digital elevation model)
17
:
str
,
# timezone
17
:
str
,
# timezone
18
:
str
# modification date yyyy-MM-dd
18
:
str
,
# modification date yyyy-MM-dd
}
}
rename_cols
=
{
rename_cols
=
{
0
:
"
geonameid
"
,
# geonameid
0
:
"
geonameid
"
,
# geonameid
1
:
"
name
"
,
# name
1
:
"
name
"
,
# name
2
:
"
asciiname
"
,
# asciiname
2
:
"
asciiname
"
,
# asciiname
3
:
"
alternatenames
"
,
# alternatenames
3
:
"
alternatenames
"
,
# alternatenames
4
:
"
latitude
"
,
# latitude
4
:
"
latitude
"
,
# latitude
5
:
"
longitude
"
,
# longitude
5
:
"
longitude
"
,
# longitude
6
:
"
feature_class
"
,
# feature class
6
:
"
feature_class
"
,
# feature class
7
:
"
feature_code
"
,
# feature code
7
:
"
feature_code
"
,
# feature code
8
:
"
country_code
"
,
# country code
8
:
"
country_code
"
,
# country code
9
:
"
cc2
"
,
# cc2
9
:
"
cc2
"
,
# cc2
10
:
"
admin1_code
"
,
# admin1 code
10
:
"
admin1_code
"
,
# admin1 code
11
:
"
admin2_code
"
,
# admin2 code
11
:
"
admin2_code
"
,
# admin2 code
12
:
"
admin3_code
"
,
# admin3 code
12
:
"
admin3_code
"
,
# admin3 code
13
:
"
admin4_code
"
,
# admin4 code
13
:
"
admin4_code
"
,
# admin4 code
14
:
"
population
"
,
# population
14
:
"
population
"
,
# population
15
:
"
elevation
"
,
# elevation
15
:
"
elevation
"
,
# elevation
16
:
"
dem
"
,
# dem (digital elevation model)
16
:
"
dem
"
,
# dem (digital elevation model)
17
:
"
timezone
"
,
# timezone
17
:
"
timezone
"
,
# timezone
18
:
"
modification_date
"
# modification date yyyy-MM-dd
18
:
"
modification_date
"
,
# modification date yyyy-MM-dd
}
}
data
=
pd
.
read_csv
(
file
,
sep
=
"
\t
"
,
header
=
None
,
quoting
=
3
,
dtype
=
dtypes_dict
,
na_values
=
''
,
keep_default_na
=
False
,
error_bad_lines
=
False
)
data
=
pd
.
read_csv
(
data
.
rename
(
columns
=
rename_cols
,
inplace
=
True
)
file
,
sep
=
"
\t
"
,
header
=
None
,
quoting
=
3
,
dtype
=
dtypes_dict
,
na_values
=
""
,
keep_default_na
=
False
,
error_bad_lines
=
False
,
)
data
.
rename
(
columns
=
rename_cols
,
inplace
=
True
)
return
data
return
data
...
@@ -81,10 +90,10 @@ def parse_title_wiki(title_wiki):
...
@@ -81,10 +90,10 @@ def parse_title_wiki(title_wiki):
str
str
parsed wikipedia title
parsed wikipedia title
"""
"""
return
re
.
sub
(
"
\(.*\)
"
,
""
,
title_wiki
).
strip
().
lower
()
return
re
.
sub
(
"
\(.*\)
"
,
""
,
title_wiki
).
strip
().
lower
()
def
_split
(
lst
,
n
,
complete_chunk_value
):
def
_split
(
lst
,
n
,
complete_chunk_value
):
"""
"""
Split a list into chunk of n-size.
Split a list into chunk of n-size.
...
@@ -102,17 +111,19 @@ def _split(lst,n,complete_chunk_value):
...
@@ -102,17 +111,19 @@ def _split(lst,n,complete_chunk_value):
list
list
chunked list
chunked list
"""
"""
chunks
=
[
lst
[
i
:
i
+
n
]
for
i
in
range
(
0
,
len
(
lst
),
n
)]
chunks
=
[
lst
[
i
:
i
+
n
]
for
i
in
range
(
0
,
len
(
lst
),
n
)]
if
not
chunks
:
return
chunks
if
not
chunks
:
return
chunks
if
len
(
chunks
[
-
1
])
!=
n
:
if
len
(
chunks
[
-
1
])
!=
n
:
chunks
[
-
1
].
extend
([
complete_chunk_value
]
*
(
n
-
len
(
chunks
[
-
1
])))
chunks
[
-
1
].
extend
([
complete_chunk_value
]
*
(
n
-
len
(
chunks
[
-
1
])))
return
np
.
array
(
chunks
)
return
np
.
array
(
chunks
)
class
Chronometer
():
class
Chronometer
:
def
__init__
(
self
):
def
__init__
(
self
):
self
.
__task_begin_timestamp
=
{}
self
.
__task_begin_timestamp
=
{}
def
start
(
self
,
task_name
):
def
start
(
self
,
task_name
):
"""
"""
Start a new task chronometer
Start a new task chronometer
...
@@ -127,10 +138,12 @@ class Chronometer():
...
@@ -127,10 +138,12 @@ class Chronometer():
if a running task already exists with that name
if a running task already exists with that name
"""
"""
if
task_name
in
self
.
__task_begin_timestamp
:
if
task_name
in
self
.
__task_begin_timestamp
:
raise
ValueError
(
"
A running task exists with the name {0}!
"
.
format
(
task_name
))
raise
ValueError
(
"
A running task exists with the name {0}!
"
.
format
(
task_name
)
)
self
.
__task_begin_timestamp
[
task_name
]
=
time
.
time
()
self
.
__task_begin_timestamp
[
task_name
]
=
time
.
time
()
def
stop
(
self
,
task_name
):
def
stop
(
self
,
task_name
):
"""
"""
Stop and return the duration of the task
Stop and return the duration of the task
...
@@ -150,11 +163,14 @@ class Chronometer():
...
@@ -150,11 +163,14 @@ class Chronometer():
if no task exist with the id `task_name`
if no task exist with the id `task_name`
"""
"""
if
not
task_name
in
self
.
__task_begin_timestamp
:
if
not
task_name
in
self
.
__task_begin_timestamp
:
raise
ValueError
(
"
The {0} task does not exist!
"
.
format
(
task_name
))
raise
ValueError
(
"
The {0} task does not exist!
"
.
format
(
task_name
))
duration
=
time
.
time
()
-
self
.
__task_begin_timestamp
[
task_name
]
duration
=
time
.
time
()
-
self
.
__task_begin_timestamp
[
task_name
]
del
self
.
__task_begin_timestamp
[
task_name
]
del
self
.
__task_begin_timestamp
[
task_name
]
return
duration
return
duration
if
__name__
==
"
__main__
"
:
if
__name__
==
"
__main__
"
:
chrono
=
Chronometer
()
chrono
=
Chronometer
()
chrono
.
start
(
"
test
"
)
chrono
.
start
(
"
test
"
)
...
@@ -162,4 +178,4 @@ if __name__ == "__main__":
...
@@ -162,4 +178,4 @@ if __name__ == "__main__":
time
.
sleep
(
3
)
time
.
sleep
(
3
)
print
(
chrono
.
stop
(
"
test
"
))
print
(
chrono
.
stop
(
"
test
"
))
time
.
sleep
(
3
)
time
.
sleep
(
3
)
print
(
chrono
.
stop
(
"
test2
"
))
print
(
chrono
.
stop
(
"
test2
"
))
\ No newline at end of file
This diff is collapsed.
Click to expand it.
parser_config/toponym_combination_embedding.json
+
1
−
1
View file @
c1530d9e
...
@@ -10,7 +10,7 @@
...
@@ -10,7 +10,7 @@
{
"long"
:
"--cooc-sample-size"
,
"type"
:
"int"
,
"default"
:
3
},
{
"long"
:
"--cooc-sample-size"
,
"type"
:
"int"
,
"default"
:
3
},
{
"long"
:
"--adjacency-iteration"
,
"type"
:
"int"
,
"default"
:
1
},
{
"long"
:
"--adjacency-iteration"
,
"type"
:
"int"
,
"default"
:
1
},
{
"short"
:
"-n"
,
"long"
:
"--ngram-size"
,
"type"
:
"int"
,
"default"
:
2
},
{
"short"
:
"-n"
,
"long"
:
"--ngram-size"
,
"type"
:
"int"
,
"default"
:
2
},
{
"long"
:
"--ngram-word2vec-
dim
"
,
"type"
:
"int"
,
"default"
:
50
},
{
"long"
:
"--ngram-word2vec-
iter
"
,
"type"
:
"int"
,
"default"
:
50
},
{
"short"
:
"-t"
,
"long"
:
"--tolerance-value"
,
"type"
:
"float"
,
"default"
:
0.002
},
{
"short"
:
"-t"
,
"long"
:
"--tolerance-value"
,
"type"
:
"float"
,
"default"
:
0.002
},
{
"short"
:
"-e"
,
"long"
:
"--epochs"
,
"type"
:
"int"
,
"default"
:
100
},
{
"short"
:
"-e"
,
"long"
:
"--epochs"
,
"type"
:
"int"
,
"default"
:
100
},
{
"short"
:
"-d"
,
"long"
:
"--dimension"
,
"type"
:
"int"
,
"default"
:
256
},
{
"short"
:
"-d"
,
"long"
:
"--dimension"
,
"type"
:
"int"
,
"default"
:
256
},
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment