Commit 810b1dcf
authored 4 years ago by Jacques Fize
add new_version of train script
parent 196cc045
Showing 2 changed files with 218 additions and 28 deletions:

combination_embeddingsv3.py (+216, -0)
train_test_split_cooccurrence_data.py (+2, -28)
combination_embeddingsv3.py 0 → 100644 (+216, -0)
# Base module
import os

# Structure
import pandas as pd
import numpy as np

# DEEPL module
from keras.layers import Dense, Input, Embedding, concatenate, Bidirectional, LSTM, Dropout, GRU
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Lambda
import keras.backend as K
import tensorflow as tf
from lib.custom_layer import *

# Custom module
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader, MetaDataSerializer, LabelEncoder
from lib.metrics import lat_accuracy, lon_accuracy
from lib.data_generator import DataGenerator, CoOccurrences, load_embedding, Inclusion, Adjacency
from lib.geo import haversine_tf, accuracy_k, haversine_tf_1circle

# Logging
import logging

logging.getLogger('gensim').setLevel(logging.WARNING)

from helpers import EpochTimer

# LOGGING CONF
logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
    )

args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
    .parse_args()
#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())

#
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.k_value
EPOCHS = args.epochs
ADJACENCY_SAMPLING = args.adjacency_sample
COOC_SAMPLING = args.cooc_sample
WORDVEC_ITER = 50
EMBEDDING_DIM = args.dimension
BATCH_SIZE = args.batch_size
#################################################
########## FILENAME VARIABLE ####################
#################################################
# check for output dir
if not os.path.exists("outputs/"):
    os.makedirs("outputs/")

GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.inclusion_fn
ADJACENCY_REL_FILENAME = args.adjacency_fn
COOC_FN = args.wikipedia_cooc_fn

PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
    GEONAME_FN.split("/")[-1],
    EPOCHS,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE)

REL_CODE = ""
if args.adjacency:
    PREFIX_OUTPUT_FN += "_A"
    REL_CODE += "A"
if args.inclusion:
    PREFIX_OUTPUT_FN += "_I"
    REL_CODE += "I"
if args.wikipedia_cooc:
    PREFIX_OUTPUT_FN += "_C"
    REL_CODE += "C"

MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)

meta_data = MetaDataSerializer(
    DATASET_NAME,
    REL_CODE,
    COOC_SAMPLING,
    ADJACENCY_SAMPLING,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE,
    EPOCHS,
    EMBEDDING_DIM,
    WORDVEC_ITER,
    INDEX_FN,
    MODEL_OUTPUT_FN,
    HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
### PUT DATASRC + GENERATOR

index = NgramIndex.load(args.ngram_index_fn)

train_src = []
test_src = []

class_encoder = LabelEncoder()

if args.wikipedia_cooc:
    train_src.append(CoOccurrences(COOC_FN + "_train.csv", class_encoder, sampling=4, use_healpix=False))
    test_src.append(CoOccurrences(COOC_FN + "_test.csv", class_encoder, sampling=4, use_healpix=False))

if args.adjacency:
    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv", GEONAME_FN, sampling=ADJACENCY_SAMPLING, gzip=False)
    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv", GEONAME_FN, sampling=ADJACENCY_SAMPLING, gzip=False)
    train_src.append(a_train)
    test_src.append(a_test)

if args.inclusion:
    i_train = Inclusion(GEONAME_FN, GEONAMES_HIERARCHY_FN + "_train.csv")
    i_test = Inclusion(GEONAME_FN, GEONAMES_HIERARCHY_FN + "_test.csv")
    train_src.append(i_train)
    test_src.append(i_test)
#Adjacency

print("Number of classes:", class_encoder.get_num_classes())

d_train = DataGenerator(train_src, index, class_encoder, batch_size=BATCH_SIZE)
d_test = DataGenerator(test_src, index, class_encoder, batch_size=BATCH_SIZE)

num_words = len(index.index_ngram)
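
# --- Hypothetical inspection snippet, not part of the commit. It assumes
# DataGenerator is a keras.utils.Sequence whose batches pair the two toponym
# n-gram matrices with target coordinates, as the two-input model defined
# below suggests; the batch layout is a guess.
# [X1, X2], y = d_train[0]
# print(X1.shape, X2.shape, y.shape)  # expected: (BATCH_SIZE, index.max_len) twice, (BATCH_SIZE, 2)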
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################

embedding_weights = load_embedding(args.embedding_fn)
EMBEDDING_DIM = len(embedding_weights[0])
#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################

from keras import regularizers

####

input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))

# NB: embedding_weights is loaded above but never passed to the layer
# (e.g. via weights=[embedding_weights]), so this frozen embedding keeps
# its random initialization.
embedding_layer = Embedding(num_words, EMBEDDING_DIM, input_length=index.max_len, trainable=False)#, trainable=True)

x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)

# A shared bidirectional recurrent encoder (a GRU here, despite the variable name)
# processes each of the two input toponyms
biLSTM = Bidirectional(GRU(128, activation="pentanh", recurrent_activation="pentanh"))
x1 = biLSTM(x1)
x2 = biLSTM(x2)

x = concatenate([x1, x2])#,x3])

# Two parallel dense towers on the shared representation:
# x1 feeds the longitude head, x2 the latitude head
x1 = Dense(500, activation="relu")(x)
x1 = Dropout(0.3)(x1)
x1 = Dense(500, activation="relu")(x1)
x1 = Dropout(0.3)(x1)

x2 = Dense(500, activation="relu")(x)
x2 = Dropout(0.3)(x2)
x2 = Dense(500, activation="relu")(x2)
x2 = Dropout(0.3)(x2)

#aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(D)

# Sigmoid heads constrain each output to (0, 1), so the targets are
# presumably coordinates normalized to that range
output_lon = Dense(1, activation="sigmoid")(x1)
output_lat = Dense(1, activation="sigmoid")(x2)

output_coord = concatenate([output_lon, output_lat], name="output_coord")
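
# --- "pentanh" is not a built-in Keras activation; it is presumably
# registered by the wildcard import from lib.custom_layer. A minimal sketch
# of such a registration, assuming pentanh is the penalized tanh, which
# damps negative activations; only the registration mechanism is standard Keras.
# import keras.backend as K
# from keras.utils.generic_utils import get_custom_objects
#
# def pentanh(x):
#     t = K.tanh(x)
#     # scale factor is 1.0 where x > 0 and 0.25 elsewhere (penalized tanh)
#     return t * (0.75 * K.cast(K.greater(x, 0), K.floatx()) + 0.25)
#
# get_custom_objects().update({"pentanh": pentanh})
#
# With this in place, string lookups such as activation="pentanh" resolve
# through Keras's custom-object registry, including inside the GRU above.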
#####
model = Model(inputs=[input_1, input_2], outputs=output_coord)#input_3

model.compile(loss={"output_coord": haversine_tf_1circle}, optimizer='adam',
              metrics={"output_coord": accuracy_k(ACCURACY_TOLERANCE)})

model.summary()
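
# --- The loss is a great-circle distance computed on the network outputs.
# Hypothetical sketch of lib.geo.haversine_tf_1circle (not shown in this
# commit), assuming each row of y_true/y_pred is a (lon, lat) pair squashed
# to [0, 1] by the sigmoid heads; the rescaling convention is a guess, only
# the haversine formula itself is standard.
# import math
#
# EARTH_RADIUS_KM = 6371.0
#
# def haversine_tf_1circle(y_true, y_pred):
#     def to_radians(coords):
#         # undo the assumed [0, 1] normalization, then convert to radians
#         lon = (coords[:, 0] * 360.0 - 180.0) * math.pi / 180.0
#         lat = (coords[:, 1] * 180.0 - 90.0) * math.pi / 180.0
#         return lon, lat
#
#     lon1, lat1 = to_radians(y_true)
#     lon2, lat2 = to_radians(y_pred)
#     a = (tf.sin((lat2 - lat1) / 2.0) ** 2
#          + tf.cos(lat1) * tf.cos(lat2) * tf.sin((lon2 - lon1) / 2.0) ** 2)
#     # per-sample distance in km; Keras averages this over the batch
#     return 2.0 * EARTH_RADIUS_KM * tf.asin(tf.sqrt(a))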
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################

checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
    save_best_only=True, mode='auto', period=1)

epoch_timer = EpochTimer("outputs/" + PREFIX_OUTPUT_FN + "_epoch_timer_output.csv")

history = model.fit_generator(generator=d_train,
    validation_data=d_test,
    verbose=True,
    epochs=EPOCHS,
    callbacks=[checkpoint, epoch_timer])

hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)

model.save(MODEL_OUTPUT_FN)

# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
    os.remove(MODEL_OUTPUT_FN + ".part")
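
EpochTimer comes from the project's helpers module and is not part of this diff. A plausible sketch, assuming it is a Keras callback that writes per-epoch wall-clock durations to the given CSV file:

# Hypothetical sketch of helpers.EpochTimer; the CSV columns are guesses.
import csv
import time
from keras.callbacks import Callback

class EpochTimer(Callback):
    def __init__(self, output_fn):
        super(EpochTimer, self).__init__()
        self.output_fn = output_fn
        with open(self.output_fn, "w") as f:
            csv.writer(f).writerow(["epoch", "duration_s"])

    def on_epoch_begin(self, epoch, logs=None):
        self.start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        with open(self.output_fn, "a") as f:
            csv.writer(f).writerow([epoch, time.time() - self.start])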
train_test_split_cooccurrence_data.py (+2, -28)
@@ -14,7 +14,7 @@ logging.basicConfig(
 from sklearn.model_selection import train_test_split
 from shapely.geometry import Point
-from lib.geo import Grid, latlon2healpix
+from lib.geo import latlon2healpix
 from tqdm import tqdm
@@ -27,33 +27,10 @@ args = parser.parse_args()#("data/wikipedia/cooccurrence_FR.txt".split())#("data
 # LOAD DATAgeopandas
 COOC_FN = args.cooccurrence_file

 logging.info("Load Cooc DATA data...")
 cooc_data = pd.read_csv(COOC_FN, sep="\t").fillna("")
 # cooc_data["geometry"] = cooc_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
 # cooc_data = gpd.GeoDataFrame(cooc_data)
 logging.info("Cooc data loaded!")

-# # World Shape bounds
-# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
-# world["nn"] = 1
-# dissolved = world.dissolve(by="nn").iloc[0].geometry
-# #Creating Grid
-# logging.info("Initializing Grid (360,180)...")
-# g = Grid(*dissolved.bounds,[360,180])
-# logging.info("Fit Data to the Grid...")
-# g.fit_data(cooc_data)
-# logging.info("Placing place into the grid...")
-# [g+(row.title,row.latitude,row.longitude) for ix,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))]
-# #ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME
-# logging.info("Associate a cell number to each place in the Geoname Dataframe")
-# def foo(g,id_):
-#     for ix,cell in enumerate(g.cells):
-#         if id_ in cell.list_object:
-#             return ix
+cooc_data["cat"] = cooc_data.apply(lambda x: latlon2healpix(x.latitude, x.longitude, 64), axis=1)
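
latlon2healpix lives in lib.geo and is not shown in this diff. A sketch of the idea using healpy, assuming the third argument (64 here) is the HEALPix nside parameter:

# Hypothetical equivalent of lib.geo.latlon2healpix built on healpy
import healpy as hp
import numpy as np

def latlon2healpix(lat, lon, nside):
    theta = np.radians(90.0 - lat)   # colatitude, as ang2pix expects
    phi = np.radians(lon % 360.0)    # longitude folded into [0, 2*pi)
    return hp.ang2pix(nside, theta, phi)

At nside=64 the sphere is divided into 12 * 64^2 = 49152 cells, so the "cat" column gives each co-occurrence pair a coarse spatial class.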
@@ -79,12 +56,9 @@ for i in np.unique(cooc_data.cat.values):
     except Exception as e:
         print(e)
         #print("Error",len(filtered[filtered.cat == i]))

-# del X_train["geometry"]
-# del X_train["nn"]
 del X_train["cat"]
 del X_test["cat"]
-# del X_test["geometry"]
-# del X_test["nn"]

 # SAVING THE DATA
 logging.info("Saving Output !")

 suffix = ""