Place Embedding / Commits

Commit 640cf7b1, authored 5 years ago by Fize Jacques

Add adjacency relations to the process + faster adjacency computation using a grid

Parent: fb3e6b21

Showing 3 changed files with 182 additions and 139 deletions:

combination_embeddings.py: 39 additions, 16 deletions
parser_config/toponym_combination_embedding.json: 3 additions, 0 deletions
utils.py: 140 additions, 123 deletions

combination_embeddings.py  +39 −16

@@ -2,10 +2,12 @@
import os
import sys
from argparse import ArgumentParser
+import json

# Structure
import pandas as pd
import numpy as np
+import geopandas as gpd

# DEEPL module
from keras.preprocessing.text import Tokenizer

@@ -25,7 +27,8 @@ from shapely.geometry import Point

# Custom module
from helpers import read_geonames
-from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex, ConfigurationReader
+from utils import Grid
+from utils import zero_one_encoding, NgramIndex, ConfigurationReader

# Visualisation module

@@ -47,10 +50,7 @@ logging.basicConfig(
)

chrono = Chronometer()

-args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()
+args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args("-i -a -n 2 -t 0.002 -e 5 -m CNN data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())

GEONAME_FN = args.geoname_input
GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input

@@ -74,12 +74,35 @@ logging.info("Geonames data loaded!")

# SELECT ENTRY with class == to A and P (Areas and Populated Places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy()  # Only take area and populated places

-# RETRIEVE INCLUSION RELATIONSHIPS
-logging.info("Retrieve inclusion relationships !")
-geoname2name = dict(filtered["geonameid name".split()].values)
-filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name))
-inclusion_dict = dict(hierarchy_data[filter_mask]["childId parentId".split()].values)
-logging.info("{0} inclusion relationships retrieved !".format(len(inclusion_dict)))
+# RETRIEVE ADJACENCY
+filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude, x.latitude), axis=1)
+filtered = gpd.GeoDataFrame(filtered)
+filtered["i"] = 1
+bounds = filtered.dissolve("i").bounds.values[0]
+
+rel_dict = {}
+
+if args.adjacency:
+    fn = "{0}_adjacency.json".format(GEONAME_FN.split("/")[-1])
+    if not os.path.exists(fn):
+        g = Grid(*bounds, [360, 180])
+        g.fit_data(filtered)
+        # Assign each place to its grid cell
+        [g + (int(row.geonameid), row.latitude, row.longitude)
+         for ix, row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(), total=len(filtered))]
+        rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships()]))
+        json.dump(rel_dict, open(fn, 'w'))
+    else:
+        rel_dict.update(json.load(open(fn, 'r')))
+
+if args.inclusion:
+    # RETRIEVE INCLUSION RELATIONSHIPS
+    logging.info("Retrieve inclusion relationships !")
+    geoname2name = dict(filtered["geonameid name".split()].values)
+    filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name))
+    rel_dict.update(dict(hierarchy_data[filter_mask]["childId parentId".split()].values))
+    logging.info("{0} inclusion relationships retrieved !".format(len(hierarchy_data[filter_mask])))

# ENCODING NAME USING N-GRAM SPLITTING
logging.info("Encoding toponyms to ngram...")

@@ -113,12 +136,12 @@ logging.info("Preparing Input and Output data...")

X_1, X_2, y_lat, y_lon = [], [], [], []
X_3 = []

-for geonameId_1, geonameId_2 in inclusion_dict.items():
-    if not geonameId_2 in inclusion_dict:
+for geonameId_1, geonameId_2 in rel_dict.items():
+    if not geonameId_2 in rel_dict:
        continue
-    geonameId_3 = inclusion_dict[geonameId_2]
-    top3 = geoname2encodedname[geonameId_3]
-    X_3.append(top3)
+    geonameId_3 = rel_dict[geonameId_2]
+    # top3 = geoname2encodedname[geonameId_3]
+    # X_3.append(top3)

    top1, top2 = geoname2encodedname[geonameId_1], geoname2encodedname[geonameId_2]
    X_1.append(top1)

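When --adjacency is enabled, the computed relations are cached next to the script as "{GeoNames basename}_adjacency.json" (FR.txt_adjacency.json for the argument string hard-coded above). A minimal read-back sketch, assuming such a cache was produced by a previous run; the file name is only illustrative:

import json

# Illustrative name, following "{0}_adjacency.json".format(GEONAME_FN.split("/")[-1]).
with open("FR.txt_adjacency.json") as f:
    rel_dict = json.load(f)

# json.dump turned the integer geonameid keys into strings; the values stayed integers.
for left, right in list(rel_dict.items())[:5]:
    print(int(left), "is adjacent to", right)
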
parser_config/toponym_combination_embedding.json  +3 −0

@@ -4,9 +4,12 @@
    { "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." },
    { "short": "geoname_hierachy_input", "help": "Filepath of the Geonames file you want to use." },
    { "short": "-v", "long": "--verbose", "action": "store_true" },
+   { "short": "-i", "long": "--inclusion", "action": "store_true" },
+   { "short": "-a", "long": "--adjacency", "action": "store_true" },
+   { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
    { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
    { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
    { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
    { "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" }
  ]
}
\ No newline at end of file
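
For reference, the new -i/--inclusion, -a/--adjacency and -n/--ngram-size options surface in the script as args.inclusion, args.adjacency and the n-gram size. A minimal sketch of parsing them through ConfigurationReader, reusing the argument string now hard-coded in combination_embeddings.py (the GeoNames paths are whatever extracts are present locally):

from utils import ConfigurationReader

args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args("-i -a -n 2 -t 0.002 -e 5 -m CNN data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())

print(args.geoname_input)              # data/geonamesData/FR.txt
print(args.inclusion, args.adjacency)  # True True
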
utils.py  +140 −123

@@ -10,7 +10,7 @@ from ngram import NGram
import argparse
import os
import json
+from tqdm import tqdm

class TokenizerCustom():

@@ -24,128 +24,6 @@ class TokenizerCustom():
            seqs.append([self.word_index[word] for word in word_tokenize(text) if word in self.word_index])
        return seqs

-class CoordinatesEncoder:
-    """
-    Deprecated !
-    """
-    def __init__(self, cell_size_lat=0.5, cell_size_lon=0.5):
-        self.min_lon = -180
-        self.max_lon = -(self.min_lon)  # Symetric
-        self.min_lat = -90
-        self.max_lat = -(self.min_lat)  # Symetric
-
-        self.ecart_lat = self.max_lat - self.min_lat
-        self.ecart_lon = self.max_lon - self.min_lon
-
-        self.cell_size_lat = cell_size_lat
-        self.cell_size_lon = cell_size_lon
-
-        self.unit_size_lat = self.ecart_lat / self.cell_size_lat
-        self.unit_size_lon = self.ecart_lon / self.cell_size_lon
-
-    def encode(self, lat, lon):
-        return (
-            math.floor(((lat + self.max_lat) / self.ecart_lat) * self.unit_size_lat),
-            math.floor(((lon + self.max_lon) / self.ecart_lon) * (self.unit_size_lon))
-        )
-
-    def number_lat_cell(self):
-        return int(self.unit_size_lat)
-
-    def number_lon_cell(self):
-        return int(self.unit_size_lon)
-
-    def oneDimensionOutputSize(self):
-        return self.number_lat_cell() * self.number_lon_cell()
-
-    def vector(self, lat, lon):
-        lat_v, lon_v = np.zeros(self.number_lat_cell()), np.zeros(self.number_lon_cell())
-        new_coords = self.encode(lat, lon)
-        lat_v[int(new_coords[0])] = 1
-        lon_v[int(new_coords[1])] = 1
-        return lat_v, lon_v
-
-    def vector_flatten(self, lat, lon):
-        vec = np.zeros(self.oneDimensionOutputSize())  # 2D Dense softmax isn't possible
-        new_coords = self.encode(lat, lon)
-        pos = self.number_lat_cell() * (new_coords[0]) + new_coords[1]
-        vec[pos] = 1  # lon * lon size
-        return vec
-
-class Quadtree(object):
-    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y, precision=10, curr_prec=0):
-        self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y = upperleft_x, upperleft_y, bottomright_x, bottomright_y
-
-        self.precision = precision
-        x_r = abs(self.bottomright_x - self.upperleft_x) / 2
-        y_r = abs(self.upperleft_y - self.bottomright_y) / 2
-
-        # if abs(self.bottomright_x - self.upperleft_x) <= cell_size[0] or abs(self.upperleft_y - self.bottomright_y) <= cell_size[1]:
-        if curr_prec == precision:
-            self.value = ""
-        else:
-            # print(ix,x_r,y_r)  # print(x_r,y_r)
-            self.value = [
-                Quadtree(upperleft_x, upperleft_y, bottomright_x - x_r, bottomright_y + y_r, precision=self.precision, curr_prec=curr_prec + 1),
-                Quadtree(upperleft_x + x_r, upperleft_y, bottomright_x, bottomright_y + y_r, precision=self.precision, curr_prec=curr_prec + 1),
-                Quadtree(upperleft_x, upperleft_y - y_r, bottomright_x - x_r, bottomright_y, precision=self.precision, curr_prec=curr_prec + 1),
-                Quadtree(upperleft_x + x_r, upperleft_y - y_r, bottomright_x, bottomright_y, precision=self.precision, curr_prec=curr_prec + 1)
-            ]
-
-    def contains_obj(self, pos):
-        x, y = pos[0], pos[1]
-        if x < self.upperleft_x or x > self.bottomright_x:
-            return False
-        if y > self.upperleft_y or y < self.bottomright_y:
-            return False
-        return True
-
-    def binary(self, integer):
-        ch = "{0:b}".format(integer)
-        return "0" * (2 - len(ch)) + ch
-
-    def encode(self, pos):
-        if not isinstance(self.value, list):
-            return ""
-        for ix, q in enumerate(self.value):
-            if q.contains_obj(pos):
-                return self.binary(ix) + q.encode(pos)
-
-    def int_encode(self, pos):
-        return list(map(int, textwrap.wrap(self.encode(pos), 1)))
-
-    def decode(self, hash_):
-        if not len(hash_) % 2 == 0:
-            raise ValueError("Wrong Hash !")
-        q_pos = eval("0b" + hash_[:2])
-        q = self.value[q_pos]
-        if len(hash_) == 2:
-            return q.upperleft_x, q.upperleft_y, q.bottomright_x, q.bottomright_y
-        return q.decode(hash_[2:])

from keras.layers import Embedding
from gensim.models import Word2Vec

@@ -199,6 +77,145 @@ def _split(lst,n,complete_chunk_value):
    return np.array(chunks)

+def generate_couple(object_list):
+    couples = []
+    lst = np.arange(len(object_list))
+    for _ in range(len(object_list)):
+        if len(lst) == 1:
+            break
+        idx = np.random.choice(np.arange(len(lst)))
+        idx2 = np.random.choice(np.arange(len(lst)))
+        while idx2 == idx:
+            idx2 = np.random.choice(np.arange(len(lst)))
+        couples.append([object_list[lst[idx]], object_list[lst[idx2]]])
+        lst = np.delete(lst, idx)
+    return couples
+
+def _hash_couple(o1, o2):
+    return "|".join(map(str, sorted([int(o1), int(o2)])))
+
+### GEO ADJAC BEGIN
+from joblib import Parallel, delayed
+from shapely.geometry import Point, box
+
+class Cell(object):
+    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y):
+        self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y = upperleft_x, upperleft_y, bottomright_x, bottomright_y
+        self.box_ = box(self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y)
+        self.list_object = {}  # {id: Point(coord)}
+
+    def contains(self, lat, lon):
+        x, y = lon, lat
+        if x < self.upperleft_x or x > self.bottomright_x:
+            return False
+        if y < self.upperleft_y or y > self.bottomright_y:
+            return False
+        return True
+
+    def add_object(self, id_, lat, lon):
+        self.list_object[id_] = Point(lon, lat)
+
+    def __repr__(self):
+        return "upperleft:{0}_{1}_;bottom_right:{2}_{3}".format(self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y)
+
+class Grid(object):
+    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y, cell_sub_div_index=[100, 50]):
+        self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y = upperleft_x, upperleft_y, bottomright_x, bottomright_y
+
+        self.x_r = abs(self.bottomright_x - self.upperleft_x) / cell_sub_div_index[0]
+        self.y_r = abs(self.upperleft_y - self.bottomright_y) / cell_sub_div_index[1]
+
+        self.c_x_r = self.x_r / cell_sub_div_index[0]  # Redivide
+        self.c_y_r = self.y_r / cell_sub_div_index[1]
+
+        self.cells = []
+        self.inter_cells = []
+        for i in range(cell_sub_div_index[1]):
+            self.cells.append([])
+            for j in range(cell_sub_div_index[0]):
+                self.cells[-1].append(Cell(
+                    self.upperleft_x + j * self.x_r,
+                    self.upperleft_y + i * self.y_r,
+                    self.upperleft_x + ((j + 1) * self.x_r),
+                    self.upperleft_y + ((i + 1) * self.y_r),
+                ))
+        dec_y = 0
+        for i in range(cell_sub_div_index[1]):
+            self.inter_cells.append([])
+            dec_x = 0
+            for j in range(cell_sub_div_index[0]):
+                self.inter_cells[-1].append(Cell(
+                    self.upperleft_x + (j * self.x_r) - self.c_x_r,  # TOP
+                    self.upperleft_y + (i * self.y_r) - dec_y,
+                    self.upperleft_x + ((j + 1) * self.x_r) - self.c_x_r,  # (self.u_pos*self.c_x_r),
+                    self.upperleft_y + ((i + 1) * self.y_r) + self.c_y_r  # (self.u_neg*self.c_y_r),
+                ))
+                self.inter_cells[-1].append(Cell(
+                    self.upperleft_x + (j * self.x_r) - self.c_x_r,  # CENTER
+                    self.upperleft_y + (i * self.y_r) - self.c_y_r,
+                    self.upperleft_x + ((j + 1) * self.x_r) + self.c_x_r,
+                    self.upperleft_y + ((i + 1) * self.y_r) + self.c_y_r,
+                ))
+                self.inter_cells[-1].append(Cell(
+                    self.upperleft_x + (j * self.x_r) + dec_x,  # CENTER
+                    self.upperleft_y + (i * self.y_r) - self.c_y_r,
+                    self.upperleft_x + ((j + 1) * self.x_r) - self.c_x_r,  # LEFT
+                    self.upperleft_y + ((i + 1) * self.y_r) + self.c_y_r
+                ))
+                dec_x = self.c_x_r
+            dec_y = self.c_y_r
+
+    def fit_data(self, data):
+        data["nn"] = 1
+        dissolved = data.dissolve(by="nn")
+        new_cells = []
+        new_inter_cells = []
+        for i in tqdm(range(len(self.cells))):
+            for j in range(len(self.cells[i])):
+                if dissolved.intersects(self.cells[i][j].box_).all():
+                    new_cells.append(self.cells[i][j])
+                    new_inter_cells.extend(self.inter_cells[i][j * 3:(j + 1) * 3])
+        self.cells = new_cells
+        self.inter_cells = new_inter_cells
+
+    def __add__(self, a):
+        for c1 in range(len(self.cells)):
+            if self.cells[c1].contains(a[1], a[2]):
+                self.cells[c1].add_object(*a)
+        for c1 in range(len(self.inter_cells)):
+            if self.inter_cells[c1].contains(a[1], a[2]):
+                self.inter_cells[c1].add_object(*a)
+                break
+
+    def get_adjacent_relationships(self, random_iteration=10):
+        relationships = set([])
+        for c1 in tqdm(range(len(self.cells))):
+            for i in range(random_iteration):
+                for t in generate_couple(list(self.cells[c1].list_object.keys())):
+                    relationships.add(_hash_couple(t[0], t[1]))
+        for c1 in tqdm(range(len(self.inter_cells))):
+            for i in range(random_iteration):
+                for t in generate_couple(list(self.inter_cells[c1].list_object.keys())):
+                    relationships.add(_hash_couple(t[0], t[1]))
+        return relationships
+
+### GEO ADJAC END

import argparse
import os
import json

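Taken together with the changes to combination_embeddings.py above, Grid and Cell are meant to be used in three steps: build a grid over the bounds of the data, call fit_data to drop cells that do not intersect it, then feed every place in through __add__ and read off the co-located pairs with get_adjacent_relationships. A minimal sketch under those assumptions (toy ids and coordinates, and a much coarser grid than the [360, 180] used in the script):

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from utils import Grid

# Toy frame mimicking the filtered GeoNames data used in combination_embeddings.py
# (ids and coordinates are made up for illustration).
df = pd.DataFrame({
    "geonameid": [1001, 1002, 1003, 1004],
    "latitude":  [48.85, 48.86, 43.60, 50.00],
    "longitude": [2.35, 2.36, 1.44, 3.00],
})
df["geometry"] = df["longitude latitude".split()].apply(lambda x: Point(x.longitude, x.latitude), axis=1)
df = gpd.GeoDataFrame(df)

# Bounds of the dissolved data, exactly as in the script.
df["i"] = 1
bounds = df.dissolve("i").bounds.values[0]

# Coarse 10x10 grid here, so the two nearby places end up in the same cell.
g = Grid(*bounds, [10, 10])
g.fit_data(df)

# __add__ drops each place into the cell (and overlapping inter-cell) that contains it.
for _, row in df["geonameid latitude longitude".split()].iterrows():
    g + (int(row.geonameid), row.latitude, row.longitude)

# Pairs of places sharing a cell, encoded by _hash_couple as sorted "id1|id2" strings.
print(g.get_adjacent_relationships())  # expected: {'1001|1002'}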