Commit 7496fba4 authored by Jacques Fize

Cleaning and organizing the code + add a function to quickly compute every available adjacency relationship

parent 46f8b391
@@ -14,7 +14,6 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
 lib64/
 parts/
 sdist/
...
@@ -12,6 +12,8 @@ import geopandas as gpd
 from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM
 from keras.models import Model
 from keras import backend as K
+from keras.callbacks import ModelCheckpoint
 import tensorflow as tf

 # Geometry
@@ -19,31 +21,15 @@ from shapely.geometry import Point
 # Custom module
 from helpers import read_geonames
-from utils import Grid
-from utils import zero_one_encoding, NgramIndex,ConfigurationReader
-from metrics import lat_accuracy,lon_accuracy
+from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
+from lib.ngram_index import NgramIndex
+from lib.utils import ConfigurationReader
+from lib.metrics import lat_accuracy,lon_accuracy

 # Logging
 from tqdm import tqdm
 import logging
-from helpers import Chronometer
+from helpers import parse_title_wiki

-def parse_title_wiki(title_wiki):
-    """
-    Parse Wikipedia title
-
-    Parameters
-    ----------
-    title_wiki : str
-        wikipedia title
-
-    Returns
-    -------
-    str
-        parsed wikipedia title
-    """
-    return re.sub("\(.*\)","",title_wiki).strip().lower()

 def get_new_ids(cooc_data,id_first_value):
     """
@@ -74,96 +60,122 @@ def get_new_ids(cooc_data,id_first_value):
         topo_id[id_]=interlink
     return topo_id
 # LOGGING CONF
 logging.basicConfig(
     format='[%(asctime)s][%(levelname)s] %(message)s ',
     datefmt='%m/%d/%Y %I:%M:%S %p',
     level=logging.INFO
     )
-chrono = Chronometer()

 args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
-    .parse_args()#("-n 4 -t 0.002 -e 20 -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
+    .parse_args()#("-i -e 5 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())

-# Initialisee CONSTANTS
-GEONAME_FN = args.geoname_input
-GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
+#
+#################################################
+############# MODEL TRAINING PARAMETER ##########
+#################################################
 NGRAM_SIZE = args.ngram_size
 ACCURACY_TOLERANCE = args.tolerance_value
 EPOCHS = args.epochs
 ITER_ADJACENCY = args.adjacency_iteration
-COOC_SAMPLING_NUMBER = 3
-WORDVEC_ITER = 50
+COOC_SAMPLING_NUMBER = args.cooc_sample_size
+WORDVEC_ITER = args.ngram_word2vec_dim
+
+#################################################
+########## FILENAME VARIABLE ####################
+#################################################
+GEONAME_FN = args.geoname_input
+GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
+REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
+ADJACENCY_REL_FILENAME = "../data/geonamesData/{0}_{1}{2}adjacency.json".format(
+    GEONAME_FN.split("/")[-1],
+    ITER_ADJACENCY,
+    REGION_SUFFIX_FN)
+
+COOC_FN = "../data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1]
+
+PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
+    GEONAME_FN.split("/")[-1],
+    EPOCHS,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE,
+    REGION_SUFFIX_FN)

-# check for output dir
-if not os.path.exists("outputs/"):
-    os.makedirs("outputs/")
+if args.adjacency:
+    PREFIX_OUTPUT_FN += "_A"
+if args.inclusion:
+    PREFIX_OUTPUT_FN += "_I"
+if args.wikipedia_cooc:
+    PREFIX_OUTPUT_FN += "_C"
+
+MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
+INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
+
+#############################################################################################
+################################# LOAD DATA #################################################
+#############################################################################################
 # LOAD Geonames DATA
 logging.info("Load Geonames data...")
 geoname_data = read_geonames(GEONAME_FN).fillna("")
-hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("")
-train_indices,test_indices = pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values, pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values
-train_indices,test_indices = set(train_indices),set(test_indices)
+train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values)
+test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values)
 logging.info("Geonames data loaded!")

 # SELECT ENTRY with class == to A and P (Areas and Populated Places)
 filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
-#CLEAR RAM
-del geoname_data

-# IF REGION (ONLY FR for now !)
-admin_id_authorised_auth = "1 2 3 4 5 6 11 24 27 28 32 44 52 53 75 76 84 93 94".split()
-region_fn = "" if args.admin_code_1 == None else "_"+args.admin_code_1
-if args.admin_code_1 != None and args.admin_code_1 in admin_id_authorised_auth:
+# IF REGION
+if args.admin_code_1 != "None":
     filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()

-# REDUCE DATA STORED
+# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
 filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
+bounds = get_bounds(filtered) # Required to get adjacency relationships

-# Geometry operation
-filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
-filtered = gpd.GeoDataFrame(filtered)
-filtered["i"]=1
-bounds = filtered.dissolve("i").bounds.values[0] # Required to get adjacency relationships
+#############################################################################################
+################################# RETRIEVE RELATIONSHIPS ####################################
+#############################################################################################

+# INITIALIZE RELATION STORE
 rel_store = []

+# Retrieve adjacency relationships
 if args.adjacency:
-    # RETRIEVE ADJACENCY REL
     logging.info("Retrieve adjacency relationships ! ")
-    fn = "data/geonamesData/{0}_{1}{2}adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY,region_fn)
-    if not os.path.exists(fn):
-        g = Grid(*bounds,[360,180])
-        g.fit_data(filtered)
-        [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))]
-        rel_store.extend([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)])
-        json.dump(rel_store,open(fn,'w'))
+    if not os.path.exists(ADJACENCY_REL_FILENAME):
+        rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
+        json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
     else:
         logging.info("Open and load data from previous computation!")
-        rel_store=[[int(couple[0]),int(couple[1])] for couple in json.load(open(fn))]
+        rel_store=json.load(open(ADJACENCY_REL_FILENAME))
     logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store)))

+# Retrieve inclusion relationships
 if args.inclusion:
-    # RETRIEVE INCLUSION RELATIONSHIPS
     logging.info("Retrieve inclusion relationships ! ")
-    geonamesIDS = set(filtered.geonameid.values)
-    filter_mask = (hierarchy_data.childId.isin(geonamesIDS) & hierarchy_data.parentId.isin(geonamesIDS))
-    rel_store.extend((hierarchy_data[filter_mask]["childId parentId".split()].values.tolist()))
-    logging.info("{0} inclusion relationships retrieved ! ".format(len(hierarchy_data[filter_mask])))
+    cpt_rel = len(rel_store)
+    rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN))
+    logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel))

-del filtered["geometry"]

 if args.wikipedia_cooc:
     logging.info("Load Wikipedia Cooccurrence data and merge with geonames")
-    COOC_FN = "./data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1]
     cooc_data = pd.read_csv(COOC_FN,sep="\t")
     cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
     cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
-    id_wikipediatitle = get_new_ids(cooc_data,geoname_data.geonameid.max())
+    id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max())
     wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
     title_coord = {row.title: (row.longitude,row.latitude) for _,row in cooc_data.iterrows()}
     cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
@@ -191,37 +203,30 @@ geoname2name = dict(filtered["geonameid name".split()].values)
 # ENCODING NAME USING N-GRAM SPLITTING
 logging.info("Encoding toponyms to ngram...")
 index = NgramIndex(NGRAM_SIZE)
-filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all ngram available
-if args.wikipedia_cooc:
-    [index.split_and_add(k) for k in wikipediatitle_id]
-filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding
-max_len = filtered.encode_name.apply(len).max() # Retrieve the encodings max length
-if args.wikipedia_cooc:
-    extension = {v:index.encode(k) for k,v in wikipediatitle_id.items()}
-index.max_len = int(max_len) # For Index state dump
-filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Expend encodings with size < max_len
-if args.wikipedia_cooc:
-    extension = {k:index.complete(v,max_len) for k,v in extension.items()}
-geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) #init a dict with the 'geonameid' --> 'encoded toponym' association
+
+# Identify all ngram available
+filtered.name.apply(lambda x : index.split_and_add(x))
+if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id]
+
+geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association

 if args.wikipedia_cooc:
-    geoname2encodedname.update(extension)
+    geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()})

+# SAVE THE INDEX TO REUSE THE MODEL
+index.save(INDEX_FN)

 logging.info("Done !")

-#CLEAR RAM
-del hierarchy_data
-del geoname_data
+#############################################################################################
+################################# ENCODE COORDINATES ########################################
+#############################################################################################

 # Encode each geonames entry coordinates
-filtered["cell_vec"]=filtered.apply(
-    lambda x : zero_one_encoding(x.longitude,x.latitude),
-    axis=1
-    )
-geoname_vec = dict(filtered["geonameid cell_vec".split()].values)
+geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()}

 # CLEAR RAM
 del filtered
@@ -231,14 +236,17 @@ num_words = len(index.index_ngram) # necessary for the embedding matrix
 logging.info("Preparing Input and Output data...")

+#############################################################################################
+################################# BUILD TRAIN/TEST DATASETS #################################
+#############################################################################################

 X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[]
 X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[]
+cpt=0

 for couple in rel_store:
     geonameId_1,geonameId_2 = couple[0],couple[1]
     if not geonameId_1 in geoname2encodedname:
+        cpt+=1
         continue
     top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
     if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
@@ -270,29 +278,28 @@ y_lon_test = np.array(y_lon_test)
 logging.info("Data prepared !")

-# OUTPUT FN BASE
-name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn)
-if args.adjacency:
-    name += "_A"
-if args.inclusion:
-    name += "_I"
-if args.wikipedia_cooc:
-    name += "_C"
-
-index.save("outputs/"+name+"_index")
+# check for output dir
+if not os.path.exists("outputs/"):
+    os.makedirs("outputs/")

-# NGRAM EMBDEDDING
+#############################################################################################
+################################# NGRAM EMBEDDINGS ##########################################
+#############################################################################################

 logging.info("Generating N-GRAM Embedding...")
 embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=WORDVEC_ITER)
 logging.info("Embedding generated !")

-# DEEP MODEL
-name = "LSTM_"+ name
-input_1 = Input(shape=(max_len,))
-input_2 = Input(shape=(max_len,))
-embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
+#############################################################################################
+################################# MODEL DEFINITION ##########################################
+#############################################################################################
+
+input_1 = Input(shape=(index.max_len,))
+input_2 = Input(shape=(index.max_len,))
+
+embedding_layer = Embedding(num_words, embedding_dim,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)

 x1 = Bidirectional(LSTM(98))(embedding_layer(input_1))
 x2 = Bidirectional(LSTM(98))(embedding_layer(input_2))

@@ -315,15 +322,29 @@ output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
 model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3

 model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})

+checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
+    save_best_only=True, mode='auto', period=1)
+
+#############################################################################################
+################################# TRAINING LAUNCH ###########################################
+#############################################################################################

 history = model.fit(x=[X_1_train,X_2_train],
     y=[y_lon_train,y_lat_train],
     verbose=True, batch_size=100,
     epochs=EPOCHS,
-    validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]))
+    validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]),
+    callbacks=[checkpoint])

 hist_df = pd.DataFrame(history.history)
-hist_df.to_csv("outputs/{0}.csv".format(name))
+hist_df.to_csv("outputs/{0}.csv".format(PREFIX_OUTPUT_FN))

-model.save("outputs/"+name+".h5")
+model.save(MODEL_OUTPUT_FN)
+
+# Erase Model Checkpoint file
+if os.path.exists(output_fn + ".part"):
+    os.remove(output_fn + ".part")
\ No newline at end of file
documentation/imgs/first_approach.png (291 KiB)
documentation/imgs/second_approach.png (447 KiB)
documentation/imgs/third_approach.png (30.4 KiB)
import os
import time
import re
import numpy as np
import pandas as pd
def read_geonames(file):
"""
Return a dataframe that contains Geonames data.
Parameters
----------
file : str
path of the Geonames Csv file
Returns
-------
pd.DataFrame
geonames data
"""
dtypes_dict = {
0: int, # geonameid
1: str, # name
2: str, # asciiname
3: str, # alternatenames
4: float, # latitude
5: float, # longitude
6: str, # feature class
7: str, # feature code
8: str, # country code
9: str, # cc2
10: str, # admin1 code
11: str, # admin2 code
12: str, # admin3 code
13: str, # admin4 code
14: int, # population
15: str, # elevation
16: int, # dem (digital elevation model)
17: str, # timezone
18: str # modification date yyyy-MM-dd
}
rename_cols = {
0:"geonameid", # geonameid
1:"name", # name
2:"asciiname", # asciiname
3:"alternatenames", # alternatenames
4:"latitude", # latitude
5:"longitude", # longitude
6:"feature_class", # feature class
7:"feature_code", # feature code
8:"country_code", # country code
9:"cc2", # cc2
10:"admin1_code", # admin1 code
11:"admin2_code", # admin2 code
12:"admin3_code", # admin3 code
13:"admin4_code", # admin4 code
14:"population", # population
15:"elevation", # elevation
16:"dem", # dem (digital elevation model)
17:"timezone", # timezone
18:"modification_date" # modification date yyyy-MM-dd
}
data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False)
data.rename(columns=rename_cols,inplace=True)
return data
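
A minimal usage sketch for read_geonames (the path and column subset are illustrative, mirroring what the training script does):

    geoname_data = read_geonames("../data/geonamesData/FR.txt").fillna("")
    places = geoname_data[geoname_data.feature_class.isin("A P".split())]
    places = places["geonameid name longitude latitude".split()]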
def parse_title_wiki(title_wiki):
"""
Parse Wikipedia title
Parameters
----------
title_wiki : str
wikipedia title
Returns
-------
str
parsed wikipedia title
"""
return re.sub("\(.*\)","",title_wiki).strip().lower()
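
For example, parse_title_wiki("Paris (France)") drops the parenthesised part and lowercases the rest, returning "paris".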
def _split(lst,n,complete_chunk_value):
"""
Split a list into chunk of n-size.
Parameters
----------
lst : list
input list
n : int
chunk size
complete_chunk_value : object
if last chunk size not equal to n, this value is used to complete it
Returns
-------
list
chunked list
"""
chunks = [lst[i:i + n] for i in range(0, len(lst), n)]
if not chunks:return chunks
if len(chunks[-1]) != n:
chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1])))
return np.array(chunks)
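
For example, _split([1, 2, 3, 4, 5], 2, 0) returns array([[1, 2], [3, 4], [5, 0]]): the last chunk is padded with complete_chunk_value so that every chunk has length n.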
class Chronometer():
def __init__(self):
self.__task_begin_timestamp = {}
def start(self,task_name):
"""
Start a new task chronometer
Parameters
----------
task_name : str
task id
Raises
------
ValueError
if a running task already exists with that name
"""
if task_name in self.__task_begin_timestamp:
raise ValueError("A running task exists with the name {0}!".format(task_name))
self.__task_begin_timestamp[task_name] = time.time()
def stop(self,task_name):
"""
Stop and return the duration of the task
Parameters
----------
task_name : str
task id
Returns
-------
float
duration of the task in seconds
Raises
------
ValueError
if no task exist with the id `task_name`
"""
if not task_name in self.__task_begin_timestamp:
raise ValueError("The {0} task does not exist!".format(task_name))
duration = time.time() - self.__task_begin_timestamp[task_name]
del self.__task_begin_timestamp[task_name]
return duration
if __name__ == "__main__":
chrono = Chronometer()
chrono.start("test")
chrono.start("test2")
time.sleep(3)
print(chrono.stop("test"))
time.sleep(3)
print(chrono.stop("test2"))
\ No newline at end of file
-# Basic import
-import math
-import argparse
-import os
-import json
-
-# Data Structure
-import numpy as np
-from shapely.geometry import Point,box
-import pandas as pd
-
-# NLP
-from nltk.tokenize import word_tokenize
-from ngram import NGram
-
-# Machine learning
-from gensim.models import Word2Vec
-
-# Visualisation and parallelisation
-
-class TokenizerCustom():
-    def __init__(self,vocab):
-        self.word_index = {vocab[i]:i for i in range(len(vocab))}
-        self.index_word = {i:vocab[i] for i in range(len(vocab))}
-        self.N = len(self.index_word)
-    def texts_to_sequences(self,listText):
-        seqs = []
-        for text in listText:
-            seqs.append([self.word_index[word] for word in word_tokenize(text) if word in self.word_index])
-        return seqs

 import geopandas as gpd
+import numpy as np
+
+from shapely.geometry import Point,box
+
 from tqdm import tqdm
+import pandas as pd, numpy as np
+from numba import njit
+from helpers import read_geonames
+from tqdm import tqdm
+from joblib import Parallel,delayed
class CoordinatesEncoder:
"""
Will be replaced by Grid in grid2.py
"""
def __init__(self, cell_size_lat=0.5, cell_size_lon=0.5):
self.min_lon = -180
self.max_lon = -(self.min_lon) # Symetric
self.min_lat = -90
self.max_lat = -(self.min_lat) # Symetric
self.ecart_lat = self.max_lat - self.min_lat
self.ecart_lon = self.max_lon - self.min_lon
self.cell_size_lat = cell_size_lat
self.cell_size_lon = cell_size_lon
self.unit_size_lat = self.ecart_lat / self.cell_size_lat
self.unit_size_lon = self.ecart_lon / self.cell_size_lon
def encode(self, lat, lon):
return (
math.floor(((lat + self.max_lat) / self.ecart_lat) * self.unit_size_lat),
math.floor(((lon + self.max_lon) / self.ecart_lon) * (self.unit_size_lon))
)
def number_lat_cell(self):
return int(self.unit_size_lat)
def number_lon_cell(self):
return int(self.unit_size_lon)
def oneDimensionOutputSize(self):
return self.number_lat_cell() * self.number_lon_cell()
def vector(self, lat, lon):
lat_v, lon_v = np.zeros(self.number_lat_cell()), np.zeros(self.number_lon_cell())
new_coords = self.encode(lat, lon)
lat_v[int(new_coords[0])] = 1
lon_v[int(new_coords[1])] = 1
return lat_v, lon_v
def vector_flatten(self, lat, lon):
vec = np.zeros(self.oneDimensionOutputSize()) # 2D Dense softmax isn't possible
new_coords = self.encode(lat, lon)
pos = self.number_lat_cell() * (new_coords[0]) + new_coords[1]
vec[pos] = 1 # lon * lon size
return vec
class NgramIndex():
"""
Class used for encoding words in ngram representation
"""
def __init__(self,n):
"""
Constructor
Parameters
----------
n : int
ngram size
"""
self.ngram_gen = NGram(N=n)
self.size = n
self.ngram_index = {"":0}
self.index_ngram = {0:""}
self.cpt = 0
self.max_len = 0
def split_and_add(self,word):
"""
Split word in multiple ngram and add each one of them to the index
Parameters
----------
word : str
a word
"""
ngrams = word.lower().replace(" ","$")
ngrams = list(self.ngram_gen.split(ngrams))
[self.add(ngram) for ngram in ngrams]
+def haversine_pd(lon1, lat1, lon2, lat2):
+    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
+    dlon = lon2 - lon1
+    dlat = lat2 - lat1
+    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
+    return 6367 * 2 * np.arcsin(np.sqrt(a))

-    def add(self,ngram):
-        """
-        Add a ngram to the index
-
-        Parameters
-        ----------
-        ngram : str
-            ngram
-        """
-        if not ngram in self.ngram_index:
-            self.cpt+=1
-            self.ngram_index[ngram]=self.cpt
-            self.index_ngram[self.cpt]=ngram
-
-    def encode(self,word):
-        """
-        Return a ngram representation of a word
-
-        Parameters
-        ----------
-        word : str
-            a word
-
-        Returns
-        -------
-        list of int
-            list of ngram index
-        """
-        ngrams = word.lower().replace(" ","$")
-        ngrams = list(self.ngram_gen.split(ngrams))
-        [self.add(ng) for ng in ngrams if not ng in self.ngram_index]
-        return [self.ngram_index[ng] for ng in ngrams]
-
-    def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
-        """
-        Complete a ngram encoded version of word with void ngram. It's necessary for neural network.
-
-        Parameters
-        ----------
-        ngram_encoding : list of int
-            first encoding of a word
-        MAX_LEN : int
-            desired length of the encoding
-        filling_item : int, optional
-            ngram index you wish to use, by default 0
-
-        Returns
-        -------
-        list of int
-            list of ngram index
-        """
-        assert len(ngram_encoding) <= MAX_LEN
-        diff = MAX_LEN - len(ngram_encoding)
-        ngram_encoding.extend([filling_item]*diff)
-        return ngram_encoding
-
-    def get_embedding_layer(self,texts,dim=100,**kwargs):
"""
Return an embedding matrix for each ngram using encoded texts. Using gensim.Word2vec model.
Parameters
----------
texts : list of [list of int]
list of encoded word
dim : int, optional
embedding dimension, by default 100
Returns
-------
np.array
embedding matrix
"""
model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
N = len(self.ngram_index)
embedding_matrix = np.zeros((N,dim))
for i in range(N):
embedding_matrix[i] = model.wv[str(i)]
return embedding_matrix
def save(self,fn):
"""
Save the NgramIndex
Parameters
----------
fn : str
output filename
"""
data = {
"ngram_size": self.size,
"ngram_index": self.ngram_index,
"cpt_state": self.cpt,
"max_len_state": self.max_len
}
json.dump(data,open(fn,'w'))
@staticmethod
def load(fn):
"""
Load a NgramIndex state from a file.
Parameters
----------
fn : str
input filename
Returns
-------
NgramIndex
ngram index
Raises
------
KeyError
raised if a required field does not appear in the input file
"""
try:
data = json.load(open(fn))
except json.JSONDecodeError:
print("Data file must be a JSON")
for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]:
if not key in data:
raise KeyError("{0} field cannot be found in given file".format(key))
new_obj = NgramIndex(data["ngram_size"])
new_obj.ngram_index = data["ngram_index"]
new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()}
new_obj.cpt = data["cpt_state"]
new_obj.max_len = data["max_len_state"]
return new_obj
-def zero_one_encoding(long,lat):
-    """
-    Encode coordinates (WGS84) between 0 and 1
-
-    Parameters
-    ----------
-    long : float
-        longitude value
-    lat : float
-        latitude value
-
-    Returns
-    -------
-    float,float
-        longitude, latitude
-    """
-    return ((long + 180.0 ) / 360.0), ((lat + 90.0 ) / 180.0)

+def get_adjacent(ids,lon1, lat1, lon2, lat2,threshold):
+    dist_ = haversine_pd(lon1, lat1, lon2, lat2)
+    return ids[dist_<threshold]
+
+def get_geonames_adjacency(geoname_data,threshold):
+    return Parallel(n_jobs=-1,backend="multiprocessing")(delayed(get_adjacent)(geoname_data.geonameid.values,
+        geoname_data.longitude,
+        geoname_data.latitude,
+        row.longitude,
+        row.latitude,
+        threshold) for ix,row in tqdm(geoname_data.iterrows(),total=len(geoname_data)))
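
A small illustration of how haversine_pd and get_adjacent work together (ids and coordinates are made up for the example; Paris and Cherbourg are about 300 km apart):

    ids = np.array([1, 2])                  # illustrative identifiers
    lons = np.array([2.35, -1.62])          # approx. Paris, Cherbourg longitudes
    lats = np.array([48.85, 49.64])         # approx. Paris, Cherbourg latitudes
    close_ids = get_adjacent(ids, lons, lats, 2.35, 48.85, 400)   # ids within 400 km of Paris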
def _split(lst,n,complete_chunk_value):
"""
Split a list into chunk of n-size.
Parameters
----------
lst : list
input list
n : int
chunk size
complete_chunk_value : object
if last chunk size not equal to n, this value is used to complete it
Returns
-------
list
chunked list
"""
chunks = [lst[i:i + n] for i in range(0, len(lst), n)]
if not chunks:return chunks
if len(chunks[-1]) != n:
chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1])))
return np.array(chunks)
 def generate_couple(object_list):
     """
@@ -338,7 +85,24 @@ def _hash_couple(o1,o2):

-### GEO ADJAC BEGIN
+def zero_one_encoding(long,lat):
+    """
+    Encode coordinates (WGS84) between 0 and 1
+
+    Parameters
+    ----------
+    long : float
+        longitude value
+    lat : float
+        latitude value
+
+    Returns
+    -------
+    float,float
+        longitude, latitude
+    """
+    return ((long + 180.0 ) / 360.0), ((lat + 90.0 ) / 180.0)
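
For example, with approximate Paris coordinates, zero_one_encoding(2.35, 48.85) returns roughly (0.5065, 0.7714); this [0,1] range is what the sigmoid output layers of the model are trained to predict.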
 class Cell(object):
     """
     A cell is box placed in geeographical space.
@@ -349,8 +113,6 @@ class Cell(object):
     Parameters
     ----------
-    object : [type]
-        [description]
     upperleft_x : float
         upperleft longitude
     upperleft_y : float
@@ -410,6 +172,7 @@ class Cell(object):
     def __repr__(self):
         return "upperleft:{0}_{1}_;bottom_right:{2}_{3}".format(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y)

 class Grid(object):
     """
@@ -541,74 +304,32 @@ class Grid(object):
         """
         relationships = set([])
         for c1 in tqdm(range(len(self.cells))):
-            for i in range(random_iteration):
+            for _ in range(random_iteration):
                 for t in generate_couple(list(self.cells[c1].list_object.keys())):
                     relationships.add(_hash_couple(t[0],t[1]))

         for c1 in tqdm(range(len(self.inter_cells))):
-            for i in range(random_iteration):
+            for _ in range(random_iteration):
                 for t in generate_couple(list(self.inter_cells[c1].list_object.keys())):
                     relationships.add(_hash_couple(t[0],t[1]))

         return relationships
### GEO ADJAC END
class ConfigurationReader(object):
def __init__(self,configuration_file):
if not os.path.exists(configuration_file):
raise FileNotFoundError("'{0} file could not be found ! '".format(configuration_file))
self.configuration = json.load(open(configuration_file))
self.__argparser_desc = ("" if not "description" in self.configuration else self.configuration["description"])
self.parser = argparse.ArgumentParser(description=self.__argparser_desc)
self.parse_conf()
def parse_conf(self):
if not "args" in self.configuration:
raise argparse.ArgumentError("","No args given in the configuration file")
for dict_args in self.configuration["args"]:
if not isinstance(dict_args,dict):
raise ValueError("Args must be dictionnary")
short_command = dict_args.get("short",None)
long_command = dict_args.get("long",None)
if not short_command and not long_command:
raise ValueError("No command name was given !")
add_func_dict_= {}
if "help" in dict_args:
add_func_dict_["help"]= dict_args["help"]
if "default" in dict_args:
add_func_dict_["default"]= dict_args["default"]
if "action" in dict_args:
add_func_dict_["action"]= dict_args["action"]
if "type" in dict_args:
add_func_dict_["type"]= eval(dict_args["type"])
if "choices" in dict_args:
add_func_dict_["choices"]= dict_args["choices"]
if not (short_command and long_command):
command = (short_command if not long_command else long_command)
self.parser.add_argument(command,**add_func_dict_)
elif long_command and short_command:
self.parser.add_argument(short_command,long_command,**add_func_dict_)
def parse_args(self,input_=None):
if not input_:
return self.parser.parse_args()
return self.parser.parse_args(input_)
+def get_adjacency_rels(geodataframe,bounds,subdiv_tuple,random_iter_adjacency):
+    g = Grid(*bounds,subdiv_tuple)
+    g.fit_data()
+    [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(geodataframe["geonameid longitude latitude".split()].iterrows(),total=len(geodataframe))]
+    return [[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(random_iter_adjacency)]
+
+def get_geonames_inclusion_rel(geonames_data,geonames_hierarchy_data_fn):
+    geonames_hierarchy_data = pd.read_csv(geonames_hierarchy_data_fn,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("")
+    geonamesIDS = set(geonames_data.geonameid.values)
+    filter_mask = (geonames_hierarchy_data.childId.isin(geonamesIDS) & geonames_hierarchy_data.parentId.isin(geonamesIDS))
+    return (geonames_hierarchy_data[filter_mask]["childId parentId".split()].values.tolist())
+
+def get_bounds(geodataframe):
+    geodataframe["geometry"] = geodataframe["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
+    geodataframe = gpd.GeoDataFrame(geodataframe)
+    geodataframe["i"]=1
+    return geodataframe.dissolve("i").bounds.values[0] # Required to get adjacency relationships

-if __name__ == "__main__":
-    index = NgramIndex(3)
-    index.split_and_add("J'aime le paté")
-    encoding = index.encode("xxxyyyy")
-    index.complete(encoding,10)
\ No newline at end of file
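
A rough sketch of how these helpers chain together outside the training script (the file paths are illustrative):

    geoname_data = read_geonames("../data/geonamesData/FR.txt").fillna("")
    filtered = geoname_data["geonameid name longitude latitude".split()]
    bounds = get_bounds(filtered)                               # (min_lon, min_lat, max_lon, max_lat)
    rels = get_adjacency_rels(filtered, bounds, [360, 180], 1)  # adjacency pairs of geoname ids
    rels += get_geonames_inclusion_rel(filtered, "../data/geonamesData/hierarchy.txt")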
File moved
import json
import numpy as np
from ngram import NGram
# Machine learning
from gensim.models import Word2Vec
class NgramIndex():
"""
Class used for encoding words in ngram representation
"""
def __init__(self,n):
"""
Constructor
Parameters
----------
n : int
ngram size
"""
self.ngram_gen = NGram(N=n)
self.size = n
self.ngram_index = {"":0}
self.index_ngram = {0:""}
self.cpt = 0
self.max_len = 0
def split_and_add(self,word):
"""
Split word in multiple ngram and add each one of them to the index
Parameters
----------
word : str
a word
"""
ngrams = word.lower().replace(" ","$")
ngrams = list(self.ngram_gen.split(ngrams))
[self.add(ngram) for ngram in ngrams]
self.max_len = max(self.max_len,len(ngrams))
def add(self,ngram):
"""
Add a ngram to the index
Parameters
----------
ngram : str
ngram
"""
if not ngram in self.ngram_index:
self.cpt+=1
self.ngram_index[ngram]=self.cpt
self.index_ngram[self.cpt]=ngram
def encode(self,word):
"""
Return a ngram representation of a word
Parameters
----------
word : str
a word
Returns
-------
list of int
            list of ngram index
"""
ngrams = word.lower().replace(" ","$")
ngrams = list(self.ngram_gen.split(ngrams))
[self.add(ng) for ng in ngrams if not ng in self.ngram_index]
return self.complete([self.ngram_index[ng] for ng in ngrams],self.max_len)
def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
"""
Complete a ngram encoded version of word with void ngram. It's necessary for neural network.
Parameters
----------
ngram_encoding : list of int
first encoding of a word
MAX_LEN : int
desired length of the encoding
filling_item : int, optional
ngram index you wish to use, by default 0
Returns
-------
list of int
list of ngram index
"""
assert len(ngram_encoding) <= MAX_LEN
diff = MAX_LEN - len(ngram_encoding)
ngram_encoding.extend([filling_item]*diff)
return ngram_encoding
def get_embedding_layer(self,texts,dim=100,**kwargs):
"""
Return an embedding matrix for each ngram using encoded texts. Using gensim.Word2vec model.
Parameters
----------
texts : list of [list of int]
list of encoded word
dim : int, optional
embedding dimension, by default 100
Returns
-------
np.array
embedding matrix
"""
model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
N = len(self.ngram_index)
embedding_matrix = np.zeros((N,dim))
for i in range(N):
embedding_matrix[i] = model.wv[str(i)]
return embedding_matrix
def save(self,fn):
"""
Save the NgramIndex
Parameters
----------
fn : str
output filename
"""
data = {
"ngram_size": self.size,
"ngram_index": self.ngram_index,
"cpt_state": self.cpt,
"max_len_state": self.max_len
}
json.dump(data,open(fn,'w'))
@staticmethod
def load(fn):
"""
Load a NgramIndex state from a file.
Parameters
----------
fn : str
input filename
Returns
-------
NgramIndex
ngram index
Raises
------
KeyError
raised if a required field does not appear in the input file
"""
try:
data = json.load(open(fn))
except json.JSONDecodeError:
print("Data file must be a JSON")
for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]:
if not key in data:
raise KeyError("{0} field cannot be found in given file".format(key))
new_obj = NgramIndex(data["ngram_size"])
new_obj.ngram_index = data["ngram_index"]
new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()}
new_obj.cpt = data["cpt_state"]
new_obj.max_len = data["max_len_state"]
return new_obj
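
A short usage sketch of NgramIndex (toponyms and the output path are only examples):

    index = NgramIndex(4)
    index.split_and_add("Paris")          # register the 4-grams and update max_len
    index.split_and_add("Cherbourg")
    encodings = [index.encode("Paris"), index.encode("Cherbourg")]    # padded to index.max_len
    weights = index.get_embedding_layer(encodings, dim=100, iter=50)  # Word2Vec-based matrix
    index.save("outputs/example_index")
    index = NgramIndex.load("outputs/example_index")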
# Basic import
import math
import argparse
import os
import json
# Data Structure
import numpy as np
import geopandas as gpd
from shapely.geometry import Point,box
# NLP
from nltk.tokenize import word_tokenize
from ngram import NGram
# Visualisation and parallelisation
from tqdm import tqdm
class TokenizerCustom():
def __init__(self,vocab):
self.word_index = {vocab[i]:i for i in range(len(vocab))}
self.index_word = {i:vocab[i] for i in range(len(vocab))}
self.N = len(self.index_word)
def texts_to_sequences(self,listText):
seqs = []
for text in listText:
seqs.append([self.word_index[word] for word in word_tokenize(text) if word in self.word_index])
return seqs
class ConfigurationReader(object):
def __init__(self,configuration_file):
if not os.path.exists(configuration_file):
raise FileNotFoundError("'{0} file could not be found ! '".format(configuration_file))
self.configuration = json.load(open(configuration_file))
self.__argparser_desc = ("" if not "description" in self.configuration else self.configuration["description"])
self.parser = argparse.ArgumentParser(description=self.__argparser_desc)
self.parse_conf()
def parse_conf(self):
if not "args" in self.configuration:
raise argparse.ArgumentError("","No args given in the configuration file")
for dict_args in self.configuration["args"]:
if not isinstance(dict_args,dict):
raise ValueError("Args must be dictionnary")
short_command = dict_args.get("short",None)
long_command = dict_args.get("long",None)
if not short_command and not long_command:
raise ValueError("No command name was given !")
add_func_dict_= {}
if "help" in dict_args:
add_func_dict_["help"]= dict_args["help"]
if "default" in dict_args:
add_func_dict_["default"]= dict_args["default"]
if "action" in dict_args:
add_func_dict_["action"]= dict_args["action"]
if "type" in dict_args:
add_func_dict_["type"]= eval(dict_args["type"])
if "choices" in dict_args:
add_func_dict_["choices"]= dict_args["choices"]
if not (short_command and long_command):
command = (short_command if not long_command else long_command)
self.parser.add_argument(command,**add_func_dict_)
elif long_command and short_command:
self.parser.add_argument(short_command,long_command,**add_func_dict_)
def parse_args(self,input_=None):
if not input_:
return self.parser.parse_args()
return self.parser.parse_args(input_)
{
"description": "Toponym Combination",
"args": [
{ "short": "input", "help": "Corpus used to learn the embeddings" },
{ "short": "-g", "long": "--glove__dir", "default": "data/glove" },
{"long": "--max_sequence_length", "type":"int","default":15},
{"long": "--max_num_words", "type":"int","default":400000},
{"long": "--embedding_dimension", "type":"int","default":100},
{"long": "--batch_size", "type":"int","default":100},
{ "short": "-e", "long": "--epochs", "type": "int", "default": 100 }
]
}
\ No newline at end of file
@@ -7,8 +7,10 @@
     { "short": "-i", "long": "--inclusion", "action": "store_true" },
     { "short": "-a", "long": "--adjacency", "action": "store_true" },
     { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" },
+    { "long": "--cooc-sample-size", "type": "int", "default": 3 },
     { "long": "--adjacency-iteration", "type":"int","default":1},
     { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+    { "long": "--ngram-word2vec-dim", "type": "int", "default": 50 },
     { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
     { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
     { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
...
@@ -2,6 +2,7 @@ from keras.models import load_model
 import tensorflow as tf
 import keras.backend as K
 from utils import NgramIndex
+import numpy as np

 from tensorflow.python.keras.backend import set_session
 from tensorflow.python.keras.models import load_model
@@ -9,7 +10,41 @@ from tensorflow.python.keras.models import load_model
 sess = None
 graph = None

-from metrics import lat_accuracy,lon_accuracy
+def lat_accuracy(LAT_TOL =1/180.):
def accuracy_at_k_lat(y_true, y_pred):
"""
Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible
task for neural network to obtain the exact coordinate.
Parameters
----------
y_true : tf.Tensor
truth data
y_pred : tf.Tensor
predicted output
"""
diff = tf.abs(y_true - y_pred)
fit = tf.dtypes.cast(tf.less(diff,LAT_TOL),tf.int64)
return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64)
return accuracy_at_k_lat
def lon_accuracy(LON_TOL=1/360.):
def accuracy_at_k_lon(y_true, y_pred):
"""
Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible
task for neural network to obtain the exact coordinate.
Parameters
----------
y_true : tf.Tensor
truth data
y_pred : tf.Tensor
predicted output
"""
diff = tf.abs(y_true - y_pred)
fit = tf.dtypes.cast(tf.less(diff,LON_TOL),tf.int64)
return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64)
return accuracy_at_k_lon
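
With the default tolerances (1/180 for latitude and 1/360 for longitude, both expressed in the [0,1]-encoded space), a prediction counts as correct when it falls within about one degree of the true coordinate.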
 class Geocoder(object):
     """
@@ -21,12 +56,12 @@ class Geocoder(object):
     if you want an interactive map using leafletJS, set to True the `interactive_map` parameter of `Geocoder.plot_coord()`
     """
     def __init__(self,keras_model_fn,ngram_index_file):
-        global sess
-        global graph
-        sess = tf.compat.v1.Session()
-        graph = tf.compat.v1.get_default_graph()
-        set_session(sess)
-        self.keras_model = load_model(keras_model_fn,custom_objects={"lat_accuracy":lat_accuracy,"lon_accuracy":lon_accuracy})
+        # global sess
+        # global graph
+        # sess = tf.compat.v1.Session()
+        # graph = tf.compat.v1.get_default_graph()
+        # set_session(sess)
+        self.keras_model = load_model(keras_model_fn,custom_objects={"accuracy_at_k_lat":lat_accuracy(),"accuracy_at_k_lon":lon_accuracy()})
         self.ngram_encoder = NgramIndex.load(ngram_index_file)

     def get_coord(self,toponym,context_toponym):
@@ -34,9 +69,11 @@ class Geocoder(object):
         global graph
         p = self.ngram_encoder.complete(self.ngram_encoder.encode(toponym),self.ngram_encoder.max_len)
         c = self.ngram_encoder.complete(self.ngram_encoder.encode(context_toponym),self.ngram_encoder.max_len)
-        with sess.as_default():
-            with graph.as_default():
-                lon,lat = self.keras_model.predict([[p],[c]])
+        p = np.array(p)
+        c = np.array(c)
+        # with sess.as_default():
+        #     with graph.as_default():
+        lon,lat = self.keras_model.predict([[p],[c]])
         return lon[0][0],lat[0][0]

     def wgs_coord(self,lon,lat):
@@ -61,13 +98,19 @@ class Geocoder(object):
         ax.plot(lon,lat,marker='o', color='red', markersize=5)
         plt.show()

+geocoder = Geocoder("outputs/LSTM_FR.txt_100_4_0.002_None_A_I_C.h5","./outputs/FR.txt_100_4_0.002_None_A_I_C_index")
+top,topc = "Paris","Cherbourg"
+lon,lat = geocoder.get_coord(top,topc)
+lon,lat = geocoder.wgs_coord(lon,lat)
+geocoder.plot_coord("{0},{1}".format(top,topc),lat,lon)

 if __name__ == "__main__":
     from flask import Flask, escape, request, render_template
     app = Flask(__name__)
-    geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")
+    geocoder = Geocoder("outputs/LSTM_FR.txt_100_4_0.002_None_A_I_C.h5","./outputs/FR.txt_100_4_0.002_None_A_I_C_index")

     @app.route('/',methods=["GET"])
     def display():
...
@@ -13,7 +13,7 @@ logging.basicConfig(
 from sklearn.model_selection import train_test_split
 from shapely.geometry import Point

-from utils import Grid
+from lib.geo import Grid

 from tqdm import tqdm
...
@@ -14,7 +14,7 @@ logging.basicConfig(
 from sklearn.model_selection import train_test_split
 from shapely.geometry import Point

-from utils import Grid
+from lib.geo import Grid
 from helpers import read_geonames

 from tqdm import tqdm
...