Commit 12946ad6 authored by Jacques Fize


DEBUG + ADD DataGenerator for BigData+ Add Script for generating ngram embedding once + desamb eval script
parent 3487bb5d
@@ -148,4 +148,5 @@ notes.md
 other/*
 test*
 nohup.out
 log*
+temp*
\ No newline at end of file
@@ -161,7 +161,7 @@ if args.admin_code_1 != "None":
 # GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
 filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
-bounds = get_bounds(filtered) # Required to get adjacency relationships
 #############################################################################################
@@ -177,6 +177,7 @@ if args.adjacency:
     logging.info("Retrieve adjacency relationships ! ")
     if not os.path.exists(ADJACENCY_REL_FILENAME):
+        bounds = get_bounds(filtered) # Required to get adjacency relationships
         rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
         json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
     else:
...
import os
from gzip import GzipFile
import keras
import numpy as np
import pandas as pd
from helpers import parse_title_wiki,read_geonames
from gensim.models.keyedvectors import KeyedVectors
def wc_l(filename, gzip=True):
    """Count the number of lines in a (possibly gzipped) file."""
    f = GzipFile(filename) if gzip else open(filename)
    lc = 0
    while f.readline():
        lc += 1
    f.close()
    return lc
class SamplingProbabilities:
    """Return a weight of 1/n for an item seen n times, so frequent items are sampled less often."""
    def __init__(self):
        self.count = {}

    def get_probs(self, item):
        if item not in self.count:
            self.count[item] = 0
        self.count[item] += 1
        return 1 / self.count[item]

    def __call__(self, a):
        return self.get_probs(a)
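# Illustration (not part of the committed file): the weight shrinks each time the
# same label reappears; the weights are normalised into a probability distribution
# later, at sampling time (see Adjacency and CoOccurrences below).
#   probs = SamplingProbabilities()
#   probs("Paris"), probs("Paris"), probs("Lyon")  # -> 1.0, 0.5, 1.0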
class DataSource(object):
    """Base class for iterable training data sources."""
    def __init__(self, name, input_filename):
        self.name = name
        assert os.path.exists(input_filename)
        self.input_filename = input_filename
        self.len = 0

    def __next__(self):
        raise NotImplementedError()

    def __iter__(self):
        return self

    def __len__(self):
        return self.len

    def __reset__(self):
        raise NotImplementedError()

    def isOver(self):
        raise NotImplementedError()
class Adjacency(DataSource):
    """Iterate over (toponym, adjacent toponym, latitude, longitude) tuples from an adjacency file."""
    def __init__(self, filename, geonames_filename, sampling=3, len_=None, gzip=True):
        super().__init__("Adjacency SRC", filename)
        assert os.path.exists(geonames_filename)
        self.geonames_data_dict = {row.geonameid: row.name for row in read_geonames(geonames_filename).itertuples()}
        self.gzip = gzip
        if not self.gzip:
            self.data_src = open(self.input_filename, 'rb')
        else:
            self.data_src = GzipFile(self.input_filename, 'rb')

        # Number of samples per epoch: `sampling` neighbours drawn per input line
        nb_lines = len_ if len_ else wc_l(filename, gzip=gzip) - 1  # minus the header line
        self.len = nb_lines * sampling if sampling else nb_lines

        self.data_src.readline()  # skip header line
        self.sampling = sampling
        if self.sampling:
            self.probs_storage = SamplingProbabilities()

        self.topo = None
        self.context_topo_context = []
        self.curr_probs = None
        self.lat, self.lon = None, None

        self.i = 0
        self.is_over = False
    def __next__(self):
        if self.i >= len(self.context_topo_context):
            line = self.data_src.readline()
            if not line:
                self.is_over = True
                raise StopIteration
            line = line.decode("utf-8").rstrip("\n")
            geonameid, adjacent_geoname_id, latitude, longitude = tuple(line.split(","))
            self.topo = int(geonameid)
            self.context_topo_context = [int(x) for x in adjacent_geoname_id.split("|")]
            if self.sampling:
                # Normalise the 1/count weights so they form a probability distribution
                self.curr_probs = np.array([self.probs_storage(x) for x in self.context_topo_context], dtype=float)
                self.curr_probs /= self.curr_probs.sum()
                self.context_topo_context = np.random.choice(self.context_topo_context, self.sampling, p=self.curr_probs)
            self.lat, self.lon = float(latitude), float(longitude)
            self.i = 0

        self.i += 1
        return (self.geonames_data_dict[self.topo],
                self.geonames_data_dict[self.context_topo_context[self.i-1]],
                self.lat, self.lon)

    def __reset__(self):
        if not self.gzip:
            self.data_src = open(self.input_filename, 'rb')
        else:
            self.data_src = GzipFile(self.input_filename, 'rb')
        self.data_src.readline()  # skip header line
        self.is_over = False

    def isOver(self):
        return self.is_over
class Inclusion(DataSource):
    """Iterate over (child toponym, parent toponym, child latitude, child longitude) tuples from the Geonames hierarchy."""
    def __init__(self, geonames_filename, hierarchy_filename, mask_ids=None):
        super().__init__("Inclusion SRC", hierarchy_filename)
        assert os.path.exists(geonames_filename)
        self.geonames_data_dict = {row.geonameid: (row.name, row.latitude, row.longitude) for row in read_geonames(geonames_filename).itertuples()}
        self.data_src = pd.read_csv(self.input_filename,
                                    sep="\t",
                                    header=None,
                                    names="parentId,childId,type".split(",")
                                    ).fillna("")
        if mask_ids:
            self.data_src = self.data_src[self.data_src.childId.isin(mask_ids)]
        # Keep only pairs whose both ends are known in the Geonames data
        self.data_src = self.data_src[self.data_src.childId.isin(self.geonames_data_dict)]
        self.data_src = self.data_src[self.data_src.parentId.isin(self.geonames_data_dict)]
        self.data_src = self.data_src["childId parentId".split()].values.tolist()

        self.len = len(self.data_src)
        self.i = 0
        self.is_over = False

    def __next__(self):
        if self.i >= self.len:
            self.is_over = True
            raise StopIteration
        self.i += 1
        tup_ = tuple(self.data_src[self.i-1])
        return (self.geonames_data_dict[tup_[0]][0],
                self.geonames_data_dict[tup_[1]][0],
                self.geonames_data_dict[tup_[0]][1],
                self.geonames_data_dict[tup_[0]][2])

    def __reset__(self):
        self.i = 0
        self.is_over = False

    def isOver(self):
        return self.is_over
class CoOccurrences(DataSource):
    """Iterate over (toponym, co-occurring toponym, latitude, longitude) tuples extracted from Wikipedia."""
    def __init__(self, filename, sampling=3):
        super().__init__("Co-Occurrence data", filename)
        try:
            self.data_src = pd.read_csv(filename)
        except Exception:  # fall back to tab-separated input
            self.data_src = pd.read_csv(filename, sep="\t")
        self.data_src["title"] = self.data_src.title.apply(parse_title_wiki)
        self.data_src["interlinks"] = self.data_src.interlinks.apply(parse_title_wiki)

        self.i = 0
        self.j = 0
        self.is_over = False

        self.sampling = sampling
        self.len = len(self.data_src) * self.sampling
        if self.sampling:
            self.probs_storage = SamplingProbabilities()

        self.topo = None
        self.context_topo_context = []
        self.curr_probs = None
        self.lat, self.lon = None, None

    def __next__(self):
        if self.isOver() or self.i * self.sampling == self.len:
            self.is_over = True
            raise StopIteration
        if self.j >= len(self.context_topo_context):
            line = self.data_src.iloc[self.i]
            self.topo = line.title
            self.context_topo_context = [x for x in line.interlinks.split("|")]
            if self.sampling:
                # Normalise the 1/count weights so they form a probability distribution
                self.curr_probs = np.array([self.probs_storage(x) for x in self.context_topo_context], dtype=float)
                self.curr_probs /= self.curr_probs.sum()
                self.context_topo_context = np.random.choice(self.context_topo_context, self.sampling, p=self.curr_probs)
            self.lat, self.lon = line.latitude, line.longitude
            self.i += 1
            self.j = 0

        self.j += 1
        return (self.topo,
                self.context_topo_context[self.j-1],
                self.lat, self.lon)

    def __reset__(self):
        self.i = 0
        self.is_over = False

    def isOver(self):
        return self.is_over
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras from a list of DataSource instances'
    def __init__(self, data_sources, ngram_index, **kwargs):
        'Initialization'
        self.data_src = data_sources
        self.ngram_index = ngram_index
        self.batch_size = kwargs.get("batch_size", 1000)

        self.len = sum([len(d) for d in self.data_src])
        self.datasrc_index = 0
        #self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(self.len / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Zero-filled so a partially filled last batch does not contain uninitialised values
        X = np.zeros((self.batch_size, 2, self.ngram_index.max_len))
        y = np.zeros((self.batch_size, 2), dtype=float)

        if self.data_src[self.datasrc_index].isOver():
            self.datasrc_index += 1  # move on to the next data source
            if self.datasrc_index >= len(self.data_src):
                return X, y

        for i in range(self.batch_size):
            if self.data_src[self.datasrc_index].isOver():
                return X, y
            try:
                topo, topo_context, latitude, longitude = next(self.data_src[self.datasrc_index])
            except StopIteration:
                return X, y
            X[i] = [self.ngram_index.encode(topo), self.ngram_index.encode(topo_context)]
            y[i] = [longitude, latitude]
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        [d.__reset__() for d in self.data_src]
        self.datasrc_index = 0
def load_embedding(model_fn, dim_vector=100):
    """Build an (N, dim_vector) weight matrix from a saved gensim model whose vocabulary keys are the stringified n-gram ids 0..N-1."""
    model = KeyedVectors.load(model_fn)
    N = len(model.wv.vocab)
    M = np.zeros((N, dim_vector))
    for i in range(N):
        M[i] = model.wv[str(i)]
    return M
if __name__ == "__main__":
    # Total number of lines in the full adjacency file: 7955000 - 1 (header)
    from lib.ngram_index import NgramIndex
    from tqdm import tqdm

    ng = NgramIndex.load("../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json")
    c = CoOccurrences("../data/wikipedia/cooccurrence_FR.txt_test.csv", sampling=3)
    a = Adjacency("/home/jacques/sample_adjacency.txt", geonames_filename="../data/geonamesData/allCountries.txt", gzip=False, sampling=10)
    i = Inclusion(geonames_filename="../data/geonamesData/allCountries.txt", hierarchy_filename="../data/geonamesData/hierarchy.txt")
    d = DataGenerator([c, a, i], ng)
    for x in tqdm(range(len(d))):
        d[x]  # iterate over every batch once
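For context, here is a minimal sketch of how the generator could feed a Keras model. The single-input coordinate-regression network below is an assumption made for illustration; only DataGenerator, load_embedding and the ng/d objects built above come from this commit, and the embedding path is hypothetical.

from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dense

W = load_embedding("../data/embeddings/word2vec4gram/embedding4gramWiki+geonames.bin")  # hypothetical path
inp = Input(shape=(2, ng.max_len))                       # matches X built in DataGenerator.__getitem__
x = Embedding(W.shape[0], W.shape[1], weights=[W])(inp)  # n-gram id -> vector
x = Flatten()(x)
out = Dense(2)(x)                                        # (longitude, latitude), matching y
model = Model(inp, out)
model.compile(optimizer="adam", loss="mse")
model.fit_generator(d, epochs=1)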
 from glob import glob
-import os
+import json
-os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # No need for GPU
 import argparse
 import logging
 import pandas as pd
-from predict_toponym_coordinates import Geocoder
-from lib.geo import haversine_pd
-logging.getLogger("tensorflow").setLevel(logging.CRITICAL)
-logging.getLogger("tensorflow_hub").setLevel(logging.CRITICAL)
 parser = argparse.ArgumentParser()
 parser.add_argument("eval_dataset")
 parser.add_argument("models_directory")
-args = parser.parse_args()
+parser.add_argument("-g","--gpu",action="store_true")
+args = parser.parse_args()#("-g ../data/geocoding_evaluation/fr_cooc_test.csv outputs/FR_RESULT".split())
+if not args.gpu:
+    import os
+    os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # No need for GPU
+from predict_toponym_coordinates import Geocoder
+from lib.geo import haversine_pd
+logging.getLogger("tensorflow").setLevel(logging.CRITICAL)
+logging.getLogger("tensorflow_hub").setLevel(logging.CRITICAL)
 EVAL_DATASET_FN= args.eval_dataset#"./test_dataset_ambiguity.csv"
@@ -39,7 +46,17 @@ def eval_model(eval_dataset_fn,model_fn,model_index_fn):
     print("100km",(df.dist<100).sum()/len(df))
     print("50km",(df.dist<50).sum()/len(df))
     print("20km",(df.dist<20).sum()/len(df))
+    return df
 prefixes = [x.rstrip(".h5") for x in glob(args.models_directory+"/*.h5")]
+final_output = []
 for prefix in prefixes:
-    eval_model(EVAL_DATASET_FN,prefix + ".h5",prefix + "_index")
\ No newline at end of file
+    df = eval_model(EVAL_DATASET_FN,prefix + ".h5",prefix + "_index")
+    data = json.load(open(prefix+".json"))
+    data["accuracy@100km"] = (df.dist<100).sum()/len(df)
+    data["accuracy@50km"] = (df.dist<50).sum()/len(df)
+    data["accuracy@25km"] = (df.dist<25).sum()/len(df)
+    final_output.append(data)
+pd.DataFrame(final_output).to_csv("{0}_RESULT.csv".format(EVAL_DATASET_FN.rstrip(".csv")))
\ No newline at end of file
python3 desamb_eval.py -g ../data/geocoding_evaluation/fr_dataset_ambiguity_sample50percent.csv outputs/FR_RESULT
#python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_cooc_test.csv outputs/US\ FR\ results
#python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_dataset_ambiguity.csv outputs/US\ FR\ results
import pandas as pd, numpy as np
from numba import njit
from helpers import read_geonames
from tqdm import tqdm
from joblib import Parallel,delayed
import geopandas as gpd
from lib.geo import Grid,haversine_pd
import matplotlib.pyplot as plt
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("geoname_fn")
parser.add_argument("kilometer_threshold",type=int,default=20)
parser.add_argument("output_fn_prefix")
args = parser.parse_args()#("../data/geonamesData/allCountries.txt 20 /home/jacques/ALL_ADJ_224+_".split())
GEONAME_FN = args.geoname_fn
PREFIX_OUTPUT_FN = args.output_fn_prefix
KM_THRESHOLD = args.kilometer_threshold
df = read_geonames(GEONAME_FN)
def to_str(list_):
    """
    Return str representation for each value in list_

    Parameters
    ----------
    list_ : array
        array

    Returns
    -------
    array
        str list
    """
    return list(map(str, list_))
def get_adjacent(geonameid, ids, lon1, lat1, lon2, lat2, threshold):
    """
    Write the geonames entries adjacent to a selected entry (closer than `threshold` km)
    to the global output file `out_`.
    """
    dist_ = haversine_pd(lon1, lat1, lon2, lat2)
    adj_ids = ids[dist_ < threshold]
    out_.write("\n{0},{1},{2},{3}".format(geonameid, "|".join(to_str(adj_ids)), lat2, lon2))
    out_.flush()
# Build a grid over the world map.
# It avoids unnecessary distance computations and thus speeds up the whole process.
world = gpd.read_file("/media/jacques/DATA/GEODATA/WORLD/world.geo.50m.dissolved")
g = Grid(*world.bounds.values[0], [40, 20])  # grid of 40° by 20° cells
g.fit_data(world)

# Prepare first output
first_output_fn = "{1}{0}_cells.csv".format(KM_THRESHOLD, PREFIX_OUTPUT_FN)
out_ = open(first_output_fn, 'w')
out_.write("geonameid,adjacent_geonameid,latitude,longitude")  # HEADER
out_.flush()  # flush early to avoid buffering issues
def get_rels(cells_list):
    for c in tqdm(cells_list):
        # Keep only the geonames entries that fall within the cell bounds
        mask1 = (df.latitude <= c.bottomright_y) & (df.latitude >= c.upperleft_y)
        new_df = df[mask1].copy()
        mask2 = (new_df.longitude >= c.upperleft_x) & (new_df.longitude <= c.bottomright_x)
        new_df = new_df[mask2]
        for ix, row in new_df.iterrows():
            get_adjacent(row.geonameid, new_df.geonameid.values, new_df.longitude, new_df.latitude, row.longitude, row.latitude, KM_THRESHOLD)
        #Parallel(n_jobs=-1,backend="multiprocessing",temp_folder="/home/jacques/temp/")(delayed(get_adjacent)(row.geonameid,new_df.geonameid.values,new_df.longitude,new_df.latitude,row.longitude,row.latitude,KM_THRESHOLD) for ix,row in new_df.iterrows())
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
ax = world.plot(color="white", edgecolor="black")
for c in g.cells[224:]:
    ax.plot(*c.box_.exterior.xy)
plt.show()

get_rels(g.cells[224:])  # ~3h
# Prepare second output
# second_output_fn = "{1}{0}_inter_cells.csv".format(KM_THRESHOLD,PREFIX_OUTPUT_FN)
# out_ = open(second_output_fn,'w')
# out_.write("geonameid,adjacent_geonameid,latitude,longitude") # HEADER
# out_.flush()# Avoid writing bugs
# get_rels(g.inter_cells)
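For reference, a short sketch of reading the produced file back with pandas (assuming the header written above); the Adjacency data source in the new data_generator module expects exactly these four columns.

import pandas as pd
adj = pd.read_csv(first_output_fn)                     # geonameid,adjacent_geonameid,latitude,longitude
neighbour_ids = adj.adjacent_geonameid.str.split("|")  # "|"-separated lists of adjacent geonameids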
@@ -17,6 +17,25 @@ from joblib import Parallel,delayed
 def haversine_pd(lon1, lat1, lon2, lat2):
+    """
+    Return the great-circle (haversine) distance between (lon1,lat1) and (lon2,lat2) coordinates
+
+    Parameters
+    ----------
+    lon1 : numeric or array-like (pandas Dataframe works also)
+        longitude of first coordinates
+    lat1 : numeric or array-like (pandas Dataframe works also)
+        latitude of first coordinates
+    lon2 : numeric or array-like (pandas Dataframe works also)
+        longitude of second coordinates
+    lat2 : numeric or array-like (pandas Dataframe works also)
+        latitude of second coordinates
+
+    Returns
+    -------
+    float or array-like
+        distance(s) value(s) in kilometers
+    """
     lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
     dlon = lon2 - lon1
     dlat = lat2 - lat1
...
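A quick usage sketch of the documented function (the sample coordinates are illustrative): it is vectorised, so scalars, numpy arrays and pandas Series all work.

import pandas as pd
pts = pd.DataFrame({"longitude": [2.35, 4.84], "latitude": [48.86, 45.76]})  # roughly Paris, Lyon
dists_km = haversine_pd(pts.longitude, pts.latitude, 2.35, 48.86)            # ~[0.0, 392.0]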
#!/usr/bin/env python
# coding: utf-8
from lib.ngram_index import NgramIndex
from lib.geo import read_geonames
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models import Word2Vec
import logging
logging.basicConfig(level="INFO")
df_cooc = pd.read_csv("../data/wikipedia/cooccurrence_ALL.txt",sep="\t")
df_geo = read_geonames("../data/geonamesData/allCountries.txt")
# Collect all labels: Geonames names plus Wikipedia titles and interlinked titles
geonames_label = df_geo.name.values.tolist()
wiki_labels = df_cooc.title.values.tolist()
for interlinks in df_cooc["interlinks"].values:
    wiki_labels.extend(interlinks.split("|"))
del df_geo
del df_cooc

# Build the 4-gram index over every label and save it once
ng = NgramIndex(4)
for label in tqdm(geonames_label):
    ng.split_and_add(label)
for label in tqdm(wiki_labels):
    ng.split_and_add(label)
ng.save("4gramWiki+Geonames_index.json")
geonames_label.extend(wiki_labels)
class MySentences(object):
    """Stream each label as a sentence of n-gram ids (as strings) for Word2Vec."""
    def __init__(self, texts):
        self.texts = texts

    def __iter__(self):
        for w in self.texts:
            yield [str(x) for x in ng.encode(w)]
model = Word2Vec(MySentences(geonames_label), size=100, window=5, min_count=1, workers=4)
model.save("embedding4gramWiki+Geonames.bin")
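Downstream, the saved vectors can be turned back into a weight matrix, for instance with the load_embedding helper added to the data generator module in this commit (a sketch; the import path and the Embedding-layer wiring are assumptions, not part of the committed code):

from keras.layers import Embedding
from data_generator import load_embedding   # assumed module path

M = load_embedding("embedding4gramWiki+Geonames.bin", dim_vector=100)
emb_layer = Embedding(M.shape[0], M.shape[1], weights=[M], trainable=False)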