Skip to content
Snippets Groups Projects
Commit 3487bb5d authored by Jacques Fize's avatar Jacques Fize
Browse files

DEBUG + ADD SCRIPT TO GENERATE DATASET

parent 9c9269b3
No related branches found
No related tags found
No related merge requests found
......@@ -19,12 +19,14 @@ parser.add_argument("models_directory")
args = parser.parse_args()
EVAL_DATASET_FN= args.eval_dataset#"./test_dataset_ambiguity.csv"
def eval_model(eval_dataset_fn,model_fn,model_index_fn):
df = pd.read_csv(eval_dataset_fn,index_col=0)
print("Dataset -- {0} -- Model -- {1}".format(\
eval_dataset_fn.split("/")[-1],
model_fn.split("/")[-1]))
df = pd.read_csv(eval_dataset_fn)
geocoder = Geocoder(model_fn,model_index_fn)
lon,lat = geocoder.get_coords(df.name1.values,df.name2.values)
lon,lat = geocoder.wgs_coord(lon,lat)
......@@ -34,9 +36,6 @@ def eval_model(eval_dataset_fn,model_fn,model_index_fn):
df["dist"] = haversine_pd(df.longitude,df.latitude,df.p_longitude,df.p_latitude)
print("Dataset -- {0} -- Model -- {1}".format(\
eval_dataset_fn.split("/")[-1],
model_fn.split("/")[-1]))
print("100km",(df.dist<100).sum()/len(df))
print("50km",(df.dist<50).sum()/len(df))
print("20km",(df.dist<20).sum()/len(df))
......
......@@ -11,7 +11,7 @@ class NgramIndex():
"""
Class used for encoding words in ngram representation
"""
def __init__(self,n):
def __init__(self,n,loaded = False):
"""
Constructor
......@@ -28,6 +28,7 @@ class NgramIndex():
self.cpt = 0
self.max_len = 0
self.loaded = loaded
def split_and_add(self,word):
"""
Split word in multiple ngram and add each one of them to the index
......@@ -74,8 +75,9 @@ class NgramIndex():
"""
ngrams = word.lower().replace(" ","$")
ngrams = list(self.ngram_gen.split(ngrams))
[self.add(ng) for ng in ngrams if not ng in self.ngram_index]
return self.complete([self.ngram_index[ng] for ng in ngrams],self.max_len)
if not self.loaded:
[self.add(ng) for ng in ngrams if not ng in self.ngram_index]
return self.complete([self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index],self.max_len)
def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
"""
......@@ -95,6 +97,8 @@ class NgramIndex():
list of int
list of ngram index
"""
if self.loaded and len(ngram_encoding) >=MAX_LEN:
return ngram_encoding[:MAX_LEN]
assert len(ngram_encoding) <= MAX_LEN
diff = MAX_LEN - len(ngram_encoding)
ngram_encoding.extend([filling_item]*diff)
......@@ -169,7 +173,7 @@ class NgramIndex():
for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]:
if not key in data:
raise KeyError("{0} field cannot be found in given file".format(key))
new_obj = NgramIndex(data["ngram_size"])
new_obj = NgramIndex(data["ngram_size"],loaded=True)
new_obj.ngram_index = data["ngram_index"]
new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()}
new_obj.cpt = data["cpt_state"]
......
import pandas as pd
import re
#### TODO NEED TO add ARGPARSE !!!
def parse_title_wiki(title_wiki):
    """
    Parse a Wikipedia title: remove any parenthesised disambiguation suffix,
    strip surrounding whitespace, and lowercase the result.

    Parameters
    ----------
    title_wiki : str
        wikipedia title

    Returns
    -------
    str
        parsed wikipedia title
    """
    # Raw string fixes the invalid "\(" escape in a plain string literal
    # (DeprecationWarning today, SyntaxError in future Python versions).
    # NOTE: ".*" is greedy, so everything from the FIRST "(" to the LAST ")"
    # is removed — "Foo (a) bar (b)" becomes "foo".
    return re.sub(r"\(.*\)", "", title_wiki).strip().lower()
# Load the raw cooccurrence dump (tab-separated) and normalise its text fields.
df = pd.read_csv("./cooccurrence_US_FR.txt", sep="\t")
# The interlinks field is "|"-separated; split it and clean each linked title
# in a single pass (equivalent to splitting first, then parsing each element).
df["interlinks"] = df.interlinks.apply(
    lambda raw: [parse_title_wiki(part) for part in raw.split("|")]
)
df["title"] = df.title.apply(parse_title_wiki)
def generated_inputs(x):
    """
    Build one ``[name1, name2, longitude, latitude]`` record per interlink.

    Parameters
    ----------
    x : row-like object
        must expose ``title``, ``interlinks`` (iterable), ``longitude``
        and ``latitude`` attributes (e.g. a pandas row)

    Returns
    -------
    list of list
        one record per entry in ``x.interlinks``, pairing the row title
        with that interlink and the row coordinates
    """
    return [[x.title, link, x.longitude, x.latitude] for link in x.interlinks]
# Flatten every dataframe row into (name1, name2, longitude, latitude)
# records, shuffle them, and write the evaluation dataset to disk.
output_ = []
for _, record in df.iterrows():
    output_.extend(generated_inputs(record))
new_df = pd.DataFrame(output_, columns=["name1", "name2", "longitude", "latitude"])
new_df = new_df.sample(frac=1)  # frac=1 => full random shuffle of the rows
new_df.to_csv("us_fr_cooc_test.csv", index=False)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment