diff --git a/desamb_eval.py b/desamb_eval.py index 7544cec68dfeea7595294d83023f08b49785523e..ce0c0024d0858427f79de15f45769461de800720 100644 --- a/desamb_eval.py +++ b/desamb_eval.py @@ -19,12 +19,14 @@ parser.add_argument("models_directory") args = parser.parse_args() - EVAL_DATASET_FN= args.eval_dataset#"./test_dataset_ambiguity.csv" def eval_model(eval_dataset_fn,model_fn,model_index_fn): - df = pd.read_csv(eval_dataset_fn,index_col=0) + print("Dataset -- {0} -- Model -- {1}".format(\ + eval_dataset_fn.split("/")[-1], + model_fn.split("/")[-1])) + df = pd.read_csv(eval_dataset_fn) geocoder = Geocoder(model_fn,model_index_fn) lon,lat = geocoder.get_coords(df.name1.values,df.name2.values) lon,lat = geocoder.wgs_coord(lon,lat) @@ -34,9 +36,6 @@ def eval_model(eval_dataset_fn,model_fn,model_index_fn): df["dist"] = haversine_pd(df.longitude,df.latitude,df.p_longitude,df.p_latitude) - print("Dataset -- {0} -- Model -- {1}".format(\ - eval_dataset_fn.split("/")[-1], - model_fn.split("/")[-1])) print("100km",(df.dist<100).sum()/len(df)) print("50km",(df.dist<50).sum()/len(df)) print("20km",(df.dist<20).sum()/len(df)) diff --git a/lib/ngram_index.py b/lib/ngram_index.py index 4d6d3fdd64ee9148dc38f976a78ff0258bcd53f4..0cef529be6f408fa79adef775889338619d082f9 100644 --- a/lib/ngram_index.py +++ b/lib/ngram_index.py @@ -11,7 +11,7 @@ class NgramIndex(): """ Class used for encoding words in ngram representation """ - def __init__(self,n): + def __init__(self,n,loaded = False): """ Constructor @@ -28,6 +28,7 @@ class NgramIndex(): self.cpt = 0 self.max_len = 0 + self.loaded = loaded def split_and_add(self,word): """ Split word in multiple ngram and add each one of them to the index @@ -74,8 +75,9 @@ class NgramIndex(): """ ngrams = word.lower().replace(" ","$") ngrams = list(self.ngram_gen.split(ngrams)) - [self.add(ng) for ng in ngrams if not ng in self.ngram_index] - return self.complete([self.ngram_index[ng] for ng in ngrams],self.max_len) + if not self.loaded: + 
[self.add(ng) for ng in ngrams if not ng in self.ngram_index] + return self.complete([self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index],self.max_len) def complete(self,ngram_encoding,MAX_LEN,filling_item=0): """ @@ -95,6 +97,8 @@ class NgramIndex(): list of int list of ngram index """ + if self.loaded and len(ngram_encoding) >=MAX_LEN: + return ngram_encoding[:MAX_LEN] assert len(ngram_encoding) <= MAX_LEN diff = MAX_LEN - len(ngram_encoding) ngram_encoding.extend([filling_item]*diff) @@ -169,7 +173,7 @@ class NgramIndex(): for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]: if not key in data: raise KeyError("{0} field cannot be found in given file".format(key)) - new_obj = NgramIndex(data["ngram_size"]) + new_obj = NgramIndex(data["ngram_size"],loaded=True) new_obj.ngram_index = data["ngram_index"] new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()} new_obj.cpt = data["cpt_state"] diff --git a/scripts/generate_cooc_geocoding_dataset.py b/scripts/generate_cooc_geocoding_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..232f40b3b0cdb0ee59698759be3f1b5e1c2c874f --- /dev/null +++ b/scripts/generate_cooc_geocoding_dataset.py @@ -0,0 +1,41 @@ +import pandas as pd +import re + +#### TODO NEED TO add ARGPARSE !!! 
+def parse_title_wiki(title_wiki): + """ + Parse Wikipedia title + + Parameters + ---------- + title_wiki : str + wikipedia title + + Returns + ------- + str + parsed wikipedia title + """ + return re.sub("\(.*\)", "", title_wiki).strip().lower() + + +df = pd.read_csv("./cooccurrence_US_FR.txt",sep="\t") + +df["interlinks"] = df.interlinks.apply(lambda x : x.split("|")) +df["interlinks"] = df.interlinks.apply(lambda x : [parse_title_wiki(i) for i in x]) + +df["title"] = df.title.apply(parse_title_wiki) + +def generated_inputs(x): + output = [] + for interlink in x.interlinks: + output.append([x.title,interlink,x.longitude,x.latitude]) + return output + +output_ = [] +for ix,row in df.iterrows(): + output_.extend(generated_inputs(row)) + +new_df = pd.DataFrame(output_,columns="name1 name2 longitude latitude".split()) +new_df = new_df.sample(frac=1) +new_df.to_csv("us_fr_cooc_test.csv",index=False) \ No newline at end of file