diff --git a/.gitignore b/.gitignore index ddc1507dda5eebc7fbb67e9a3546f78022969b26..96d316e916ade4b9b4e4081aea3e4a2ed935b257 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,6 @@ dist/ downloads/ eggs/ .eggs/ -lib/ lib64/ parts/ sdist/ diff --git a/combination_embeddings.py b/combination_embeddings.py index c147f8064042bea90976c809d7ca42fe5012369e..025c586caa830963928ba0c47e0a816127966d88 100644 --- a/combination_embeddings.py +++ b/combination_embeddings.py @@ -12,6 +12,8 @@ import geopandas as gpd from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM from keras.models import Model from keras import backend as K +from keras.callbacks import ModelCheckpoint + import tensorflow as tf # Geometry @@ -19,31 +21,15 @@ from shapely.geometry import Point # Custom module from helpers import read_geonames -from utils import Grid -from utils import zero_one_encoding, NgramIndex,ConfigurationReader -from metrics import lat_accuracy,lon_accuracy +from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds +from lib.ngram_index import NgramIndex +from lib.utils import ConfigurationReader +from lib.metrics import lat_accuracy,lon_accuracy # Logging from tqdm import tqdm import logging -from helpers import Chronometer - - -def parse_title_wiki(title_wiki): - """ - Parse Wikipedia title - - Parameters - ---------- - title_wiki : str - wikipedia title - - Returns - ------- - str - parsed wikipedia title - """ - return re.sub("\(.*\)","",title_wiki).strip().lower() +from helpers import parse_title_wiki def get_new_ids(cooc_data,id_first_value): """ @@ -74,96 +60,122 @@ def get_new_ids(cooc_data,id_first_value): topo_id[id_]=interlink return topo_id - - # LOGGING CONF logging.basicConfig( format='[%(asctime)s][%(levelname)s] %(message)s ', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO ) -chrono = Chronometer() args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\ - .parse_args()#("-n 4 -t 0.002 -e 20 -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split()) + .parse_args()#("-i -e 5 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split()) -# Initialisee CONSTANTS -GEONAME_FN = args.geoname_input -GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input +# +################################################# +############# MODEL TRAINING PARAMETER ########## +################################################# NGRAM_SIZE = args.ngram_size ACCURACY_TOLERANCE = args.tolerance_value EPOCHS = args.epochs ITER_ADJACENCY = args.adjacency_iteration -COOC_SAMPLING_NUMBER = 3 -WORDVEC_ITER = 50 +COOC_SAMPLING_NUMBER = args.cooc_sample_size +WORDVEC_ITER = args.ngram_word2vec_dim +################################################# +########## FILENAME VARIABLE #################### +################################################# +GEONAME_FN = args.geoname_input +GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input +REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1 +ADJACENCY_REL_FILENAME = "../data/geonamesData/{0}_{1}{2}adjacency.json".format( + GEONAME_FN.split("/")[-1], + ITER_ADJACENCY, + REGION_SUFFIX_FN) + +COOC_FN = "../data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1] +PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format( + GEONAME_FN.split("/")[-1], + EPOCHS, + NGRAM_SIZE, + ACCURACY_TOLERANCE, + REGION_SUFFIX_FN) -# check for output dir -if not os.path.exists("outputs/"): - os.makedirs("outputs/") +if args.adjacency: + PREFIX_OUTPUT_FN += "_A" +if 
args.inclusion: + PREFIX_OUTPUT_FN += "_I" +if args.wikipedia_cooc: + PREFIX_OUTPUT_FN += "_C" + +MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN) +INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN) + +############################################################################################# +################################# LOAD DATA ################################################# +############################################################################################# # LOAD Geonames DATA logging.info("Load Geonames data...") geoname_data = read_geonames(GEONAME_FN).fillna("") -hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("") -train_indices,test_indices = pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values, pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values -train_indices,test_indices = set(train_indices),set(test_indices) +train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values) +test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values) logging.info("Geonames data loaded!") # SELECT ENTRY with class == to A and P (Areas and Populated Places) filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places +#CLEAR RAM +del geoname_data -# IF REGION (ONLY FR for now !) -admin_id_authorised_auth = "1 2 3 4 5 6 11 24 27 28 32 44 52 53 75 76 84 93 94".split() -region_fn = "" if args.admin_code_1 == None else "_"+args.admin_code_1 -if args.admin_code_1 != None and args.admin_code_1 in admin_id_authorised_auth: + +# IF REGION +if args.admin_code_1 != "None": filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() -# REDUCE DATA STORED +# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD +bounds = get_bounds(filtered) # Required to get adjacency relationships + -# Geometry operation -filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) -filtered = gpd.GeoDataFrame(filtered) -filtered["i"]=1 -bounds = filtered.dissolve("i").bounds.values[0] # Required to get adjacency relationships +############################################################################################# +################################# RETRIEVE RELATIONSHIPS #################################### +############################################################################################# +# INITIALIZE RELATION STORE rel_store = [] +# Retrieve adjacency relationships if args.adjacency: - # RETRIEVE ADJACENCY REL logging.info("Retrieve adjacency relationships ! 
") - fn = "data/geonamesData/{0}_{1}{2}adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY,region_fn) - if not os.path.exists(fn): - g = Grid(*bounds,[360,180]) - g.fit_data(filtered) - [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))] - rel_store.extend([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)]) - json.dump(rel_store,open(fn,'w')) + + if not os.path.exists(ADJACENCY_REL_FILENAME): + rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY)) + json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w')) else: logging.info("Open and load data from previous computation!") - rel_store=[[int(couple[0]),int(couple[1])] for couple in json.load(open(fn))] + rel_store=json.load(open(ADJACENCY_REL_FILENAME)) + logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) +# Retrieve inclusion relationships if args.inclusion: - # RETRIEVE INCLUSION RELATIONSHIPS logging.info("Retrieve inclusion relationships ! ") - geonamesIDS = set(filtered.geonameid.values) - filter_mask = (hierarchy_data.childId.isin(geonamesIDS) & hierarchy_data.parentId.isin(geonamesIDS)) - rel_store.extend((hierarchy_data[filter_mask]["childId parentId".split()].values.tolist())) - logging.info("{0} inclusion relationships retrieved ! ".format(len(hierarchy_data[filter_mask]))) -del filtered["geometry"] + cpt_rel = len(rel_store) + rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN)) + + logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel)) + + if args.wikipedia_cooc: logging.info("Load Wikipedia Cooccurrence data and merge with geonames") - COOC_FN = "./data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1] + cooc_data = pd.read_csv(COOC_FN,sep="\t") cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) - id_wikipediatitle = get_new_ids(cooc_data,geoname_data.geonameid.max()) + id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max()) wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} title_coord = {row.title: (row.longitude,row.latitude) for _,row in cooc_data.iterrows()} cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) @@ -191,37 +203,30 @@ geoname2name = dict(filtered["geonameid name".split()].values) # ENCODING NAME USING N-GRAM SPLITTING logging.info("Encoding toponyms to ngram...") index = NgramIndex(NGRAM_SIZE) -filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all ngram available -if args.wikipedia_cooc: - [index.split_and_add(k) for k in wikipediatitle_id] -filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding -max_len = filtered.encode_name.apply(len).max() # Retrieve the encodings max length -if args.wikipedia_cooc: - extension = {v:index.encode(k) for k,v in wikipediatitle_id.items()} -index.max_len = int(max_len) # For Index state dump + # Identify all ngram available +filtered.name.apply(lambda x : index.split_and_add(x)) +if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id] -filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Expend encodings with size < max_len -if args.wikipedia_cooc: - extension = {k:index.complete(v,max_len) for k,v in extension.items()} -geoname2encodedname = dict(filtered["geonameid 
encode_name".split()].values) #init a dict with the 'geonameid' --> 'encoded toponym' association +geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association if args.wikipedia_cooc: - geoname2encodedname.update(extension) + geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()}) +# SAVE THE INDEX TO REUSE THE MODEL +index.save(INDEX_FN) logging.info("Done !") -#CLEAR RAM -del hierarchy_data -del geoname_data + +############################################################################################# +################################# ENCODE COORDINATES ################################################# +############################################################################################# + + # Encode each geonames entry coordinates -filtered["cell_vec"]=filtered.apply( - lambda x : zero_one_encoding(x.longitude,x.latitude), - axis=1 - ) -geoname_vec = dict(filtered["geonameid cell_vec".split()].values) +geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()} # CLEAR RAM del filtered @@ -231,14 +236,17 @@ num_words = len(index.index_ngram) # necessary for the embedding matrix logging.info("Preparing Input and Output data...") + +############################################################################################# +################################# BUILD TRAIN/TEST DATASETS ################################# +############################################################################################# + X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] -cpt=0 for couple in rel_store: geonameId_1,geonameId_2 = couple[0],couple[1] if not geonameId_1 in geoname2encodedname: - cpt+=1 continue top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] if geonameId_1 in train_indices: #and geonameId_2 in train_indices: @@ -270,29 +278,28 @@ y_lon_test = np.array(y_lon_test) logging.info("Data prepared !") -# OUTPUT FN BASE -name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn) -if args.adjacency: - name += "_A" -if args.inclusion: - name += "_I" -if args.wikipedia_cooc: - name += "_C" +# check for output dir +if not os.path.exists("outputs/"): + os.makedirs("outputs/") -index.save("outputs/"+name+"_index") +############################################################################################# +################################# NGRAM EMBEDDINGS ########################################## +############################################################################################# -# NGRAM EMBDEDDING logging.info("Generating N-GRAM Embedding...") embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=WORDVEC_ITER) logging.info("Embedding generated !") -# DEEP MODEL -name = "LSTM_"+ name -input_1 = Input(shape=(max_len,)) -input_2 = Input(shape=(max_len,)) +############################################################################################# +################################# MODEL DEFINITION ########################################## +############################################################################################# -embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True) + +input_1 = Input(shape=(index.max_len,)) 
+input_2 = Input(shape=(index.max_len,)) + +embedding_layer = Embedding(num_words, embedding_dim,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True) x1 = Bidirectional(LSTM(98))(embedding_layer(input_1)) x2 = Bidirectional(LSTM(98))(embedding_layer(input_2)) @@ -315,15 +322,29 @@ output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()}) + + +checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1, + save_best_only=True, mode='auto', period=1) + + +############################################################################################# +################################# TRAINING LAUNCH ########################################### +############################################################################################# + history = model.fit(x=[X_1_train,X_2_train], y=[y_lon_train,y_lat_train], verbose=True, batch_size=100, epochs=EPOCHS, - validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test])) + validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]), + callbacks=[checkpoint]) hist_df = pd.DataFrame(history.history) -hist_df.to_csv("outputs/{0}.csv".format(name)) +hist_df.to_csv("outputs/{0}.csv".format(PREFIX_OUTPUT_FN)) -model.save("outputs/"+name+".h5") +model.save(MODEL_OUTPUT_FN) +# Erase Model Checkpoint file +if os.path.exists(MODEL_OUTPUT_FN + ".part"): + os.remove(MODEL_OUTPUT_FN + ".part") \ No newline at end of file diff --git a/documentation/imgs/first_approach.png b/documentation/imgs/first_approach.png new file mode 100644 index 0000000000000000000000000000000000000000..4b83d1184fc92e154510c934ecea41b4e80455ce Binary files /dev/null and b/documentation/imgs/first_approach.png differ diff --git a/documentation/imgs/second_approach.png b/documentation/imgs/second_approach.png new file mode 100644 index 0000000000000000000000000000000000000000..bdff5964c3796980e518eb0f9aa724bd836e0ca6 Binary files /dev/null and b/documentation/imgs/second_approach.png differ diff --git a/documentation/imgs/third_approach.png b/documentation/imgs/third_approach.png new file mode 100644 index 0000000000000000000000000000000000000000..ea8e6aaa02e19084a61e346ebacff25139cc63cb Binary files /dev/null and b/documentation/imgs/third_approach.png differ diff --git a/helpers.py b/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..0a47034ec466467011042648e9b43a1ccc4a7187 --- /dev/null +++ b/helpers.py @@ -0,0 +1,165 @@ +import os +import time +import re + +import numpy as np +import pandas as pd + + +def read_geonames(file): + """ + Return a dataframe that contains Geonames data. 
+ + Parameters + ---------- + file : str + path of the Geonames Csv file + + Returns + ------- + pd.DataFrame + geonames data + """ + dtypes_dict = { + 0: int, # geonameid + 1: str, # name + 2: str, # asciiname + 3: str, # alternatenames + 4: float, # latitude + 5: float, # longitude + 6: str, # feature class + 7: str, # feature code + 8: str, # country code + 9: str, # cc2 + 10: str, # admin1 code + 11: str, # admin2 code + 12: str, # admin3 code + 13: str, # admin4 code + 14: int, # population + 15: str, # elevation + 16: int, # dem (digital elevation model) + 17: str, # timezone + 18: str # modification date yyyy-MM-dd + } + rename_cols = { + 0:"geonameid", # geonameid + 1:"name", # name + 2:"asciiname", # asciiname + 3:"alternatenames", # alternatenames + 4:"latitude", # latitude + 5:"longitude", # longitude + 6:"feature_class", # feature class + 7:"feature_code", # feature code + 8:"country_code", # country code + 9:"cc2", # cc2 + 10:"admin1_code", # admin1 code + 11:"admin2_code", # admin2 code + 12:"admin3_code", # admin3 code + 13:"admin4_code", # admin4 code + 14:"population", # population + 15:"elevation", # elevation + 16:"dem", # dem (digital elevation model) + 17:"timezone", # timezone + 18:"modification_date" # modification date yyyy-MM-dd + } + data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False) + data.rename(columns=rename_cols,inplace=True) + return data + + +def parse_title_wiki(title_wiki): + """ + Parse Wikipedia title + + Parameters + ---------- + title_wiki : str + wikipedia title + + Returns + ------- + str + parsed wikipedia title + """ + return re.sub("\(.*\)","",title_wiki).strip().lower() + + +def _split(lst,n,complete_chunk_value): + """ + Split a list into chunk of n-size. 
+ + Parameters + ---------- + lst : list + input list + n : int + chunk size + complete_chunk_value : object + if last chunk size not equal to n, this value is used to complete it + + Returns + ------- + list + chunked list + """ + chunks = [lst[i:i + n] for i in range(0, len(lst), n)] + if not chunks:return chunks + if len(chunks[-1]) != n: + chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1]))) + return np.array(chunks) + +class Chronometer(): + def __init__(self): + self.__task_begin_timestamp = {} + + def start(self,task_name): + """ + Start a new task chronometer + + Parameters + ---------- + task_name : str + task id + + Raises + ------ + ValueError + if a running task already exists with that name + """ + if task_name in self.__task_begin_timestamp: + raise ValueError("A running task exists with the name {0}!".format(task_name)) + self.__task_begin_timestamp[task_name] = time.time() + + def stop(self,task_name): + """ + Stop and return the duration of the task + + Parameters + ---------- + task_name : str + task id + + Returns + ------- + float + duration of the task in seconds + + Raises + ------ + ValueError + if no task exist with the id `task_name` + """ + if not task_name in self.__task_begin_timestamp: + raise ValueError("The {0} task does not exist!".format(task_name)) + duration = time.time() - self.__task_begin_timestamp[task_name] + del self.__task_begin_timestamp[task_name] + return duration + +if __name__ == "__main__": + chrono = Chronometer() + chrono.start("test") + chrono.start("test2") + time.sleep(3) + print(chrono.stop("test")) + time.sleep(3) + print(chrono.stop("test2")) \ No newline at end of file diff --git a/lib/__init__.py b/lib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lib/geo.py b/lib/geo.py new file mode 100644 index 0000000000000000000000000000000000000000..c91d2d4f3062fd72f440b6b3f308d6b2827852fd --- /dev/null +++ b/lib/geo.py @@ -0,0 +1,335 @@ + +import geopandas as gpd +import numpy as np +import pandas as pd + +from shapely.geometry import Point,box + +from tqdm import tqdm + + +import pandas as pd, numpy as np +from numba import njit +from helpers import read_geonames +from tqdm import tqdm +from joblib import Parallel,delayed + + + +def haversine_pd(lon1, lat1, lon2, lat2): + lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2]) + dlon = lon2 - lon1 + dlat = lat2 - lat1 + a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2 + + return 6367 * 2 * np.arcsin(np.sqrt(a)) + + +def get_adjacent(ids,lon1, lat1, lon2, lat2,threshold): + dist_ = haversine_pd(lon1, lat1, lon2, lat2) + return ids[dist_<threshold] + +def get_geonames_adjacency(geoname_data,threshold): + return Parallel(n_jobs=-1,backend="multiprocessing")(delayed(get_adjacent)(geoname_data.geonameid.values, + geoname_data.longitude, + geoname_data.latitude, + row.longitude, + row.latitude, + threshold) for ix,row in tqdm(geoname_data.iterrows(),total=len(geoname_data))) + + +def generate_couple(object_list): + """ + Return a randomly selected couple from an object list. 
+ + Parameters + ---------- + object_list : list + object list + + Returns + ------- + list + list of coupled object + """ + couples = [] + lst = np.arange(len(object_list)) + for _ in range(len(object_list)): + if len(lst) == 1: + break + idx = np.random.choice(np.arange(len(lst))) + idx2 = np.random.choice(np.arange(len(lst))) + while idx2 == idx: + idx2 = np.random.choice(np.arange(len(lst))) + couples.append([object_list[lst[idx]],object_list[lst[idx2]]]) + lst = np.delete(lst,idx) + return couples + +def _hash_couple(o1,o2): + """ + Return an hash for two object ids. + + Parameters + ---------- + o1 : str or int + id of the first objeeect + o2 : str of int + id of the second object + + Returns + ------- + str + hash + """ + return "|".join(map(str,sorted([int(o1),int(o2)]))) + + + +def zero_one_encoding(long,lat): + """ + Encode coordinates (WGS84) between 0 and 1 + + Parameters + ---------- + long : float + longitude value + lat : float + latitude value + + Returns + ------- + float,float + longitude, latitude + """ + return ((long + 180.0 ) / 360.0), ((lat + 90.0 ) / 180.0) + +class Cell(object): + """ + A cell is box placed in geeographical space. + """ + def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,x,y): + """ + Constructor + + Parameters + ---------- + upperleft_x : float + upperleft longitude + upperleft_y : float + upperleft latitude + bottomright_x : float + bottom right longitude + bottomright_y : float + bottom right latitude + x : int + cell x coordinates in the grid + y : int + cell y coordinates in the grid + """ + self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y + self.box_ = box(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y) + self.list_object={} # {id:Point(coord)} + + self.x,self.y = x, y + + def contains(self,lat,lon): + """ + Return true if the cell contains a point at given coordinates + + Parameters + ---------- + lat : float + latitude + lon : float + longitude + + Returns + ------- + bool + true if contains + """ + x,y = lon,lat + if x < self.upperleft_x or x > self.bottomright_x: + return False + if y < self.upperleft_y or y > self.bottomright_y: + return False + return True + + def add_object(self,id_,lat,lon): + """ + Connect an object to the cell + + Parameters + ---------- + id_ : int + id + lat : float + latitude + lon : float + longitude + """ + self.list_object[id_] = Point(lon,lat) + + def __repr__(self): + return "upperleft:{0}_{1}_;bottom_right:{2}_{3}".format(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y) + + +class Grid(object): + """ + Define a grid + + """ + def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,cell_sub_div_index=[100,50]): + """ + Constructor + + Parameters + ---------- + upperleft_x : float + upperleft longitude + upperleft_y : float + upperleft latitude + bottomright_x : float + bottom right longitude + bottomright_y : float + bottom right latitude + cell_sub_div_index : list, optional + number of division in both latitude and longitude axis (longitude first), by default [100,50] + """ + self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y + + self.x_r = abs(self.bottomright_x - self.upperleft_x)/cell_sub_div_index[0] + self.y_r = abs(self.upperleft_y - self.bottomright_y )/cell_sub_div_index[1] + + self.c_x_r = self.x_r/cell_sub_div_index[0] # Redivide + self.c_y_r = 
self.y_r/cell_sub_div_index[1] + + self.cells = [] + self.inter_cells = [] + for i in range(cell_sub_div_index[1]): + self.cells.append([]) + for j in range(cell_sub_div_index[0]): + self.cells[-1].append(Cell( + self.upperleft_x+j*self.x_r, + self.upperleft_y+i*self.y_r, + self.upperleft_x+((j+1)*self.x_r), + self.upperleft_y+((i+1)*self.y_r), + j,i) + ) + dec_y = 0 + for i in range(cell_sub_div_index[1]): + self.inter_cells.append([]) + dec_x = 0 + for j in range(cell_sub_div_index[0]): + self.inter_cells[-1].append(Cell( + self.upperleft_x+(j*self.x_r)-self.c_x_r, # TOP + self.upperleft_y+(i*self.y_r)-dec_y, + self.upperleft_x+((j+1)*self.x_r)-self.c_x_r,#(self.u_pos*self.c_x_r), + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, + j,i) + ) + self.inter_cells[-1].append(Cell( + self.upperleft_x+(j*self.x_r)-self.c_x_r, # CENTER + self.upperleft_y+(i*self.y_r)-self.c_y_r, + self.upperleft_x+((j+1)*self.x_r)+self.c_x_r, + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, + j,i) + ) + self.inter_cells[-1].append(Cell( + self.upperleft_x+(j*self.x_r)+dec_x, # CENTER + self.upperleft_y+(i*self.y_r)-self.c_y_r, + self.upperleft_x+((j+1)*self.x_r)-self.c_x_r, #LEFT + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, + j,i) + ) + dec_x = self.c_x_r + dec_y = self.c_y_r + + def fit_data(self,data = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))): + """ + + To avoid unnecessary check when connecting an entity to one or multiple cells, we + filter cells that does not appears in our geographic context (here countries surface). + + Parameters + ---------- + data : GeoDataFrame + geographic context + """ + world = data + world["nn"] = 1 + dissolved = world.dissolve(by="nn").iloc[0].geometry + new_cells= [] + new_inter_cells=[] + for i in tqdm(range(len(self.cells))): + for j in range(len(self.cells[i])): + if dissolved.intersects(self.cells[i][j].box_): + new_cells.append(self.cells[i][j]) + new_inter_cells.extend(self.inter_cells[i][j*3:(j+1)*3]) + + self.cells=new_cells + self.inter_cells = new_inter_cells + + + def __add__(self,a): + """ + Add an object to the grid + + Parameters + ---------- + a : tuple + (id, latitude, longitude) + """ + for c1 in range(len(self.cells)): + if self.cells[c1].contains(a[1],a[2]): + self.cells[c1].add_object(*a) + + for c1 in range(len(self.inter_cells)): + if self.inter_cells[c1].contains(a[1],a[2]): + self.inter_cells[c1].add_object(*a) + + def get_adjacent_relationships(self,random_iteration=10): + """ + Return a list of adjacent relationships founds in each cell. 
+ + Parameters + ---------- + random_iteration : int, optional + number of iteration for random selection of adjacency relationships, by default 10 + + Returns + ------- + list + adjacency relationships + """ + relationships = set([]) + for c1 in tqdm(range(len(self.cells))): + for _ in range(random_iteration): + for t in generate_couple(list(self.cells[c1].list_object.keys())): + relationships.add(_hash_couple(t[0],t[1])) + + for c1 in tqdm(range(len(self.inter_cells))): + for _ in range(random_iteration): + for t in generate_couple(list(self.inter_cells[c1].list_object.keys())): + relationships.add(_hash_couple(t[0],t[1])) + return relationships + + + +def get_adjacency_rels(geodataframe,bounds,subdiv_tuple,random_iter_adjacency): + g = Grid(*bounds,subdiv_tuple) + g.fit_data() + [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(geodataframe["geonameid longitude latitude".split()].iterrows(),total=len(geodataframe))] + return [[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(random_iter_adjacency)] + +def get_geonames_inclusion_rel(geonames_data,geonames_hierarchy_data_fn): + geonames_hierarchy_data = pd.read_csv(geonames_hierarchy_data_fn,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("") + geonamesIDS = set(geonames_data.geonameid.values) + filter_mask = (geonames_hierarchy_data.childId.isin(geonamesIDS) & geonames_hierarchy_data.parentId.isin(geonamesIDS)) + return (geonames_hierarchy_data[filter_mask]["childId parentId".split()].values.tolist()) + +def get_bounds(geodataframe): + geodataframe["geometry"] = geodataframe["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) + geodataframe = gpd.GeoDataFrame(geodataframe) + geodataframe["i"]=1 + return geodataframe.dissolve("i").bounds.values[0] # Required to get adjacency relationships diff --git a/metrics.py b/lib/metrics.py similarity index 100% rename from metrics.py rename to lib/metrics.py diff --git a/lib/ngram_index.py b/lib/ngram_index.py new file mode 100644 index 0000000000000000000000000000000000000000..4d6d3fdd64ee9148dc38f976a78ff0258bcd53f4 --- /dev/null +++ b/lib/ngram_index.py @@ -0,0 +1,178 @@ +import json + +import numpy as np + +from ngram import NGram + +# Machine learning +from gensim.models import Word2Vec + +class NgramIndex(): + """ + Class used for encoding words in ngram representation + """ + def __init__(self,n): + """ + Constructor + + Parameters + ---------- + n : int + ngram size + """ + self.ngram_gen = NGram(N=n) + + self.size = n + self.ngram_index = {"":0} + self.index_ngram = {0:""} + self.cpt = 0 + self.max_len = 0 + + def split_and_add(self,word): + """ + Split word in multiple ngram and add each one of them to the index + + Parameters + ---------- + word : str + a word + """ + ngrams = word.lower().replace(" ","$") + ngrams = list(self.ngram_gen.split(ngrams)) + [self.add(ngram) for ngram in ngrams] + self.max_len = max(self.max_len,len(ngrams)) + + def add(self,ngram): + """ + Add a ngram to the index + + Parameters + ---------- + ngram : str + ngram + """ + if not ngram in self.ngram_index: + self.cpt+=1 + self.ngram_index[ngram]=self.cpt + self.index_ngram[self.cpt]=ngram + + + def encode(self,word): + """ + Return a ngram representation of a word + + Parameters + ---------- + word : str + a word + + Returns + ------- + list of int + listfrom shapely.geometry import Point,box + of ngram index + """ + ngrams = word.lower().replace(" ","$") + ngrams = list(self.ngram_gen.split(ngrams)) + [self.add(ng) for 
ng in ngrams if not ng in self.ngram_index] + return self.complete([self.ngram_index[ng] for ng in ngrams],self.max_len) + + def complete(self,ngram_encoding,MAX_LEN,filling_item=0): + """ + Complete a ngram encoded version of word with void ngram. It's necessary for neural network. + + Parameters + ---------- + ngram_encoding : list of int + first encoding of a word + MAX_LEN : int + desired length of the encoding + filling_item : int, optional + ngram index you wish to use, by default 0 + + Returns + ------- + list of int + list of ngram index + """ + assert len(ngram_encoding) <= MAX_LEN + diff = MAX_LEN - len(ngram_encoding) + ngram_encoding.extend([filling_item]*diff) + return ngram_encoding + + def get_embedding_layer(self,texts,dim=100,**kwargs): + """ + Return an embedding matrix for each ngram using encoded texts. Using gensim.Word2vec model. + + Parameters + ---------- + texts : list of [list of int] + list of encoded word + dim : int, optional + embedding dimension, by default 100 + + Returns + ------- + np.array + embedding matrix + """ + model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs) + N = len(self.ngram_index) + embedding_matrix = np.zeros((N,dim)) + for i in range(N): + embedding_matrix[i] = model.wv[str(i)] + return embedding_matrix + + def save(self,fn): + """ + + Save the NgramIndex + + Parameters + ---------- + fn : str + output filename + """ + data = { + "ngram_size": self.size, + "ngram_index": self.ngram_index, + "cpt_state": self.cpt, + "max_len_state": self.max_len + } + json.dump(data,open(fn,'w')) + + @staticmethod + def load(fn): + """ + + Load a NgramIndex state from a file. + + Parameters + ---------- + fn : str + input filename + + Returns + ------- + NgramIndex + ngram index + + Raises + ------ + KeyError + raised if a required field does not appear in the input file + """ + try: + data = json.load(open(fn)) + except json.JSONDecodeError: + print("Data file must be a JSON") + for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]: + if not key in data: + raise KeyError("{0} field cannot be found in given file".format(key)) + new_obj = NgramIndex(data["ngram_size"]) + new_obj.ngram_index = data["ngram_index"] + new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()} + new_obj.cpt = data["cpt_state"] + new_obj.max_len = data["max_len_state"] + return new_obj + diff --git a/lib/utils.py b/lib/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..57326d86efe9bc26da23fcbc34047a2d66ed3a6d --- /dev/null +++ b/lib/utils.py @@ -0,0 +1,80 @@ +# Basic import +import math +import argparse +import os +import json + +# Data Structure +import numpy as np +import geopandas as gpd +from shapely.geometry import Point,box + +# NLP +from nltk.tokenize import word_tokenize +from ngram import NGram + +# Visualisation and parallelisation +from tqdm import tqdm + + +class TokenizerCustom(): + def __init__(self,vocab): + self.word_index = {vocab[i]:i for i in range(len(vocab))} + self.index_word = {i:vocab[i] for i in range(len(vocab))} + self.N = len(self.index_word) + def texts_to_sequences(self,listText): + seqs = [] + for text in listText: + seqs.append([self.word_index[word] for word in word_tokenize(text) if word in self.word_index]) + return seqs + + +class ConfigurationReader(object): + def __init__(self,configuration_file): + if not os.path.exists(configuration_file): + raise FileNotFoundError("'{0} file could not be found ! 
'".format(configuration_file)) + + self.configuration = json.load(open(configuration_file)) + + self.__argparser_desc = ("" if not "description" in self.configuration else self.configuration["description"]) + self.parser = argparse.ArgumentParser(description=self.__argparser_desc) + + self.parse_conf() + + def parse_conf(self): + if not "args" in self.configuration: + raise argparse.ArgumentError("","No args given in the configuration file") + + for dict_args in self.configuration["args"]: + if not isinstance(dict_args,dict): + raise ValueError("Args must be dictionnary") + + short_command = dict_args.get("short",None) + long_command = dict_args.get("long",None) + + if not short_command and not long_command: + raise ValueError("No command name was given !") + + add_func_dict_= {} + if "help" in dict_args: + add_func_dict_["help"]= dict_args["help"] + if "default" in dict_args: + add_func_dict_["default"]= dict_args["default"] + if "action" in dict_args: + add_func_dict_["action"]= dict_args["action"] + if "type" in dict_args: + add_func_dict_["type"]= eval(dict_args["type"]) + if "choices" in dict_args: + add_func_dict_["choices"]= dict_args["choices"] + + if not (short_command and long_command): + command = (short_command if not long_command else long_command) + self.parser.add_argument(command,**add_func_dict_) + + elif long_command and short_command: + self.parser.add_argument(short_command,long_command,**add_func_dict_) + + def parse_args(self,input_=None): + if not input_: + return self.parser.parse_args() + return self.parser.parse_args(input_) diff --git a/parser_config/embeddings_lat_lon.json b/parser_config/embeddings_lat_lon.json deleted file mode 100644 index 1a0c774c47b9a6294bf3f54936c79773fc7027a9..0000000000000000000000000000000000000000 --- a/parser_config/embeddings_lat_lon.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "description": "Toponym Combination", - "args": [ - { "short": "input", "help": "Corpus used to learn the embeddings" }, - { "short": "-g", "long": "--glove__dir", "default": "data/glove" }, - {"long": "--max_sequence_length", "type":"int","default":15}, - {"long": "--max_num_words", "type":"int","default":400000}, - {"long": "--embedding_dimension", "type":"int","default":100}, - {"long": "--batch_size", "type":"int","default":100}, - { "short": "-e", "long": "--epochs", "type": "int", "default": 100 } - ] -} \ No newline at end of file diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json index a2fd9f120b3e791f17948eba7d02b8e2a34116e3..7f57c885d5149a24db6e7830d9d8fef249f05227 100644 --- a/parser_config/toponym_combination_embedding.json +++ b/parser_config/toponym_combination_embedding.json @@ -7,8 +7,10 @@ { "short": "-i", "long": "--inclusion", "action": "store_true" }, { "short": "-a", "long": "--adjacency", "action": "store_true" }, { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" }, + { "long": "--cooc-sample-size", "type": "int", "default": 3 }, {"long": "--adjacency-iteration", "type":"int","default":1}, { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 }, + { "long": "--ngram-word2vec-dim", "type": "int", "default": 50 }, { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, diff --git a/predict_toponym_coordinates.py b/predict_toponym_coordinates.py index 
5dcdb7f81a8fbc28826131b5d1680f3647bf6e68..1cf9221ada921077953f8b689fd75bae790b07ce 100644 --- a/predict_toponym_coordinates.py +++ b/predict_toponym_coordinates.py @@ -2,6 +2,7 @@ from keras.models import load_model import tensorflow as tf import keras.backend as K -from utils import NgramIndex +from lib.ngram_index import NgramIndex +import numpy as np from tensorflow.python.keras.backend import set_session from tensorflow.python.keras.models import load_model @@ -9,7 +10,41 @@ from tensorflow.python.keras.models import load_model sess = None graph = None -from metrics import lat_accuracy,lon_accuracy +def lat_accuracy(LAT_TOL =1/180.): + def accuracy_at_k_lat(y_true, y_pred): + """ + Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible + task for neural network to obtain the exact coordinate. + + Parameters + ---------- + y_true : tf.Tensor + truth data + y_pred : tf.Tensor + predicted output + """ + diff = tf.abs(y_true - y_pred) + fit = tf.dtypes.cast(tf.less(diff,LAT_TOL),tf.int64) + return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64) + return accuracy_at_k_lat + +def lon_accuracy(LON_TOL=1/360.): + def accuracy_at_k_lon(y_true, y_pred): + """ + Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible + task for neural network to obtain the exact coordinate. + + Parameters + ---------- + y_true : tf.Tensor + truth data + y_pred : tf.Tensor + predicted output + """ + diff = tf.abs(y_true - y_pred) + fit = tf.dtypes.cast(tf.less(diff,LON_TOL),tf.int64) + return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64) + return accuracy_at_k_lon class Geocoder(object): """ @@ -21,12 +56,12 @@ class Geocoder(object): if you want an interactive map using leafletJS, set to True the `interactive_map` parameter of `Geocoder.plot_coord()` """ def __init__(self,keras_model_fn,ngram_index_file): - global sess - global graph - sess = tf.compat.v1.Session() - graph = tf.compat.v1.get_default_graph() - set_session(sess) - self.keras_model = load_model(keras_model_fn,custom_objects={"lat_accuracy":lat_accuracy,"lon_accuracy":lon_accuracy}) + # global sess + # global graph + # sess = tf.compat.v1.Session() + # graph = tf.compat.v1.get_default_graph() + # set_session(sess) + self.keras_model = load_model(keras_model_fn,custom_objects={"accuracy_at_k_lat":lat_accuracy(),"accuracy_at_k_lon":lon_accuracy()}) self.ngram_encoder = NgramIndex.load(ngram_index_file) def get_coord(self,toponym,context_toponym): @@ -34,9 +69,11 @@ class Geocoder(object): global graph p = self.ngram_encoder.complete(self.ngram_encoder.encode(toponym),self.ngram_encoder.max_len) c = self.ngram_encoder.complete(self.ngram_encoder.encode(context_toponym),self.ngram_encoder.max_len) - with sess.as_default(): - with graph.as_default(): - lon,lat = self.keras_model.predict([[p],[c]]) + p = np.array(p) + c = np.array(c) + # with sess.as_default(): + # with graph.as_default(): + lon,lat = self.keras_model.predict([[p],[c]]) return lon[0][0],lat[0][0] def wgs_coord(self,lon,lat): @@ -61,13 +98,19 @@ class Geocoder(object): ax.plot(lon,lat,marker='o', color='red', markersize=5) plt.show() +geocoder = Geocoder("outputs/LSTM_FR.txt_100_4_0.002_None_A_I_C.h5","./outputs/FR.txt_100_4_0.002_None_A_I_C_index") +top,topc = "Paris","Cherbourg" +lon,lat = geocoder.get_coord(top,topc) +lon,lat = geocoder.wgs_coord(lon,lat) 
+geocoder.plot_coord("{0},{1}".format(top,topc),lat,lon) + if __name__ == "__main__": from flask import Flask, escape, request, render_template app = Flask(__name__) - geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt") + geocoder = Geocoder("outputs/LSTM_FR.txt_100_4_0.002_None_A_I_C.h5","./outputs/FR.txt_100_4_0.002_None_A_I_C_index") @app.route('/',methods=["GET"]) def display(): diff --git a/train_test_split_cooccurrence_data.py b/train_test_split_cooccurrence_data.py index 4748f3edf1813f2dcebe90f5febc68a04490127b..a5366c5839f46be6505d53918121e61b8b0a53c7 100644 --- a/train_test_split_cooccurrence_data.py +++ b/train_test_split_cooccurrence_data.py @@ -13,7 +13,7 @@ logging.basicConfig( from sklearn.model_selection import train_test_split from shapely.geometry import Point -from utils import Grid +from lib.geo import Grid from tqdm import tqdm diff --git a/train_test_split_geonames.py b/train_test_split_geonames.py index ff87967ed111a34283b9ef6fd0623b9eb953e59b..585d26722b3fbe52bd33c564dfa4c187281f48cd 100644 --- a/train_test_split_geonames.py +++ b/train_test_split_geonames.py @@ -14,7 +14,7 @@ logging.basicConfig( from sklearn.model_selection import train_test_split from shapely.geometry import Point -from utils import Grid +from lib.geo import Grid from helpers import read_geonames from tqdm import tqdm diff --git a/utils.py b/utils.py deleted file mode 100644 index db250b77474f8e1a135a373b76461dad485f88c1..0000000000000000000000000000000000000000 --- a/utils.py +++ /dev/null @@ -1,614 +0,0 @@ -# Basic import -import math -import argparse -import os -import json - -# Data Structure -import numpy as np -import geopandas as gpd -from shapely.geometry import Point,box - -# NLP -from nltk.tokenize import word_tokenize -from ngram import NGram - -# Machine learning -from gensim.models import Word2Vec - -# Visualisation and parallelisation -from tqdm import tqdm - - -class TokenizerCustom(): - def __init__(self,vocab): - self.word_index = {vocab[i]:i for i in range(len(vocab))} - self.index_word = {i:vocab[i] for i in range(len(vocab))} - self.N = len(self.index_word) - def texts_to_sequences(self,listText): - seqs = [] - for text in listText: - seqs.append([self.word_index[word] for word in word_tokenize(text) if word in self.word_index]) - return seqs - - -class CoordinatesEncoder: - """ - Will be replaced by Grid in grid2.py - """ - def __init__(self, cell_size_lat=0.5, cell_size_lon=0.5): - self.min_lon = -180 - self.max_lon = -(self.min_lon) #  Symetric - self.min_lat = -90 - self.max_lat = -(self.min_lat) # Symetric - - self.ecart_lat = self.max_lat - self.min_lat - self.ecart_lon = self.max_lon - self.min_lon - - self.cell_size_lat = cell_size_lat - self.cell_size_lon = cell_size_lon - - self.unit_size_lat = self.ecart_lat / self.cell_size_lat - self.unit_size_lon = self.ecart_lon / self.cell_size_lon - - def encode(self, lat, lon): - return ( - math.floor(((lat + self.max_lat) / self.ecart_lat) * self.unit_size_lat), - math.floor(((lon + self.max_lon) / self.ecart_lon) * (self.unit_size_lon)) - ) - - def number_lat_cell(self): - return int(self.unit_size_lat) - - def number_lon_cell(self): - return int(self.unit_size_lon) - - def oneDimensionOutputSize(self): - return self.number_lat_cell() * self.number_lon_cell() - - def vector(self, lat, lon): - lat_v, lon_v = np.zeros(self.number_lat_cell()), np.zeros(self.number_lon_cell()) - new_coords = self.encode(lat, lon) - lat_v[int(new_coords[0])] = 1 - lon_v[int(new_coords[1])] = 1 - 
return lat_v, lon_v - - def vector_flatten(self, lat, lon): - vec = np.zeros(self.oneDimensionOutputSize()) # 2D Dense softmax isn't possible - new_coords = self.encode(lat, lon) - pos = self.number_lat_cell() * (new_coords[0]) + new_coords[1] - vec[pos] = 1 # lon * lon size - return vec - - -class NgramIndex(): - """ - Class used for encoding words in ngram representation - """ - def __init__(self,n): - """ - Constructor - - Parameters - ---------- - n : int - ngram size - """ - self.ngram_gen = NGram(N=n) - - self.size = n - self.ngram_index = {"":0} - self.index_ngram = {0:""} - self.cpt = 0 - self.max_len = 0 - - def split_and_add(self,word): - """ - Split word in multiple ngram and add each one of them to the index - - Parameters - ---------- - word : str - a word - """ - ngrams = word.lower().replace(" ","$") - ngrams = list(self.ngram_gen.split(ngrams)) - [self.add(ngram) for ngram in ngrams] - - def add(self,ngram): - """ - Add a ngram to the index - - Parameters - ---------- - ngram : str - ngram - """ - if not ngram in self.ngram_index: - self.cpt+=1 - self.ngram_index[ngram]=self.cpt - self.index_ngram[self.cpt]=ngram - - def encode(self,word): - """ - Return a ngram representation of a word - - Parameters - ---------- - word : str - a word - - Returns - ------- - list of int - listfrom shapely.geometry import Point,box - of ngram index - """ - ngrams = word.lower().replace(" ","$") - ngrams = list(self.ngram_gen.split(ngrams)) - [self.add(ng) for ng in ngrams if not ng in self.ngram_index] - return [self.ngram_index[ng] for ng in ngrams] - - def complete(self,ngram_encoding,MAX_LEN,filling_item=0): - """ - Complete a ngram encoded version of word with void ngram. It's necessary for neural network. - - Parameters - ---------- - ngram_encoding : list of int - first encoding of a word - MAX_LEN : int - desired length of the encoding - filling_item : int, optional - ngram index you wish to use, by default 0 - - Returns - ------- - list of int - list of ngram index - """ - assert len(ngram_encoding) <= MAX_LEN - diff = MAX_LEN - len(ngram_encoding) - ngram_encoding.extend([filling_item]*diff) - return ngram_encoding - - def get_embedding_layer(self,texts,dim=100,**kwargs): - """ - Return an embedding matrix for each ngram using encoded texts. Using gensim.Word2vec model. - - Parameters - ---------- - texts : list of [list of int] - list of encoded word - dim : int, optional - embedding dimension, by default 100 - - Returns - ------- - np.array - embedding matrix - """ - model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs) - N = len(self.ngram_index) - embedding_matrix = np.zeros((N,dim)) - for i in range(N): - embedding_matrix[i] = model.wv[str(i)] - return embedding_matrix - - def save(self,fn): - """ - - Save the NgramIndex - - Parameters - ---------- - fn : str - output filename - """ - data = { - "ngram_size": self.size, - "ngram_index": self.ngram_index, - "cpt_state": self.cpt, - "max_len_state": self.max_len - } - json.dump(data,open(fn,'w')) - - @staticmethod - def load(fn): - """ - - Load a NgramIndex state from a file. 
- - Parameters - ---------- - fn : str - input filename - - Returns - ------- - NgramIndex - ngram index - - Raises - ------ - KeyError - raised if a required field does not appear in the input file - """ - try: - data = json.load(open(fn)) - except json.JSONDecodeError: - print("Data file must be a JSON") - for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]: - if not key in data: - raise KeyError("{0} field cannot be found in given file".format(key)) - new_obj = NgramIndex(data["ngram_size"]) - new_obj.ngram_index = data["ngram_index"] - new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()} - new_obj.cpt = data["cpt_state"] - new_obj.max_len = data["max_len_state"] - return new_obj - - -def zero_one_encoding(long,lat): - """ - Encode coordinates (WGS84) between 0 and 1 - - Parameters - ---------- - long : float - longitude value - lat : float - latitude value - - Returns - ------- - float,float - longitude, latitude - """ - return ((long + 180.0 ) / 360.0), ((lat + 90.0 ) / 180.0) - -def _split(lst,n,complete_chunk_value): - """ - Split a list into chunk of n-size. - - Parameters - ---------- - lst : list - input list - n : int - chunk size - complete_chunk_value : object - if last chunk size not equal to n, this value is used to complete it - - Returns - ------- - list - chunked list - """ - chunks = [lst[i:i + n] for i in range(0, len(lst), n)] - if not chunks:return chunks - if len(chunks[-1]) != n: - chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1]))) - return np.array(chunks) - -def generate_couple(object_list): - """ - Return a randomly selected couple from an object list. - - Parameters - ---------- - object_list : list - object list - - Returns - ------- - list - list of coupled object - """ - couples = [] - lst = np.arange(len(object_list)) - for _ in range(len(object_list)): - if len(lst) == 1: - break - idx = np.random.choice(np.arange(len(lst))) - idx2 = np.random.choice(np.arange(len(lst))) - while idx2 == idx: - idx2 = np.random.choice(np.arange(len(lst))) - couples.append([object_list[lst[idx]],object_list[lst[idx2]]]) - lst = np.delete(lst,idx) - return couples - -def _hash_couple(o1,o2): - """ - Return an hash for two object ids. - - Parameters - ---------- - o1 : str or int - id of the first objeeect - o2 : str of int - id of the second object - - Returns - ------- - str - hash - """ - return "|".join(map(str,sorted([int(o1),int(o2)]))) - - - -### GEO ADJAC BEGIN -class Cell(object): - """ - A cell is box placed in geeographical space. 
- """ - def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,x,y): - """ - Constructor - - Parameters - ---------- - object : [type] - [description] - upperleft_x : float - upperleft longitude - upperleft_y : float - upperleft latitude - bottomright_x : float - bottom right longitude - bottomright_y : float - bottom right latitude - x : int - cell x coordinates in the grid - y : int - cell y coordinates in the grid - """ - self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y - self.box_ = box(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y) - self.list_object={} # {id:Point(coord)} - - self.x,self.y = x, y - - def contains(self,lat,lon): - """ - Return true if the cell contains a point at given coordinates - - Parameters - ---------- - lat : float - latitude - lon : float - longitude - - Returns - ------- - bool - true if contains - """ - x,y = lon,lat - if x < self.upperleft_x or x > self.bottomright_x: - return False - if y < self.upperleft_y or y > self.bottomright_y: - return False - return True - - def add_object(self,id_,lat,lon): - """ - Connect an object to the cell - - Parameters - ---------- - id_ : int - id - lat : float - latitude - lon : float - longitude - """ - self.list_object[id_] = Point(lon,lat) - - def __repr__(self): - return "upperleft:{0}_{1}_;bottom_right:{2}_{3}".format(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y) - -class Grid(object): - """ - Define a grid - - """ - def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,cell_sub_div_index=[100,50]): - """ - Constructor - - Parameters - ---------- - upperleft_x : float - upperleft longitude - upperleft_y : float - upperleft latitude - bottomright_x : float - bottom right longitude - bottomright_y : float - bottom right latitude - cell_sub_div_index : list, optional - number of division in both latitude and longitude axis (longitude first), by default [100,50] - """ - self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y - - self.x_r = abs(self.bottomright_x - self.upperleft_x)/cell_sub_div_index[0] - self.y_r = abs(self.upperleft_y - self.bottomright_y )/cell_sub_div_index[1] - - self.c_x_r = self.x_r/cell_sub_div_index[0] # Redivide - self.c_y_r = self.y_r/cell_sub_div_index[1] - - self.cells = [] - self.inter_cells = [] - for i in range(cell_sub_div_index[1]): - self.cells.append([]) - for j in range(cell_sub_div_index[0]): - self.cells[-1].append(Cell( - self.upperleft_x+j*self.x_r, - self.upperleft_y+i*self.y_r, - self.upperleft_x+((j+1)*self.x_r), - self.upperleft_y+((i+1)*self.y_r), - j,i) - ) - dec_y = 0 - for i in range(cell_sub_div_index[1]): - self.inter_cells.append([]) - dec_x = 0 - for j in range(cell_sub_div_index[0]): - self.inter_cells[-1].append(Cell( - self.upperleft_x+(j*self.x_r)-self.c_x_r, # TOP - self.upperleft_y+(i*self.y_r)-dec_y, - self.upperleft_x+((j+1)*self.x_r)-self.c_x_r,#(self.u_pos*self.c_x_r), - self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, - j,i) - ) - self.inter_cells[-1].append(Cell( - self.upperleft_x+(j*self.x_r)-self.c_x_r, # CENTER - self.upperleft_y+(i*self.y_r)-self.c_y_r, - self.upperleft_x+((j+1)*self.x_r)+self.c_x_r, - self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, - j,i) - ) - self.inter_cells[-1].append(Cell( - self.upperleft_x+(j*self.x_r)+dec_x, # CENTER - self.upperleft_y+(i*self.y_r)-self.c_y_r, - 
self.upperleft_x+((j+1)*self.x_r)-self.c_x_r, #LEFT - self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, - j,i) - ) - dec_x = self.c_x_r - dec_y = self.c_y_r - - def fit_data(self,data = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))): - """ - - To avoid unnecessary check when connecting an entity to one or multiple cells, we - filter cells that does not appears in our geographic context (here countries surface). - - Parameters - ---------- - data : GeoDataFrame - geographic context - """ - world = data - world["nn"] = 1 - dissolved = world.dissolve(by="nn").iloc[0].geometry - new_cells= [] - new_inter_cells=[] - for i in tqdm(range(len(self.cells))): - for j in range(len(self.cells[i])): - if dissolved.intersects(self.cells[i][j].box_): - new_cells.append(self.cells[i][j]) - new_inter_cells.extend(self.inter_cells[i][j*3:(j+1)*3]) - - self.cells=new_cells - self.inter_cells = new_inter_cells - - - def __add__(self,a): - """ - Add an object to the grid - - Parameters - ---------- - a : tuple - (id, latitude, longitude) - """ - for c1 in range(len(self.cells)): - if self.cells[c1].contains(a[1],a[2]): - self.cells[c1].add_object(*a) - - for c1 in range(len(self.inter_cells)): - if self.inter_cells[c1].contains(a[1],a[2]): - self.inter_cells[c1].add_object(*a) - - def get_adjacent_relationships(self,random_iteration=10): - """ - Return a list of adjacent relationships founds in each cell. - - Parameters - ---------- - random_iteration : int, optional - number of iteration for random selection of adjacency relationships, by default 10 - - Returns - ------- - list - adjacency relationships - """ - relationships = set([]) - for c1 in tqdm(range(len(self.cells))): - for i in range(random_iteration): - for t in generate_couple(list(self.cells[c1].list_object.keys())): - relationships.add(_hash_couple(t[0],t[1])) - - for c1 in tqdm(range(len(self.inter_cells))): - for i in range(random_iteration): - for t in generate_couple(list(self.inter_cells[c1].list_object.keys())): - relationships.add(_hash_couple(t[0],t[1])) - return relationships - - -### GEO ADJAC END - -class ConfigurationReader(object): - def __init__(self,configuration_file): - if not os.path.exists(configuration_file): - raise FileNotFoundError("'{0} file could not be found ! 
'".format(configuration_file)) - - self.configuration = json.load(open(configuration_file)) - - self.__argparser_desc = ("" if not "description" in self.configuration else self.configuration["description"]) - self.parser = argparse.ArgumentParser(description=self.__argparser_desc) - - self.parse_conf() - - def parse_conf(self): - if not "args" in self.configuration: - raise argparse.ArgumentError("","No args given in the configuration file") - - for dict_args in self.configuration["args"]: - if not isinstance(dict_args,dict): - raise ValueError("Args must be dictionnary") - - short_command = dict_args.get("short",None) - long_command = dict_args.get("long",None) - - if not short_command and not long_command: - raise ValueError("No command name was given !") - - add_func_dict_= {} - if "help" in dict_args: - add_func_dict_["help"]= dict_args["help"] - if "default" in dict_args: - add_func_dict_["default"]= dict_args["default"] - if "action" in dict_args: - add_func_dict_["action"]= dict_args["action"] - if "type" in dict_args: - add_func_dict_["type"]= eval(dict_args["type"]) - if "choices" in dict_args: - add_func_dict_["choices"]= dict_args["choices"] - - if not (short_command and long_command): - command = (short_command if not long_command else long_command) - self.parser.add_argument(command,**add_func_dict_) - - elif long_command and short_command: - self.parser.add_argument(short_command,long_command,**add_func_dict_) - - def parse_args(self,input_=None): - if not input_: - return self.parser.parse_args() - return self.parser.parse_args(input_) - - - -if __name__ == "__main__": - - index = NgramIndex(3) - index.split_and_add("J'aime le paté") - encoding = index.encode("xxxyyyy") - index.complete(encoding,10) \ No newline at end of file
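
Usage note (not part of the patch): a minimal sketch of the relocated lib.ngram_index.NgramIndex API introduced above, mirroring how combination_embeddings.py builds and saves the index (INDEX_FN) and how predict_toponym_coordinates.py reloads it. The n-gram size and file path below are illustrative assumptions, not values taken from the diff.

    from lib.ngram_index import NgramIndex

    index = NgramIndex(4)                 # n-gram size, i.e. the -n/--ngram-size argument (4 is an assumed value)
    index.split_and_add("Paris")          # register every n-gram of a toponym and update index.max_len
    index.split_and_add("Cherbourg")

    # encode() already pads to index.max_len; complete() is kept here for parity with predict_toponym_coordinates.py
    encoded = index.complete(index.encode("Paris"), index.max_len)

    index.save("outputs/demo_index")      # illustrative path; the training script writes INDEX_FN
    restored = NgramIndex.load("outputs/demo_index")
    assert restored.max_len == index.max_len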