# Base module import os import sys from argparse import ArgumentParser import json # Structure import pandas as pd import numpy as np import geopandas as gpd # DEEPL module from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences from keras.utils import to_categorical from keras.layers import Dense, Input, GlobalMaxPooling1D from keras.layers import Conv1D, MaxPooling1D, Embedding from keras.layers import Add,concatenate,Dropout from keras.models import Model from keras.initializers import Constant from keras.layers import GlobalAveragePooling1D,Bidirectional,LSTM,Average, Flatten, Conv1D from keras import backend as K import tensorflow as tf # Geometry from shapely.geometry import Point # Custom module from helpers import read_geonames from utils import Grid from utils import zero_one_encoding, NgramIndex,ConfigurationReader # Visualisation module import matplotlib.pyplot as plt from tqdm import tqdm as tqdm_base def tqdm(*args, **kwargs): if hasattr(tqdm_base, '_instances'): for instance in list(tqdm_base._instances): tqdm_base._decr_instances(instance) return tqdm_base(*args, **kwargs) # Logging import logging from chrono import Chronometer logging.basicConfig( format='[%(asctime)s][%(levelname)s] %(message)s ', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO ) chrono = Chronometer() args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args("-i -a -n 2 -t 0.002 -e 5 -m CNN data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split()) GEONAME_FN = args.geoname_input GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input NGRAM_SIZE = args.ngram_size ACCURACY_TOLERANCE = args.tolerance_value CONV, LSTM_train = False,False if args.model == "CNN": CONV = True else: LSTM_train = True EPOCHS = args.epochs # LOAD DATA logging.info("Load Geonames data...") geoname_data = read_geonames(GEONAME_FN).fillna("") hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("") logging.info("Geonames data loaded!") # SELECT ENTRY with class == to A and P (Areas and Populated Places) filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places # RETRIEVE ADJACENCY filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) filtered = gpd.GeoDataFrame(filtered) filtered["i"]=1 bounds = filtered.dissolve("i").bounds.values[0] rel_dict ={} if args.adjacency: fn = "{0}_adjacency.json".format(GEONAME_FN.split("/")[-1]) if not os.path.exists(fn): g = Grid(*bounds,[360,180]) g.fit_data(filtered) [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))] rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships()])) json.dump(rel_dict,open(fn,'w')) else: rel_dict.update(json.load(open(fn,'w'))) if args.inclusion: # RETRIEVE INCLUSION RELATIONSHIPS logging.info("Retrieve inclusion relationships ! ") geoname2name = dict(filtered["geonameid name".split()].values) filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name)) rel_dict.update(dict(hierarchy_data[filter_mask]["childId parentId".split()].values)) logging.info("{0} inclusion relationships retrieved ! ".format(len(hierarchy_data[filter_mask]))) # ENCODING NAME USING N-GRAM SPLITTING logging.info("Encoding toponyms to ngram...") index = NgramIndex(NGRAM_SIZE) filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all ngram available filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding max_len = filtered.encode_name.apply(len).max() # Retrieve the encodings max length filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Expend encodings with size < max_len geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) #init a dict with the 'geonameid' --> 'encoded toponym' association logging.info("Done !") #CLEAR RAM del hierarchy_data del geoname_data # Encode each geonames entry coordinates filtered["cell_vec"]=filtered.apply( lambda x : zero_one_encoding(x.longitude,x.latitude), axis=1 ) geoname_vec = dict(filtered["geonameid cell_vec".split()].values) # CLEAR RAM del filtered embedding_dim = 256 num_words = len(index.index_ngram) # necessary for the embedding matrix logging.info("Preparing Input and Output data...") X_1,X_2,y_lat,y_lon=[],[],[],[] X_3 = [] for geonameId_1,geonameId_2 in rel_dict.items(): if not geonameId_2 in rel_dict: continue geonameId_3 = rel_dict[geonameId_2] # top3 = geoname2encodedname[geonameId_3] # X_3.append(top3) top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] X_1.append(top1) X_2.append(top2) y_lon.append(geoname_vec[geonameId_1][0]) y_lat.append(geoname_vec[geonameId_1][1]) # NUMPYZE inputs and output lists X_1 = np.array(X_1) X_2 = np.array(X_2) X_3 = np.array(X_3) y_lat = np.array(y_lat) y_lon = np.array(y_lon) logging.info("Data prepared !") def accuracy_at_k(y_true, y_pred): """ Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible task for neural network to obtain the exact coordinate. Parameters ---------- y_true : tf.Tensor truth data y_pred : tf.Tensor predicted output """ diff = y_true - y_pred fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE)) return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred) name = "{0}_{1}_{2}_{3}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE) logging.info("Generating N-GRAM Embedding...") embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim) logging.info("Embedding generated !") if LSTM_train: name = "LSTM_"+ name input_1 = Input(shape=(max_len,)) input_2 = Input(shape=(max_len,)) #input_3 = Input(shape=(1,)) embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True) x1 = Bidirectional(LSTM(10))(embedding_layer(input_1)) x2 = Bidirectional(LSTM(10))(embedding_layer(input_2)) x = concatenate([x1,x2])#,x3]) x = Dense(500,activation="relu")(x) x = Dropout(0.3)(x) x = Dense(500,activation="relu")(x) x = Dropout(0.3)(x) output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x) output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x) model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k]) history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=EPOCHS,validation_split=0.3) if CONV : name = "CONV_"+ name input_1 = Input(shape=(max_len,)) input_2 = Input(shape=(max_len,)) #input_3 = Input(shape=(1,)) embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len, trainable=True) x1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer(input_1)) x1 = Dropout(0.5)(x1) x1 = MaxPooling1D(pool_size=2)(x1) x1 = Flatten()(x1) x2 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer(input_2)) x2 = Dropout(0.5)(x2) x2 = MaxPooling1D(pool_size=2)(x2) x2 = Flatten()(x2) # x1 = Bidirectional(LSTM(max_len))(embedding_layer(input_1)) # x2 = Bidirectional(LSTM(max_len))(embedding_layer(input_2)) x = concatenate([x1,x2])#,x3]) x = Dense(500,activation="relu")(x) x = Dropout(0.3)(x) x = Dense(500,activation="relu")(x) x = Dropout(0.3)(x) output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x) output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x) model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k]) history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=EPOCHS,validation_split=0.3) hist_df = pd.DataFrame(history.history) hist_df.to_csv("outputs/{0}.csv".format(name))