diff --git a/combination_embeddings.py b/combination_embeddings.py
index c459f59c8085cd5cc26dac5c4851a2de5fbbf47c..72a8f3b72e5c77ae51cdc1742c13c27b3f93d2e1 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -18,7 +18,7 @@ from keras.layers import Conv1D, MaxPooling1D, Embedding
 from keras.layers import Add,concatenate,Dropout
 from keras.models import Model
 from keras.initializers import Constant
-from keras.layers import GlobalAveragePooling1D,Bidirectional,LSTM,Average, Flatten, Conv1D
+from keras.layers import GlobalAveragePooling1D,Bidirectional,LSTM,Average, Flatten, Conv1D, Conv2D
 from keras import backend as K
 
 import tensorflow as tf
@@ -46,7 +46,7 @@ from chrono import Chronometer
 logging.basicConfig(
     format='[%(asctime)s][%(levelname)s] %(message)s ',
     datefmt='%m/%d/%Y %I:%M:%S %p',
-    level=logging.INFO
+    level=logging.INFO
     )
 chrono = Chronometer()
@@ -56,6 +56,8 @@ GEONAME_FN = args.geoname_input
 GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
 NGRAM_SIZE = args.ngram_size
 ACCURACY_TOLERANCE = args.tolerance_value
+EPOCHS = args.epochs
+ITER_ADJACENCY = args.adjacency_iteration
 
 CONV, LSTM_train = False,False
 if args.model == "CNN":
@@ -63,35 +65,35 @@ if args.model == "CNN":
 else:
     LSTM_train = True
 
-EPOCHS = args.epochs
-
 # LOAD DATA
 logging.info("Load Geonames data...")
 geoname_data = read_geonames(GEONAME_FN).fillna("")
 hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("")
+
+train_indices,test_indices = pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values, pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values
+train_indices,test_indices = set(train_indices),set(test_indices)
+
 logging.info("Geonames data loaded!")
 
 # SELECT ENTRY with class == to A and P (Areas and Populated Places)
 filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
-
-# RETRIEVE ADJACENCY
-
+# Geometry operation
 filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
 filtered = gpd.GeoDataFrame(filtered)
 filtered["i"]=1
-bounds = filtered.dissolve("i").bounds.values[0]
+bounds = filtered.dissolve("i").bounds.values[0] # Required to get adjacency relationships
 
 rel_dict ={}
 
 if args.adjacency:
     logging.info("Retrieve adjacency relationships !")
-    fn = "data/geonamesData/{0}_adjacency.json".format(GEONAME_FN.split("/")[-1])
+    fn = "data/geonamesData/{0}_{1}_adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY)
     if not os.path.exists(fn):
         g = Grid(*bounds,[360,180])
         g.fit_data(filtered)
         [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))]
-        rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships()]))
+        rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)]))
         json.dump(rel_dict,open(fn,'w'))
     else:
         logging.info("Open and load data from previous computation!")
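
The adjacency hunk above caches the grid-derived relationships in a JSON file whose name now also encodes ITER_ADJACENCY, so results computed with different iteration counts no longer overwrite each other. A minimal sketch of the compute-or-load pattern it relies on (build_relationships is a hypothetical stand-in for the Grid pass; illustration only):

import json
import os

def cached_relationships(fn, build_relationships):
    """Load cached geonameid pairs from `fn`, or build and cache them."""
    if os.path.exists(fn):
        with open(fn) as f:
            # json stores dict keys as strings; restore the integer geoname ids
            return {int(k): int(v) for k, v in json.load(f).items()}
    rel = build_relationships()  # e.g. the Grid-based computation above
    with open(fn, "w") as f:
        json.dump(rel, f)
    return rel
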
") - fn = "data/geonamesData/{0}_adjacency.json".format(GEONAME_FN.split("/")[-1]) + fn = "data/geonamesData/{0}_{1}_adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY) if not os.path.exists(fn): g = Grid(*bounds,[360,180]) g.fit_data(filtered) [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))] - rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships()])) + rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)])) json.dump(rel_dict,open(fn,'w')) else: logging.info("Open and load data from previous computation!") @@ -137,28 +139,38 @@ num_words = len(index.index_ngram) # necessary for the embedding matrix logging.info("Preparing Input and Output data...") -X_1,X_2,y_lat,y_lon=[],[],[],[] -X_3 = [] +X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] +X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] + for geonameId_1,geonameId_2 in rel_dict.items(): if not geonameId_2 in rel_dict: continue - geonameId_3 = rel_dict[geonameId_2] - # top3 = geoname2encodedname[geonameId_3] - # X_3.append(top3) - top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] - X_1.append(top1) - X_2.append(top2) + if geonameId_1 in train_indices: #and geonameId_2 in train_indices: + + X_1_train.append(top1) + X_2_train.append(top2) + + y_lon_train.append(geoname_vec[geonameId_1][0]) + y_lat_train.append(geoname_vec[geonameId_1][1]) + + else: + X_1_test.append(top1) + X_2_test.append(top2) - y_lon.append(geoname_vec[geonameId_1][0]) - y_lat.append(geoname_vec[geonameId_1][1]) + y_lon_test.append(geoname_vec[geonameId_1][0]) + y_lat_test.append(geoname_vec[geonameId_1][1]) # NUMPYZE inputs and output lists -X_1 = np.array(X_1) -X_2 = np.array(X_2) -X_3 = np.array(X_3) -y_lat = np.array(y_lat) -y_lon = np.array(y_lon) +X_1_train = np.array(X_1_train) +X_2_train = np.array(X_2_train) +y_lat_train = np.array(y_lat_train) +y_lon_train = np.array(y_lon_train) + +X_1_test = np.array(X_1_test) +X_2_test = np.array(X_2_test) +y_lat_test = np.array(y_lat_test) +y_lon_test = np.array(y_lon_test) logging.info("Data prepared !") @@ -174,7 +186,7 @@ def accuracy_at_k(y_true, y_pred): y_pred : tf.Tensor predicted output """ - diff = y_true - y_pred + diff = tf.abs(y_true - y_pred) fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE)) return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred) @@ -185,56 +197,57 @@ if args.inclusion: name+="_I" logging.info("Generating N-GRAM Embedding...") -embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim) +embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=50) logging.info("Embedding generated !") if LSTM_train: name = "LSTM_"+ name input_1 = Input(shape=(max_len,)) input_2 = Input(shape=(max_len,)) - #input_3 = Input(shape=(1,)) embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True) - x1 = Bidirectional(LSTM(10))(embedding_layer(input_1)) - x2 = Bidirectional(LSTM(10))(embedding_layer(input_2)) + x1 = Bidirectional(LSTM(98))(embedding_layer(input_1)) + x2 = Bidirectional(LSTM(98))(embedding_layer(input_2)) x = concatenate([x1,x2])#,x3]) - x = Dense(500,activation="relu")(x) - x = Dropout(0.3)(x) - x = Dense(500,activation="relu")(x) - x = Dropout(0.3)(x) + x1 = Dense(500,activation="relu")(x) + 
@@ -185,56 +197,57 @@ if args.inclusion:
     name+="_I"
 
 logging.info("Generating N-GRAM Embedding...")
-embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim)
+embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=50)
 logging.info("Embedding generated !")
 
 if LSTM_train:
     name = "LSTM_"+ name
     input_1 = Input(shape=(max_len,))
     input_2 = Input(shape=(max_len,))
-    #input_3 = Input(shape=(1,))
 
     embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
 
-    x1 = Bidirectional(LSTM(10))(embedding_layer(input_1))
-    x2 = Bidirectional(LSTM(10))(embedding_layer(input_2))
+    x1 = Bidirectional(LSTM(98))(embedding_layer(input_1))
+    x2 = Bidirectional(LSTM(98))(embedding_layer(input_2))
 
     x = concatenate([x1,x2])#,x3])
 
-    x = Dense(500,activation="relu")(x)
-    x = Dropout(0.3)(x)
-    x = Dense(500,activation="relu")(x)
-    x = Dropout(0.3)(x)
+    x1 = Dense(500,activation="relu")(x)
+    #x1 = Dropout(0.3)(x1)
+    x1 = Dense(500,activation="relu")(x1)
+    #x1 = Dropout(0.3)(x1)
 
-    output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x)
-    output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x)
+    x2 = Dense(500,activation="relu")(x)
+    #x2 = Dropout(0.3)(x2)
+    x2 = Dense(500,activation="relu")(x2)
+    #x2 = Dropout(0.3)(x2)
+
+    output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
+    output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
 
     model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
 
     model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k])
 
-    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=EPOCHS,validation_split=0.3)
+    history = model.fit(x=[X_1_train,X_2_train], y=[y_lon_train,y_lat_train], verbose=True, batch_size=100, epochs=EPOCHS,validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]))
 
 if CONV :
     name = "CONV_"+ name
     input_1 = Input(shape=(max_len,))
     input_2 = Input(shape=(max_len,))
-    #input_3 = Input(shape=(1,))
 
-    embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len, weights=[embedding_weights],trainable=False)
+    embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)
 
-    x1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer(input_1))
+    x1 = Conv1D(filters=32, kernel_size=10, activation='relu')(embedding_layer(input_1))
     x1 = Dropout(0.5)(x1)
     x1 = MaxPooling1D(pool_size=2)(x1)
     x1 = Flatten()(x1)
 
-    x2 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer(input_2))
+    x2 = Conv1D(filters=32, kernel_size=10, activation='relu')(embedding_layer(input_2))
     x2 = Dropout(0.5)(x2)
     x2 = MaxPooling1D(pool_size=2)(x2)
     x2 = Flatten()(x2)
 
-    # x1 = Bidirectional(LSTM(max_len))(embedding_layer(input_1))
-    # x2 = Bidirectional(LSTM(max_len))(embedding_layer(input_2))
-    x = concatenate([x1,x2])#,x3])
+    x = concatenate([x1,x2])
 
     x = Dense(500,activation="relu")(x)
     x = Dropout(0.3)(x)
@@ -245,9 +258,9 @@ if CONV :
     output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x)
 
     model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
-
+    model.summary()
     model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k])
 
-    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=EPOCHS,validation_split=0.3)
+    history = model.fit(x=[X_1_train,X_2_train], y=[y_lon_train,y_lat_train], verbose=True, batch_size=100, epochs=EPOCHS,validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]))
 
 hist_df = pd.DataFrame(history.history)
 hist_df.to_csv("outputs/{0}.csv".format(name))
\ No newline at end of file
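
Both branches now validate on the explicit geonameid-based split instead of validation_split=0.3, and the LSTM branch replaces the single shared dense stack with one 500-unit stack per coordinate. A self-contained sketch of that dual-head layout (placeholder sizes and random weights; an illustration of the design, not the training script itself):

from keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, concatenate
from keras.models import Model

max_len, num_words, embedding_dim = 15, 1000, 100  # placeholder sizes

input_1, input_2 = Input(shape=(max_len,)), Input(shape=(max_len,))
embed = Embedding(num_words, embedding_dim, input_length=max_len)

# fuse the two toponym encodings once...
x = concatenate([Bidirectional(LSTM(98))(embed(input_1)),
                 Bidirectional(LSTM(98))(embed(input_2))])

def coordinate_head(t):
    # ...then give each coordinate its own 500-500 stack, so the lon and
    # lat regressions no longer compete for the same shared features
    t = Dense(500, activation="relu")(t)
    return Dense(500, activation="relu")(t)

output_lon = Dense(1, activation="sigmoid", name="Output_LON")(coordinate_head(x))
output_lat = Dense(1, activation="sigmoid", name="Output_LAT")(coordinate_head(x))

model = Model(inputs=[input_1, input_2], outputs=[output_lon, output_lat])
model.compile(loss=["mean_squared_error", "mean_squared_error"], optimizer="adam")
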
"--tolerance-value", "type": "float", "default": 0.002 }, { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, diff --git a/requirements.txt b/requirements.txt index 35fa742aeaa7281e69134be28b8ed040b7050f44..8919e090bad02f5962b735aad51bee73b5e8694d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -pyroutelib3 +#pyroutelib3 node2vec -osrm +#osrm geopandas pandas numpy @@ -14,4 +14,4 @@ tensorflow keras ngram shapely -sqlitedict \ No newline at end of file +sqlitedict diff --git a/utils.py b/utils.py index 1827b58024a35e772a425800f8fa5665499325a4..a468525a265e54111a2d62da15aca544b17f0e04 100644 --- a/utils.py +++ b/utils.py @@ -58,8 +58,8 @@ class NgramIndex(): ngram_encoding.extend([filling_item]*diff) return ngram_encoding - def get_embedding_layer(self,texts,dim=100): - model = Word2Vec([[str(w) for w in t] for t in texts], size=dim, window=5, min_count=1, workers=4) + def get_embedding_layer(self,texts,dim=100,**kwargs): + model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs) N = len(self.ngram_index) embedding_matrix = np.zeros((N,dim)) for i in range(N):