Commit b77a89a5 authored by Jacques Fize

Error in the accuracy@k computation resolved

parent c6cd0e18
@@ -18,7 +18,7 @@ from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Add,concatenate,Dropout
from keras.models import Model
from keras.initializers import Constant
from keras.layers import GlobalAveragePooling1D,Bidirectional,LSTM,Average, Flatten, Conv1D
from keras.layers import GlobalAveragePooling1D,Bidirectional,LSTM,Average, Flatten, Conv1D, Conv2D
from keras import backend as K
import tensorflow as tf
@@ -46,7 +46,7 @@ from chrono import Chronometer
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
level=logging.INFO
)
chrono = Chronometer()
@@ -56,6 +56,8 @@ GEONAME_FN = args.geoname_input
GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.tolerance_value
EPOCHS = args.epochs
ITER_ADJACENCY = args.adjacency_iteration
CONV, LSTM_train = False,False
if args.model == "CNN":
@@ -63,35 +65,35 @@ if args.model == "CNN":
else:
LSTM_train = True
EPOCHS = args.epochs
# LOAD DATA
logging.info("Load Geonames data...")
geoname_data = read_geonames(GEONAME_FN).fillna("")
hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("")
train_indices,test_indices = pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values, pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values
train_indices,test_indices = set(train_indices),set(test_indices)
logging.info("Geonames data loaded!")
# SELECT ENTRY with class == to A and P (Areas and Populated Places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
# RETRIEVE ADJACENCY
# Geometry operation
filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
filtered = gpd.GeoDataFrame(filtered)
filtered["i"]=1
bounds = filtered.dissolve("i").bounds.values[0]
bounds = filtered.dissolve("i").bounds.values[0] # Required to get adjacency relationships
rel_dict ={}
if args.adjacency:
logging.info("Retrieve inclusion relationships ! ")
fn = "data/geonamesData/{0}_adjacency.json".format(GEONAME_FN.split("/")[-1])
fn = "data/geonamesData/{0}_{1}_adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY)
if not os.path.exists(fn):
g = Grid(*bounds,[360,180])
g.fit_data(filtered)
[g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))]
rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships()]))
rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)]))
json.dump(rel_dict,open(fn,'w'))
else:
logging.info("Open and load data from previous computation!")
@@ -137,28 +139,38 @@ num_words = len(index.index_ngram) # necessary for the embedding matrix
logging.info("Preparing Input and Output data...")
X_1,X_2,y_lat,y_lon=[],[],[],[]
X_3 = []
X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[]
X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[]
for geonameId_1,geonameId_2 in rel_dict.items():
if not geonameId_2 in rel_dict:
continue
geonameId_3 = rel_dict[geonameId_2]
# top3 = geoname2encodedname[geonameId_3]
# X_3.append(top3)
top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
X_1.append(top1)
X_2.append(top2)
if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
X_1_train.append(top1)
X_2_train.append(top2)
y_lon_train.append(geoname_vec[geonameId_1][0])
y_lat_train.append(geoname_vec[geonameId_1][1])
else:
X_1_test.append(top1)
X_2_test.append(top2)
y_lon.append(geoname_vec[geonameId_1][0])
y_lat.append(geoname_vec[geonameId_1][1])
y_lon_test.append(geoname_vec[geonameId_1][0])
y_lat_test.append(geoname_vec[geonameId_1][1])
# NUMPYZE inputs and output lists
X_1 = np.array(X_1)
X_2 = np.array(X_2)
X_3 = np.array(X_3)
y_lat = np.array(y_lat)
y_lon = np.array(y_lon)
X_1_train = np.array(X_1_train)
X_2_train = np.array(X_2_train)
y_lat_train = np.array(y_lat_train)
y_lon_train = np.array(y_lon_train)
X_1_test = np.array(X_1_test)
X_2_test = np.array(X_2_test)
y_lat_test = np.array(y_lat_test)
y_lon_test = np.array(y_lon_test)
logging.info("Data prepared !")
@@ -174,7 +186,7 @@ def accuracy_at_k(y_true, y_pred):
y_pred : tf.Tensor
predicted output
"""
diff = y_true - y_pred
diff = tf.abs(y_true - y_pred)
fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE))
return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred)
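This hunk is the fix the commit message refers to: with a signed difference, any over-prediction yields a large negative diff that always passes tf.less(diff, ACCURACY_TOLERANCE) and is wrongly counted as a hit. A simplified single-output sketch of the corrected metric, runnable standalone under TF 2.x eager execution (values are hypothetical):

```python
import tensorflow as tf

def accuracy_at_k(y_true, y_pred, tolerance=0.002):
    # Absolute error is the fix: a signed diff lets every over-prediction
    # (negative difference) slip under the tolerance threshold.
    diff = tf.abs(y_true - y_pred)
    fit = tf.where(tf.less(diff, tolerance))  # indices of in-tolerance predictions
    return tf.size(fit) / tf.size(y_pred)

y_true = tf.constant([0.5000, 0.5000, 0.5000])
y_pred = tf.constant([0.5010, 0.9000, 0.4995])
# ~0.667: first and last are within tolerance; with the old signed diff the
# second (0.5 - 0.9 = -0.4 < 0.002) would also count, giving a perfect score.
print(accuracy_at_k(y_true, y_pred))
```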
@@ -185,56 +197,57 @@ if args.inclusion:
name+="_I"
logging.info("Generating N-GRAM Embedding...")
embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim)
embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=50)
logging.info("Embedding generated !")
if LSTM_train:
name = "LSTM_"+ name
input_1 = Input(shape=(max_len,))
input_2 = Input(shape=(max_len,))
#input_3 = Input(shape=(1,))
embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
x1 = Bidirectional(LSTM(10))(embedding_layer(input_1))
x2 = Bidirectional(LSTM(10))(embedding_layer(input_2))
x1 = Bidirectional(LSTM(98))(embedding_layer(input_1))
x2 = Bidirectional(LSTM(98))(embedding_layer(input_2))
x = concatenate([x1,x2])#,x3])
x = Dense(500,activation="relu")(x)
x = Dropout(0.3)(x)
x = Dense(500,activation="relu")(x)
x = Dropout(0.3)(x)
x1 = Dense(500,activation="relu")(x)
#x1 = Dropout(0.3)(x1)
x1 = Dense(500,activation="relu")(x1)
#x1 = Dropout(0.3)(x1)
output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x)
output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x)
x2 = Dense(500,activation="relu")(x)
#x2 = Dropout(0.3)(x2)
x2 = Dense(500,activation="relu")(x2)
#x2 = Dropout(0.3)(x2)
output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k])
history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=EPOCHS,validation_split=0.3)
history = model.fit(x=[X_1_train,X_2_train], y=[y_lon_train,y_lat_train], verbose=True, batch_size=100, epochs=EPOCHS,validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]))
if CONV :
name = "CONV_"+ name
input_1 = Input(shape=(max_len,))
input_2 = Input(shape=(max_len,))
#input_3 = Input(shape=(1,))
embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len, weights=[embedding_weights],trainable=False)
embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)# weights=[embedding_weights],trainable=False)
x1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer(input_1))
x1 = Conv1D(filters=32, kernel_size=10, activation='relu')(embedding_layer(input_1))
x1 = Dropout(0.5)(x1)
x1 = MaxPooling1D(pool_size=2)(x1)
x1 = Flatten()(x1)
x2 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer(input_2))
x2 = Conv1D(filters=32, kernel_size=10, activation='relu')(embedding_layer(input_2))
x2 = Dropout(0.5)(x2)
x2 = MaxPooling1D(pool_size=2)(x2)
x2 = Flatten()(x2)
# x1 = Bidirectional(LSTM(max_len))(embedding_layer(input_1))
# x2 = Bidirectional(LSTM(max_len))(embedding_layer(input_2))
x = concatenate([x1,x2])#,x3])
x = concatenate([x1,x2])
x = Dense(500,activation="relu")(x)
x = Dropout(0.3)(x)
@@ -245,9 +258,9 @@ if CONV :
output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x)
model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
model.summary()
model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k])
history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=EPOCHS,validation_split=0.3)
history = model.fit(x=[X_1_train,X_2_train], y=[y_lon_train,y_lat_train], verbose=True, batch_size=100, epochs=EPOCHS,validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]))
hist_df = pd.DataFrame(history.history)
hist_df.to_csv("outputs/{0}.csv".format(name))
\ No newline at end of file
@@ -6,6 +6,7 @@
{ "short": "-v", "long": "--verbose", "action": "store_true" },
{ "short": "-i", "long": "--inclusion", "action": "store_true" },
{ "short": "-a", "long": "--adjacency", "action": "store_true" },
{"long": "--adjacency-iteration", "type":"int","default":10},
{ "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
{ "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
{ "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
pyroutelib3
#pyroutelib3
node2vec
osrm
#osrm
geopandas
pandas
numpy
@@ -14,4 +14,4 @@ tensorflow
keras
ngram
shapely
sqlitedict
\ No newline at end of file
sqlitedict
@@ -58,8 +58,8 @@ class NgramIndex():
ngram_encoding.extend([filling_item]*diff)
return ngram_encoding
def get_embedding_layer(self,texts,dim=100):
model = Word2Vec([[str(w) for w in t] for t in texts], size=dim, window=5, min_count=1, workers=4)
def get_embedding_layer(self,texts,dim=100,**kwargs):
model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
N = len(self.ngram_index)
embedding_matrix = np.zeros((N,dim))
for i in range(N):
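The hunk cuts off inside the loop that fills the embedding matrix. A filled-in sketch of the whole method after this change; the **kwargs pass-through is what lets the training script request iter=50, i.e. more training epochs in gensim's pre-4.0 Word2Vec API, and the loop body here is an assumption:

```python
import numpy as np
from gensim.models import Word2Vec

def get_embedding_layer(self, texts, dim=100, **kwargs):
    # **kwargs forwards Word2Vec options such as iter=50 (gensim < 4.0;
    # renamed to vector_size=/epochs= in gensim 4).
    model = Word2Vec([[str(w) for w in t] for t in texts],
                     size=dim, window=5, min_count=1, workers=4, **kwargs)
    N = len(self.ngram_index)
    embedding_matrix = np.zeros((N, dim))
    for i in range(N):
        if str(i) in model.wv:  # assumed guard: skip ids unseen during training
            embedding_matrix[i] = model.wv[str(i)]
    return embedding_matrix
```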