diff --git a/README.md b/README.md index 3eee5842dba0dccf55dd541f5655c33ea92186ee..a1ba1e30ae16b4daae480a39e23d14079ec63d20 100755 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ For Anaconda users <hr> -## First approach : Embedding using places Wikipedia pages +## Embedding using places Wikipedia pages <div style="text-align:center"> <img src="documentation/imgs/first_approach.png"/> @@ -86,84 +86,3 @@ The different outputs (on for each neural network architecture) are put in the ` * outputs/Bi-GRU_100dim_20epoch_1000batch__place_type.png : **place type accuracy plot** * outputs/Bi-GRU_100dim_20epoch_1000batch.csv : **training history** * outputs/Bi-GRU_100dim_20epoch_1000batch.txt : **embeddings** - -<hr> - -## 2nd Approach: Geonames place embedding - -From this point, we change our vantage point by focusing our model propositions by using heavily spatial/geographical data, in this context gazetteer. In this second approach, we propose to generate an embedding for places (not place's toponym) based on their topology. - -In order to do that, we use Geonames data to build a topology graph. This graph is generated based on intersection found between place buffer intersection. - -(image ici) - -Then, using topology network, we use node-embedding techniques to generate an embedding for each vertex (places). - -<div style="text-align:center"> -<img src="documentation/imgs/second_approach.png"/> -<p><strong>Figure 2</strong> : Second approach general workflow</p> -</div> - -### Generate the embedding - -First, download the Geonames dump : [here](https://download.geonames.org/export/dump/) - -*N.B.* We advise you to take only the data from one country ! Topology network can be really dense and large ! - - python3 geonames_embedding.py <geonames dump(*.txt)> - -### Available Parameters - -| Parameter | Description (default) | -|------------------------|-------------------------------------------------------------------| -| --nbcpu | Number of CPU used for during the learning phase | -| --vector-size | Embedding size | -| --walk-length | Generated walk length | -| --num-walks | Number of walks for each vertex (place) | -| --word2vec-window-size | Window-size used in Word2vec | -| --buffer-size | Buffer size used to detect adjacency relationships between places | -| -d | Integrate distances between places in the topology graph | -| --dist | Distance used if '-d' | - -### Output files - -Gensim word2vec format is saved in the execution directory. 
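As a quick sanity check, the saved vectors can be reloaded with gensim and queried by geonames id, reusing the loading pattern from `scripts/evalgeonamesembeddings.py` (a minimal sketch: the file name and the id below are placeholders, use the file written to your execution directory and any geonameid present in your dump):

    import gensim

    # Load the embedding written by geonames_embedding.py (placeholder file name)
    model = gensim.models.KeyedVectors.load("FR.txt.embedding.bin")

    place_id = "3038354"                            # any geonameid from the dump, as a string
    vector = model.wv[place_id]                     # embedding vector of that place
    print(model.wv.most_similar(place_id, topn=5))  # nearest places in the embedding space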
- -<hr> - -## Embedding : train using concatenation of close places - -<div style="text-align:center"> -<img src="documentation/imgs/third_approach.png"/> -<p><strong>Figure 3</strong> : Third approach general workflow</p> -</div> - - -### Prepare required data - - * download the Geonames data use to train the network [here](download.geonames.org/export/dump/) - * download the hierarchy data [here](http://download.geonames.org/export/dump/hierarchy.zip) - * unzip both file in the directory of your choice - * run the script `train_test_split_geonames.py <geoname_filename>` - -### Train the network - -The script `combination_embeddings.py` is the one responsible of the neural network training - -To train the network with default parameter use the following command : - - python3 combination_embeddings.py -a -i <geoname data filename> <hierarchy geonames data filename> - -### Available parameters - - -| Parameter | Description | -|----------------------|----------------------------------------------------------------------| -| -i,--inclusion | Use inclusion relationships to train the network | -| -a,--adjacency | Use adjacency relationships to train the network | -| -w,--wikipedia-coo | Use Wikipedia place co-occurrences to train the network | -| -n,--ngram-size | ngram size | -| -t,--tolerance-value | K-value in the computation of the accuracy@k | -| -e,--epochs | number of epochs | -| -d,--dimension | size of the ngram embeddings | -| --admin_code_1 | (Optional) If you wish to train the network on a specificate region | diff --git a/combination_embeddings.py b/combination_embeddings.py deleted file mode 100755 index c147f8064042bea90976c809d7ca42fe5012369e..0000000000000000000000000000000000000000 --- a/combination_embeddings.py +++ /dev/null @@ -1,329 +0,0 @@ -# Base module -import re -import os -import json - -#Â Structure -import pandas as pd -import numpy as np -import geopandas as gpd - -#Â DEEPL module -from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM -from keras.models import Model -from keras import backend as K -import tensorflow as tf - -#Â Geometry -from shapely.geometry import Point - -#Â Custom module -from helpers import read_geonames -from utils import Grid -from utils import zero_one_encoding, NgramIndex,ConfigurationReader -from metrics import lat_accuracy,lon_accuracy - -# Logging -from tqdm import tqdm -import logging -from helpers import Chronometer - - -def parse_title_wiki(title_wiki): - """ - Parse Wikipedia title - - Parameters - ---------- - title_wiki : str - wikipedia title - - Returns - ------- - str - parsed wikipedia title - """ - return re.sub("\(.*\)","",title_wiki).strip().lower() - -def get_new_ids(cooc_data,id_first_value): - """ - Return new ids from cooccurrence data - - Parameters - ---------- - cooc_data : pd.DataFrame - cooccurrence da - id_first_value : int - id beginning value - - Returns - ------- - dict - new ids for each toponyms - """ - topo_id = {} - id_ = id_first_value - for title in cooc_data.title.values: - if not title in topo_id: - id_+=1 - topo_id[id_]=title - for interlinks in cooc_data.interlinks.values: - for interlink in interlinks.split("|"): - if not interlink in topo_id: - id_+=1 - topo_id[id_]=interlink - return topo_id - - - -# LOGGING CONF -logging.basicConfig( - format='[%(asctime)s][%(levelname)s] %(message)s ', - datefmt='%m/%d/%Y %I:%M:%S %p', - level=logging.INFO - ) -chrono = Chronometer() - -args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\ - 
.parse_args()#("-n 4 -t 0.002 -e 20 -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split()) - -# Initialisee CONSTANTS -GEONAME_FN = args.geoname_input -GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input -NGRAM_SIZE = args.ngram_size -ACCURACY_TOLERANCE = args.tolerance_value -EPOCHS = args.epochs -ITER_ADJACENCY = args.adjacency_iteration -COOC_SAMPLING_NUMBER = 3 -WORDVEC_ITER = 50 - -# check for output dir -if not os.path.exists("outputs/"): - os.makedirs("outputs/") - -# LOAD Geonames DATA -logging.info("Load Geonames data...") -geoname_data = read_geonames(GEONAME_FN).fillna("") -hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("") - -train_indices,test_indices = pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values, pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values -train_indices,test_indices = set(train_indices),set(test_indices) - -logging.info("Geonames data loaded!") - -# SELECT ENTRY with class == to A and P (Areas and Populated Places) -filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() #Â Only take area and populated places - -# IF REGION (ONLY FR for now !) -admin_id_authorised_auth = "1 2 3 4 5 6 11 24 27 28 32 44 52 53 75 76 84 93 94".split() -region_fn = "" if args.admin_code_1 == None else "_"+args.admin_code_1 -if args.admin_code_1 != None and args.admin_code_1 in admin_id_authorised_auth: - filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() - -# REDUCE DATA STORED -filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD - -# Geometry operation -filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) -filtered = gpd.GeoDataFrame(filtered) -filtered["i"]=1 -bounds = filtered.dissolve("i").bounds.values[0] # Required to get adjacency relationships - - -rel_store = [] - -if args.adjacency: - # RETRIEVE ADJACENCY REL - logging.info("Retrieve adjacency relationships ! ") - fn = "data/geonamesData/{0}_{1}{2}adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY,region_fn) - if not os.path.exists(fn): - g = Grid(*bounds,[360,180]) - g.fit_data(filtered) - [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))] - rel_store.extend([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)]) - json.dump(rel_store,open(fn,'w')) - else: - logging.info("Open and load data from previous computation!") - rel_store=[[int(couple[0]),int(couple[1])] for couple in json.load(open(fn))] - logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) - -if args.inclusion: - # RETRIEVE INCLUSION RELATIONSHIPS - logging.info("Retrieve inclusion relationships ! ") - geonamesIDS = set(filtered.geonameid.values) - filter_mask = (hierarchy_data.childId.isin(geonamesIDS) & hierarchy_data.parentId.isin(geonamesIDS)) - rel_store.extend((hierarchy_data[filter_mask]["childId parentId".split()].values.tolist())) - logging.info("{0} inclusion relationships retrieved ! 
".format(len(hierarchy_data[filter_mask]))) - -del filtered["geometry"] - -if args.wikipedia_cooc: - logging.info("Load Wikipedia Cooccurrence data and merge with geonames") - COOC_FN = "./data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1] - cooc_data = pd.read_csv(COOC_FN,sep="\t") - cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) - cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) - id_wikipediatitle = get_new_ids(cooc_data,geoname_data.geonameid.max()) - wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} - title_coord = {row.title: (row.longitude,row.latitude) for _,row in cooc_data.iterrows()} - cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) - filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) - - train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") - train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) - test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) - - logging.info("Merged with Geonames data !") - - # EXTRACT rel - logging.info("Extracting cooccurrence relationships") - cpt=0 - for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): - for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER): - cpt+=1 - rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) - logging.info("Extract {0} cooccurrence relationships !".format(cpt)) - - -# STORE ID to name -geoname2name = dict(filtered["geonameid name".split()].values) - -# ENCODING NAME USING N-GRAM SPLITTING -logging.info("Encoding toponyms to ngram...") -index = NgramIndex(NGRAM_SIZE) -filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all ngram available -if args.wikipedia_cooc: - [index.split_and_add(k) for k in wikipediatitle_id] -filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding -max_len = filtered.encode_name.apply(len).max() #Â Retrieve the encodings max length -if args.wikipedia_cooc: - extension = {v:index.encode(k) for k,v in wikipediatitle_id.items()} - -index.max_len = int(max_len) #Â For Index state dump - -filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Expend encodings with size < max_len -if args.wikipedia_cooc: - extension = {k:index.complete(v,max_len) for k,v in extension.items()} -geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) #init a dict with the 'geonameid' --> 'encoded toponym' association - -if args.wikipedia_cooc: - geoname2encodedname.update(extension) - - -logging.info("Done !") - -#CLEAR RAM -del hierarchy_data -del geoname_data - -# Encode each geonames entry coordinates -filtered["cell_vec"]=filtered.apply( - lambda x : zero_one_encoding(x.longitude,x.latitude), - axis=1 - ) -geoname_vec = dict(filtered["geonameid cell_vec".split()].values) -# CLEAR RAM -del filtered - - -embedding_dim = 256 -num_words = len(index.index_ngram) # necessary for the embedding matrix - -logging.info("Preparing Input and Output data...") - -X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] -X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] - -cpt=0 -for couple in rel_store: - geonameId_1,geonameId_2 = couple[0],couple[1] - if not geonameId_1 in 
geoname2encodedname: - cpt+=1 - continue - top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] - if geonameId_1 in train_indices: #and geonameId_2 in train_indices: - - X_1_train.append(top1) - X_2_train.append(top2) - - y_lon_train.append(geoname_vec[geonameId_1][0]) - y_lat_train.append(geoname_vec[geonameId_1][1]) - - else: - X_1_test.append(top1) - X_2_test.append(top2) - - y_lon_test.append(geoname_vec[geonameId_1][0]) - y_lat_test.append(geoname_vec[geonameId_1][1]) - -# NUMPYZE inputs and output lists -X_1_train = np.array(X_1_train) -X_2_train = np.array(X_2_train) -y_lat_train = np.array(y_lat_train) -y_lon_train = np.array(y_lon_train) - -X_1_test = np.array(X_1_test) -X_2_test = np.array(X_2_test) -y_lat_test = np.array(y_lat_test) -y_lon_test = np.array(y_lon_test) - -logging.info("Data prepared !") - - -# OUTPUT FN BASE -name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn) -if args.adjacency: - name += "_A" -if args.inclusion: - name += "_I" -if args.wikipedia_cooc: - name += "_C" - -index.save("outputs/"+name+"_index") - - -# NGRAM EMBDEDDING -logging.info("Generating N-GRAM Embedding...") -embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=WORDVEC_ITER) -logging.info("Embedding generated !") - -# DEEP MODEL -name = "LSTM_"+ name -input_1 = Input(shape=(max_len,)) -input_2 = Input(shape=(max_len,)) - -embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True) - -x1 = Bidirectional(LSTM(98))(embedding_layer(input_1)) -x2 = Bidirectional(LSTM(98))(embedding_layer(input_2)) - -x = concatenate([x1,x2])#,x3]) - -x1 = Dense(500,activation="relu")(x) -#x1 = Dropout(0.3)(x1) -x1 = Dense(500,activation="relu")(x1) -#x1 = Dropout(0.3)(x1) - -x2 = Dense(500,activation="relu")(x) -#x2 = Dropout(0.3)(x2) -x2 = Dense(500,activation="relu")(x2) -#x2 = Dropout(0.3)(x2) - -output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) -output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) - -model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 - -model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()}) -history = model.fit(x=[X_1_train,X_2_train], - y=[y_lon_train,y_lat_train], - verbose=True, batch_size=100, - epochs=EPOCHS, - validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test])) - - -hist_df = pd.DataFrame(history.history) -hist_df.to_csv("outputs/{0}.csv".format(name)) - -model.save("outputs/"+name+".h5") - diff --git a/documentation/imgs/second_approach.png b/documentation/imgs/second_approach.png deleted file mode 100755 index bdff5964c3796980e518eb0f9aa724bd836e0ca6..0000000000000000000000000000000000000000 Binary files a/documentation/imgs/second_approach.png and /dev/null differ diff --git a/documentation/imgs/third_approach.png b/documentation/imgs/third_approach.png deleted file mode 100755 index ea8e6aaa02e19084a61e346ebacff25139cc63cb..0000000000000000000000000000000000000000 Binary files a/documentation/imgs/third_approach.png and /dev/null differ diff --git a/metrics.py b/metrics.py deleted file mode 100755 index e82c54809aa2a6bece60cd74875140d3719c1ea6..0000000000000000000000000000000000000000 --- a/metrics.py +++ /dev/null @@ -1,37 +0,0 @@ -import tensorflow as tf - -def lat_accuracy(LAT_TOL =1/180.): - def 
accuracy_at_k_lat(y_true, y_pred): - """ - Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible - task for neural network to obtain the exact coordinate. - - Parameters - ---------- - y_true : tf.Tensor - truth data - y_pred : tf.Tensor - predicted output - """ - diff = tf.abs(y_true - y_pred) - fit = tf.dtypes.cast(tf.less(diff,LAT_TOL),tf.int64) - return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64) - return accuracy_at_k_lat - -def lon_accuracy(LON_TOL=1/360.): - def accuracy_at_k_lon(y_true, y_pred): - """ - Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible - task for neural network to obtain the exact coordinate. - - Parameters - ---------- - y_true : tf.Tensor - truth data - y_pred : tf.Tensor - predicted output - """ - diff = tf.abs(y_true - y_pred) - fit = tf.dtypes.cast(tf.less(diff,LON_TOL),tf.int64) - return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64) - return accuracy_at_k_lon \ No newline at end of file diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json deleted file mode 100755 index a2fd9f120b3e791f17948eba7d02b8e2a34116e3..0000000000000000000000000000000000000000 --- a/parser_config/toponym_combination_embedding.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "description": "Toponym Combination", - "args": [ - { "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." }, - { "short": "geoname_hierachy_input", "help": "Filepath of the Geonames file you want to use." }, - { "short": "-v", "long": "--verbose", "action": "store_true" }, - { "short": "-i", "long": "--inclusion", "action": "store_true" }, - { "short": "-a", "long": "--adjacency", "action": "store_true" }, - { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" }, - {"long": "--adjacency-iteration", "type":"int","default":1}, - { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 }, - { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, - { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, - { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, - { "long": "--admin_code_1", "default": "None" } - ] -} \ No newline at end of file diff --git a/predict_toponym_coordinates.py b/predict_toponym_coordinates.py deleted file mode 100755 index 5dcdb7f81a8fbc28826131b5d1680f3647bf6e68..0000000000000000000000000000000000000000 --- a/predict_toponym_coordinates.py +++ /dev/null @@ -1,80 +0,0 @@ -from keras.models import load_model -import tensorflow as tf -import keras.backend as K -from utils import NgramIndex - -from tensorflow.python.keras.backend import set_session -from tensorflow.python.keras.models import load_model - -sess = None -graph = None - -from metrics import lat_accuracy,lon_accuracy - -class Geocoder(object): - """ - >>>geocoder = Geocoder("LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","index_4gram_FR_backup.txt") - >>>lon,lat = geocoder.get_coord("Paris","New-York") - >>>lon,lat = geocoder.wgs_coord(lon,lat) - >>>geocoder.plot_coord("Paris,New-York",lat,lon) - - if you want an interactive map using leafletJS, set to True the `interactive_map` parameter of `Geocoder.plot_coord()` - """ - def __init__(self,keras_model_fn,ngram_index_file): - global sess - global graph - 
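        # A module-level session and graph are kept so that predictions made later from
        # other threads (e.g. the Flask request handlers in the __main__ block) run in the
        # graph the Keras model was loaded into; get_coord() re-enters them through
        # sess.as_default()/graph.as_default() before calling predict().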
sess = tf.compat.v1.Session() - graph = tf.compat.v1.get_default_graph() - set_session(sess) - self.keras_model = load_model(keras_model_fn,custom_objects={"lat_accuracy":lat_accuracy,"lon_accuracy":lon_accuracy}) - self.ngram_encoder = NgramIndex.load(ngram_index_file) - - def get_coord(self,toponym,context_toponym): - global sess - global graph - p = self.ngram_encoder.complete(self.ngram_encoder.encode(toponym),self.ngram_encoder.max_len) - c = self.ngram_encoder.complete(self.ngram_encoder.encode(context_toponym),self.ngram_encoder.max_len) - with sess.as_default(): - with graph.as_default(): - lon,lat = self.keras_model.predict([[p],[c]]) - return lon[0][0],lat[0][0] - - def wgs_coord(self,lon,lat): - return ((lon*360)-180),((lat*180)-90) - - def plot_coord(self,toponym,lat,lon,interactive_map=False,**kwargs): - if interactive_map: - import folium - import tempfile - import webbrowser - fp = tempfile.NamedTemporaryFile(delete=False) - m = folium.Map() - folium.Marker([lat, lon], popup=toponym).add_to(m) - m.save(fp.name) - webbrowser.open('file://' + fp.name) - else: - import matplotlib.pyplot as plt - import geopandas - fig, ax = plt.subplots(1,**kwargs) - world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) - world.plot(color='white', edgecolor='black',ax=ax) - ax.plot(lon,lat,marker='o', color='red', markersize=5) - plt.show() - -if __name__ == "__main__": - from flask import Flask, escape, request, render_template - - app = Flask(__name__) - - - geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt") - - @app.route('/',methods=["GET"]) - def display(): - toponym = request.args.get("top", "Paris") - c_toponym = request.args.get("c_top", "Cherbourg") - lon,lat = geocoder.get_coord(toponym,c_toponym) - lon,lat = geocoder.wgs_coord(lon,lat) - return render_template("skeleton.html",lat=lat,lon=lon) - - app.run(host='0.0.0.0') \ No newline at end of file diff --git a/scripts/classificationEmbeddings.py b/scripts/classificationEmbeddings.py deleted file mode 100755 index 2943861f3727b0838de8d77dea231d685c7cc15b..0000000000000000000000000000000000000000 --- a/scripts/classificationEmbeddings.py +++ /dev/null @@ -1,32 +0,0 @@ - -import pandas as pd -df= pd.read_csv("dbpediaPlaceClassification.csv") -import numpy as np -def loadGloveModel(gloveFile): - print("Loading Glove Model") - f = open(gloveFile,'r') - model = {} - for line in f: - splitLine = line.split() - word = splitLine[0] - embedding = np.array([float(val) for val in splitLine[1:]]) - model[word] = embedding - print("Done.",len(model)," words loaded!") - return model -model = loadGloveModel("data/glove/glove.6B.100d.txt") -def getEmb(x,model): - emb = np.zeros(100) - for word in x.split(): - word =word.lower() - if word in model: - emb+=model[word] - return emb - -df["embeddings"] = df["Place"].apply(lambda x : getEmb(x,model)) -df.to_msgpack("dbpediaPlaceEmbedding.msg") - -import json -data = json.load(open("classname.json")) -df2 = pd.DataFrame(data.items(),columns="WID label".split()) -df2["embeddings"] = df2["label"].apply(lambda x:getEmb(x,model)) -df2.to_msgpack("classnameEmbedding.msg") \ No newline at end of file diff --git a/scripts/evalgeonamesembeddings.py b/scripts/evalgeonamesembeddings.py deleted file mode 100755 index c7d346dd4a58940a1c0beb1e5f3a5782489b52bd..0000000000000000000000000000000000000000 --- a/scripts/evalgeonamesembeddings.py +++ /dev/null @@ -1,70 +0,0 @@ -# Evaluation process -import gensim -import glob -import re -import 
gensim -import random -from helpers import * -from scipy.spatial.distance import cosine -from shapely.geometry import Point -from scipy.stats.stats import pearsonr - -import pandas as pd -import geopandas as gpd - -from tqdm import tqdm - -NPAIR = 100000 -fns = glob.glob("data/embeddings/*.bin") - -def get_data(fn): - data = [int(x) for x in re.findall("\d+",fn)] - if not len(data) == 4: - return {"embedding_size":data[0], - "walk_length":data[1], - "number_of_walks":data[2], - "word2vec_window_size":data[3], - "filepath":fn, - "noise":data[4] - } - #raise Exception("filename should have 4 integers") - return { - "embedding_size":data[0], - "walk_length":data[1], - "number_of_walks":data[2], - "word2vec_window_size":data[3], - "filepath":fn - } - -df = read_geonames("./data/geonamesData/FR.txt") -df["geometry"] = df["latitude longitude".split()].apply(lambda x:Point(x.longitude,x.latitude),axis=1) - -# Create GeoDataFrame for faster spatial comparison operations -gdf = gpd.GeoDataFrame(df) - -# Select a sample that concerns the departement "La Manche" -manche_gdf = gdf[gdf.admin2_code == "50"].copy() - -df =pd.DataFrame([get_data(fn) for fn in fns]) - -def get_pearsons(model): - manche_gdf.loc[:,"geometry_centroid"]=manche_gdf.centroid - coords = dict(manche_gdf.loc[:,"geonameid geometry_centroid".split()].values) - places = list(coords.keys()) - geodesic_d = [] - embeddings_d = [] - for i in tqdm(range(NPAIR),disable=True): - placeA=random.choice(places) - placeB=random.choice(places) - geodesic_d.append(coords[placeA].distance(coords[placeB])) - embeddings_d.append(cosine(model.wv[str(placeA)],model.wv[str(placeB)])) - return pearsonr(geodesic_d , embeddings_d) # Compute Pearson correlation and associated p-value - -df["pearson"] = df.filepath.apply(lambda x : get_pearsons(gensim.models.KeyedVectors.load(x))[0]) -df.fillna(0,inplace=True) -df.plot.scatter(x="walk_length", y="pearson",c="noise",cmap='inferno') -plt.show() -df.plot.scatter(x="number_of_walks", y="pearson",c="noise",cmap='inferno') -plt.show() -df.plot.scatter(x="word2vec_window_size", y="pearson",c="noise",cmap='inferno') -plt.show() \ No newline at end of file diff --git a/scripts/evaluation-dbpedia-types.py b/scripts/evaluation-dbpedia-types.py deleted file mode 100755 index d1ef557279a1e7c062b45e8d8581c1b3b2137abc..0000000000000000000000000000000000000000 --- a/scripts/evaluation-dbpedia-types.py +++ /dev/null @@ -1,61 +0,0 @@ -import os -import random -import numpy as np -from owlready2 import * -from vincenty import vincenty -from scipy.stats.stats import pearsonr - -BASE_DIR = '' -GLOVE_DIR = os.path.join(BASE_DIR, '.') -NUM_PAIRS = 100000 - -print('Indexing word vectors downloaded from http://nlp.stanford.edu/data/glove.6B.zip.') -embeddings_index = {} -listText = [] -with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.new.txt')) as f: - for line in f: - word, coefs = line.split(maxsplit=1) - coefs = np.fromstring(coefs, 'f', sep=' ') - embeddings_index[word] = coefs - listText.append(word) -print('Found %s word vectors.' 
% len(embeddings_index)) - -print('Collecting data from DBPedia ontology, downloaded from http://downloads.dbpedia.org/2014/dbpedia_2014.owl.bz2.') -onto = get_ontology("dbpedia_2014.owl") -onto.load() -def retrive_desc( concept , old_names=[] ): - desc = list( concept.descendants( include_self=False) ) - names = list( [ re.sub(r'.+\.', '', repr(concept)) + "/" + re.sub(r'.+\.', '', repr(c)) for c in desc ] ) - names = [x for x in names if x not in old_names] - desc = [ desc[x] for x in range(len(names)) if names[x] not in old_names ] - new_desc = list(desc) - for i in desc: - n1, d1 = retrive_desc(i, names + old_names) - for j in range(len(n1)): - new_desc.append( d1[j] ) - names.append( re.sub(r'.+\.', '', repr(concept)) + "/" + n1[j] ) - return names, new_desc -names, _ = retrive_desc(onto.Place) -names = [ n.lower() for n in set(names) if re.sub(r'.+/', '', n.lower()) in embeddings_index ] # check if the name of the place type exists in the embeddings matrix - -print('Generating pairs of place names.') -name_pairs = [] -similarity_pairs = [] -distance_pairs = [] -for num in range(NUM_PAIRS): - name1 = random.choice(names) - name2 = random.choice(names) - if name1 == name2: continue - name_pairs.append( (name1,name2) ) - dist = 0.0 - n1 = name1.split('/') - n2 = name2.split('/') - for i in range(min(len(n1),len(n2))): - if n1[i] == n2[i]: dist += 1.0 - else: break - similarity_pairs.append(dist) # Similarity between the place types, given by the number of ancestors in common - distance_pairs.append( np.sqrt(np.sum((embeddings_index[re.sub(r'.+/', '', name1)] - embeddings_index[re.sub(r'.+/', '', name2)])**2)) ) # Euclidean distance between the embeddings - -result = pearsonr( distance_pairs , similarity_pairs) # Compute Pearson correlation and associated p-value -print(result) - diff --git a/scripts/evaluation-geonames.py b/scripts/evaluation-geonames.py deleted file mode 100755 index 3f3b847f8d9f980d2a29c4bfef574f85d44235fb..0000000000000000000000000000000000000000 --- a/scripts/evaluation-geonames.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -import random -import numpy as np -from vincenty import vincenty -from scipy.stats.stats import pearsonr - -BASE_DIR = '' -GLOVE_DIR = os.path.join(BASE_DIR, 'data/glove') -NUM_PAIRS = 100000 - -print('Indexing word vectors downloaded from http://nlp.stanford.edu/data/glove.6B.zip.') -embeddings_index = {} -listText = [] -with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.new.txt')) as f: - for line in f: - word, coefs = line.split(maxsplit=1) - coefs = np.fromstring(coefs, 'f', sep=' ') - embeddings_index[word] = coefs - listText.append(word) -print('Found %s word vectors.' 
% len(embeddings_index)) - -print('Collecting data from geonames downloaded from http://download.geonames.org/export/dump/allCountries.zip.') -file = open("data/geonamesData/allCountries.txt", "r") -placenames = { } -for line in file: - line = line.split("\t") - name = line[1].lower() - if " " in name or not(name in embeddings_index): # check if the main name exists in the embeddings matrix - names = line[3].split(",") - for n in names: - n = n.strip().lower() - if not(" " in n) and (n in embeddings_index): # if not, check if any of the alternative names exists in the embeddings matrix - name = n - break - if " " in name or not(name in embeddings_index): continue - placenames.update( { name : (float(line[4]), float(line[5])) } ) - -from scipy.spatial.distance import cosine -from tqdm import tqdm - -print('Generating pairs of place names.') -NUM_PAIRS = 1000 -name_pairs = [] -geo_distance_pairs = [] -distance_pairs = [] -for num in tqdm(range(NUM_PAIRS)): - name1 = random.choice(list(placenames.keys())) - name2 = random.choice(list(placenames.keys())) - if name1 == name2: continue - name_pairs.append( (name1,name2) ) - try: - distance_pairs.append(cosine(embeddings_index[name1], embeddings_index[name2])) - geo_distance_pairs.append( vincenty(placenames[name1], placenames[name2]) ) # Geospatial distance between the place names, given by Vincenty's geodetic formulae # Cosine distance between the embeddings - except: - pass -geo_distance_pairs= np.array(geo_distance_pairs).astype(float) -distance_pairs = np.nan_to_num(distance_pairs,nan=np.nanmax(distance_pairs)) -geo_distance_pairs = np.nan_to_num(geo_distance_pairs,nan=1) - - -result = pearsonr( geo_distance_pairs , distance_pairs) # Compute Pearson correlation and associated p-value -print(result) \ No newline at end of file diff --git a/scripts/extractWikidataClasseName.py b/scripts/extractWikidataClasseName.py deleted file mode 100755 index 10740194a303f026f53ff997124853e573f5f73e..0000000000000000000000000000000000000000 --- a/scripts/extractWikidataClasseName.py +++ /dev/null @@ -1,53 +0,0 @@ -import json -import argparse -import time - -from SPARQLWrapper import SPARQLWrapper,JSON -from urllib.request import HTTPError - -from tqdm import tqdm - -parser = argparse.ArgumentParser() -parser.add_argument("available_class_filename",help="JSON file that contains an array of string. Each string is a Wikidata id (e.g. Q30)") -parser.add_argument("output_filename") -args = parser.parse_args() - -ids= json.load(open(args.available_class_filename)) - -def get_label(id_wikidata): - sparql = SPARQLWrapper("https://query.wikidata.org/sparql") - sparql.setQuery(""" - PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> - PREFIX wd: <http://www.wikidata.org/entity/> - select * - where { - wd:"""+id_wikidata+ """ rdfs:label ?label . 
- FILTER (langMatches( lang(?label), "EN" ) ) - } - LIMIT 1""" ) - sparql.setReturnFormat(JSON) - results = sparql.query().convert() - time.sleep(0.1) - try: - return results["results"]["bindings"][0]["label"]["value"] - except: - return "" - - -t = 0 -dict_results = {} -progress_bar = tqdm(total=len(ids)) -while t<len(ids): - try: - dict_results[ids[t]]=get_label(ids[t]) - except HTTPError as e: - time.sleep(1) - continue - progress_bar.update(1) - t+=1 -progress_bar.close() - -json.dump(dict_results,open(args.output_filename,'w')) - - - diff --git a/scripts/filterDataWithtopNclasse.py b/scripts/filterDataWithtopNclasse.py deleted file mode 100755 index e516a3db60e9b72dce11ffe1c4bd0c210c51dd09..0000000000000000000000000000000000000000 --- a/scripts/filterDataWithtopNclasse.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import pandas as pd -import numpy as np - -TOPN = 100 - -df = pd.read_csv("data/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv",sep="\t",names="ID title url latitude longitude instance_of".split())[1:] -df = df.fillna("") -df["instance_of"] = df.instance_of.apply(lambda x: str(x).split("_")) - - -count = {} -for list_type in df.instance_of.values: - #print(list_type) - for type_ in list_type: - if not type_ in count: count[type_]=0 - count[type_]+=1 - -# modify count for CLASS IMPORTANT BUT NOT FREQUENT BECAUSE OF A HIGH GRANULARITY -to_increase = [ - "Q6256",#country - "Q5119",#capital - "Q27554677",#former capital - "Q10864048", # ADM. DIV. 1 - "Q13220204", # ADM. DIV. 2 - "Q13220204", # ADM. DIV. 3 - "Q14757767", # ADM. DIV. 4 - "Q82794" # geographic region -] -inf_ = np.max(list(count.values())) -for type_ in to_increase: - count[type_] = inf_+1 - - -print("Dataframe contains",len(df),"entities") - - - -count_df = df.from_dict(count,orient="index").reset_index().sort_values(0,ascending=False) - -class_filtered = set(count_df.head(TOPN)["index"].values) -#Q15640612 #5 -#Q22927291 #6 - - - -df = df[df.instance_of.apply(lambda x: sum(True for i in x if i in class_filtered)>0)] - -def getMostFrequentClass(x): - idx = np.argsort([count[i] for i in x])[-1] - return x[idx] - -df["type"] = df.instance_of.apply(getMostFrequentClass) -df.to_csv("data/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv_filteredTop{0}class".format(TOPN)) - - - diff --git a/scripts/getEmbeddingGeonamesPlacenames.py b/scripts/getEmbeddingGeonamesPlacenames.py deleted file mode 100755 index 0493858b80ee67795aef73b89fd7bddaf6e24c84..0000000000000000000000000000000000000000 --- a/scripts/getEmbeddingGeonamesPlacenames.py +++ /dev/null @@ -1,59 +0,0 @@ -import fasttext -print("Load Model Fasttext FR") -model = fasttext.load_model("./data/fasttext_FR/wiki.fr.bin") -print("Model Loaded !") - -import pandas as pd -def read_geonames(file): - dtypes_dict = { - 0: int, # geonameid - 1: str, # name - 2: str, # asciiname - 3: str, # alternatenames - 4: float, # latitude - 5: float, # longitude - 6: str, # feature class - 7: str, # feature code - 8: str, # country code - 9: str, # cc2 - 10: str, # admin1 code - 11: str, # admin2 code - 12: str, # admin3 code - 13: str, # admin4 code - 14: int, # population - 15: str, # elevation - 16: int, # dem (digital elevation model) - 17: str, # timezone - 18: str # modification date yyyy-MM-dd - } - rename_cols = { - 0:"geonameid", # geonameid - 1:"name", # name - 2:"asciiname", # asciiname - 3:"alternatenames", # alternatenames - 4:"latitude", # latitude - 5:"longitude", # longitude - 6:"feature_class", # feature class - 7:"feature_class", # 
feature code - 8:"country_code", # country code - 9:"cc2", # cc2 - 10:"admin1_code", # admin1 code - 11:"admin2_code", # admin2 code - 12:"admin3_code", # admin3 code - 13:"admin4_code", # admin4 code - 14:"population", # population - 15:"elevation", # elevation - 16:"dem", # dem (digital elevation model) - 17:"timezone", # timezone - 18:"modification_date" # modification date yyyy-MM-dd - } - data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False) - data.rename(columns=rename_cols,inplace=True) - return data -data = read_geonames("./data/geonamesData/FR.txt") -data= data.fillna("") -data = data[data.admin2_code == "50"] - -data["embedding"] = data["name"].apply(lambda x : model[x]) -print(data) -data.to_msgpack("geonamesFRWithEmbeddings.msg") \ No newline at end of file diff --git a/scripts/getWikidataTypesNames.py b/scripts/getWikidataTypesNames.py deleted file mode 100755 index 342fc469da7a287bdf997a8cb68c410b714e8f04..0000000000000000000000000000000000000000 --- a/scripts/getWikidataTypesNames.py +++ /dev/null @@ -1,42 +0,0 @@ -import subprocess,os,json -import numpy as np -import time - -import json -ids= json.load(open("classavailable.json")) - -from SPARQLWrapper import SPARQLWrapper,JSON - -def get_label(id_wikidata): - sparql = SPARQLWrapper("https://query.wikidata.org/sparql") - sparql.setQuery(""" - PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> - PREFIX wd: <http://www.wikidata.org/entity/> - select * - where { - wd:"""+id_wikidata+ """ rdfs:label ?label . - FILTER (langMatches( lang(?label), "EN" ) ) - } - LIMIT 1""" ) - sparql.setReturnFormat(JSON) - results = sparql.query().convert() - time.sleep(0.1) - try: - return results["results"]["bindings"][0]["label"]["value"] - except: - return "" - -from urllib.request import HTTPError -from tqdm import tqdm -t = 0 -dict_results = {} -pbar = tqdm(total=len(ids)) -while t<len(ids): - try: - dict_results[ids[t]]=get_label(ids[t]) - except HTTPError as e: - time.sleep(1) - continue - pbar.update(1) - t+=1 -pbar.close() \ No newline at end of file diff --git a/scripts/run_.sh b/scripts/run_.sh deleted file mode 100755 index a614a8bd9938a00d552ed4a530d151de7f49770c..0000000000000000000000000000000000000000 --- a/scripts/run_.sh +++ /dev/null @@ -1,7 +0,0 @@ -python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 100 --walk-length 30 --num-walks 200 --word2vec-window-size 30 -python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 200 --walk-length 30 --num-walks 200 --word2vec-window-size 30 -python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 300 --walk-length 30 --num-walks 200 --word2vec-window-size 30 -python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 400 --walk-length 30 --num-walks 200 --word2vec-window-size 30 -python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 500 --walk-length 30 --num-walks 200 --word2vec-window-size 30 -python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 600 --walk-length 30 --num-walks 200 --word2vec-window-size 30 -python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 700 --walk-length 30 --num-walks 200 --word2vec-window-size 30 \ No newline at end of file diff --git a/templates/cover.css b/templates/cover.css deleted file mode 100755 index 7c6d33cdd58d82b8936fd0209c691184883d5e67..0000000000000000000000000000000000000000 --- 
a/templates/cover.css +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Globals - */ - -/* Links */ -a, -a:focus, -a:hover { - color: #fff; -} - -/* Custom default button */ -.btn-secondary, -.btn-secondary:hover, -.btn-secondary:focus { - color: #333; - text-shadow: none; /* Prevent inheritance from `body` */ - background-color: #fff; - border: .05rem solid #fff; -} - - -/* - * Base structure - */ - -html, -body { - height: 100%; - background-color: #333; -} - -body { - display: -ms-flexbox; - display: flex; - color: #fff; - text-shadow: 0 .05rem .1rem rgba(0, 0, 0, .5); - box-shadow: inset 0 0 5rem rgba(0, 0, 0, .5); -} - -.cover-container { - max-width: 42em; -} - - -/* - * Header - */ -.masthead { - margin-bottom: 2rem; -} - -.masthead-brand { - margin-bottom: 0; -} - -.nav-masthead .nav-link { - padding: .25rem 0; - font-weight: 700; - color: rgba(255, 255, 255, .5); - background-color: transparent; - border-bottom: .25rem solid transparent; -} - -.nav-masthead .nav-link:hover, -.nav-masthead .nav-link:focus { - border-bottom-color: rgba(255, 255, 255, .25); -} - -.nav-masthead .nav-link + .nav-link { - margin-left: 1rem; -} - -.nav-masthead .active { - color: #fff; - border-bottom-color: #fff; -} - -@media (min-width: 48em) { - .masthead-brand { - float: left; - } - .nav-masthead { - float: right; - } -} - - -/* - * Cover - */ -.cover { - padding: 0 1.5rem; -} -.cover .btn-lg { - padding: .75rem 1.25rem; - font-weight: 700; -} - - -/* - * Footer - */ -.mastfoot { - color: rgba(255, 255, 255, .5); -} diff --git a/templates/skeleton.html b/templates/skeleton.html deleted file mode 100755 index 43fe21d207d3ebd53e955efdfc0ab9dedfa36081..0000000000000000000000000000000000000000 --- a/templates/skeleton.html +++ /dev/null @@ -1,88 +0,0 @@ -<!DOCTYPE html> -<html lang="en"> - -<head> - <meta charset="UTF-8"> - <meta name="viewport" content="width=auto, initial-scale=1.0"> - <meta http-equiv="X-UA-Compatible" content="ie=edge"> - <title>Geocoder Interface</title> - <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css" - integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" crossorigin="anonymous"> - - <!-- Load Leaflet --> - <link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" - integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" - crossorigin="" /> - <script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js" - integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA==" - crossorigin=""></script> -</head> - -<body> - <style> - body { - - } - - #mapid { - height: 400px; - width: 100%; - } - </style> - - <main class="container-fluid"> - <h1 style="text-align: center;color:white;text-shadow: 1px 1px 2px black;background-color: #999;">Geocoder Demo</h1> - <div id="mapid"></div> - <div class="container" style="background-color: white;padding: 5px;"> - <h2>Input</h2> - <form action="/" method="get"> - <div class="form-group"> - <label for="formGroupExampleInput">Toponym</label> - <input type="text" class="form-control" name="top" - placeholder="Paris"> - </div> - <div class="form-group"> - <label for="formGroupExampleInput2">Context Toponym</label> - <input type="text" class="form-control" name="c_top" - placeholder="Cherbourg"> - </div> - <button type="submit" class="btn btn-primary">Get Coords !</button> - </form> - </div> - </main> - - <!-- JS SCRIPTS --> - <script 
src="https://code.jquery.com/jquery-3.4.1.slim.min.js" - integrity="sha384-J6qa4849blE2+poT4WnyKhv5vZF5SrPo0iEjwBvKU7imGFAV0wwj1yYfoRSJoZ+n" - crossorigin="anonymous"></script> - <script src="https://cdn.jsdelivr.net/npm/popper.js@1.16.0/dist/umd/popper.min.js" - integrity="sha384-Q6E9RHvbIyZFJoft+2mJbHaEWldlvI9IOYy5n3zV9zzTtmI3UksdQRVvoxMfooAo" - crossorigin="anonymous"></script> - <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/js/bootstrap.min.js" - integrity="sha384-wfSDF2E50Y2D1uUdj0O3uMBJnjuUD4Ih7YwaYd1iqfktj0Uod8GCExl3Og8ifwB6" - crossorigin="anonymous"></script> - - <script> - - // Initialize the map - // [50, -0.1] are the latitude and longitude - // 4 is the zoom - // mapid is the id of the div where the map will appear - var mymap = L - .map('mapid') - .setView([50, -0.1], 4); - - // Add a tile to the map = a background. Comes from OpenStreetmap - L.tileLayer( - 'http://tile.stamen.com/toner/{z}/{x}/{y}.png', { - attribution: 'Map data © <a href="https://www.openstreetmap.org/">OpenStreetMap</a>', - maxZoom: 6, - }).addTo(mymap); - - var marker = L.marker([{{lat}}, {{lon}}]).addTo(mymap); - - - </script> -</body> - -</html> \ No newline at end of file diff --git a/train_test_split_cooccurrence_data.py b/train_test_split_cooccurrence_data.py deleted file mode 100755 index 4748f3edf1813f2dcebe90f5febc68a04490127b..0000000000000000000000000000000000000000 --- a/train_test_split_cooccurrence_data.py +++ /dev/null @@ -1,85 +0,0 @@ -import argparse - -import pandas as pd -import geopandas as gpd - -import logging -logging.basicConfig( - format='[%(asctime)s][%(levelname)s] %(message)s ', - datefmt='%m/%d/%Y %I:%M:%S %p', - level=logging.INFO - ) - -from sklearn.model_selection import train_test_split -from shapely.geometry import Point - -from utils import Grid - -from tqdm import tqdm - -parser = argparse.ArgumentParser() -parser.add_argument("cooccurrence_file") - -args = parser.parse_args("data/wikipedia/cooccurrence_FR.txt".split())#("data/geonamesData/FR.txt".split()) - -# LOAD DATAgeopandas -COOC_FN = args.cooccurrence_file - - - -logging.info("Load Cooc DATA data...") -cooc_data = pd.read_csv(COOC_FN,sep="\t").fillna("") -cooc_data["geometry"] = cooc_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) -cooc_data = gpd.GeoDataFrame(cooc_data) -logging.info("Cooc data loaded!") - -#Â World Shape bounds -world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) -world["nn"] = 1 -dissolved = world.dissolve(by="nn").iloc[0].geometry - -#Creating Grid -logging.info("Initializing Grid (360,180)...") -g = Grid(*dissolved.bounds,[360,180]) -logging.info("Fit Data to the Grid...") -g.fit_data(cooc_data) -logging.info("Placing place into the grid...") -[g+(row.title,row.latitude,row.longitude) for ix,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))] - -#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME -logging.info("Associate a cell number to each place in the Geoname Dataframe") -def foo(g,id_): - for ix,cell in enumerate(g.cells): - if id_ in cell.list_object: - return ix - -cooc_data["cat"] = cooc_data.title.apply(lambda x:foo(g,x)) - -# TRAIN AND TEST SPLIT -logging.info("Split Between Train and Test") - -# Cell can be empty -i=0 -while 1: - if len(cooc_data[cooc_data.cat == i])> 1: - X_train,X_test = train_test_split(cooc_data[cooc_data.cat == i]) - break - i+=1 - -for i in range(i+1,len(g.cells)): - try: - x_train,x_test = train_test_split(cooc_data[cooc_data.cat == i]) - X_train,X_test = 
pd.concat((X_train,x_train)),pd.concat((X_test,x_test)) - except Exception as e: - print(e) #print("Error",len(filtered[filtered.cat == i])) - -del X_train["geometry"] -del X_train["nn"] -del X_train["cat"] -del X_test["cat"] -del X_test["geometry"] -del X_test["nn"] -# SAVING THE DATA -logging.info("Saving Output !") -X_train.to_csv(COOC_FN+"_train.csv") -X_test.to_csv(COOC_FN+"_test.csv") \ No newline at end of file diff --git a/train_test_split_geonames.py b/train_test_split_geonames.py deleted file mode 100755 index ff87967ed111a34283b9ef6fd0623b9eb953e59b..0000000000000000000000000000000000000000 --- a/train_test_split_geonames.py +++ /dev/null @@ -1,92 +0,0 @@ -import argparse - -import numpy as np -import pandas as pd -import geopandas as gpd - -import logging -logging.basicConfig( - format='[%(asctime)s][%(levelname)s] %(message)s ', - datefmt='%m/%d/%Y %I:%M:%S %p', - level=logging.INFO - ) - -from sklearn.model_selection import train_test_split -from shapely.geometry import Point - -from utils import Grid -from helpers import read_geonames - -from tqdm import tqdm - -parser = argparse.ArgumentParser() -parser.add_argument("geoname_file") -parser.add_argument("--feature_classes",help="List of class",default="A P") - -args = parser.parse_args()#("data/geonamesData/FR.txt".split()) - -# LOAD DATAgeopandas -GEONAME_FN = args.geoname_file -FEATURE_CLASSES = args.feature_classes - - -logging.info("Load Geonames data...") -geoname_data = read_geonames(GEONAME_FN).fillna("") -geoname_data["geometry"] = geoname_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) -geoname_data = gpd.GeoDataFrame(geoname_data) -logging.info("Geonames data loaded!") - -# SELECT ENTRY with class == to A and P (Areas and Populated Places) -filtered = geoname_data[geoname_data.feature_class.isin(FEATURE_CLASSES.split())].copy() # Only take area and populated places - -#Â World Shape bounds -world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) -world["nn"] = 1 -dissolved = world.dissolve(by="nn").iloc[0].geometry - -#Creating Grid -logging.info("Initializing Grid (360,180)...") -g = Grid(*dissolved.bounds,[360,180]) -logging.info("Fit Data to the Grid...") -g.fit_data(filtered) -logging.info("Placing place into the grid...") -[g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered.iterrows(),total=len(filtered))] - -#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME -logging.info("Associate a cell number to each place in the Geoname Dataframe") -def foo(g,id_): - for ix,cell in enumerate(g.cells): - if id_ in cell.list_object: - return ix - -filtered["cat"] = filtered.geonameid.apply(lambda x:foo(g,x)) - -# TRAIN AND TEST SPLIT -logging.info("Split Between Train and Test") - -# Cell can be empty -i=0 -while 1: - if len(filtered[filtered.cat == i])> 1: - X_train,X_test = train_test_split(filtered[filtered.cat == i]) - break - i+=1 - -for i in range(i+1,len(g.cells)): - try: - x_train,x_test = train_test_split(filtered[filtered.cat == i]) - X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test)) - except: - pass #print("Error",len(filtered[filtered.cat == i])) - - -del X_train["geometry"] -del X_train["nn"] -del X_train["cat"] -del X_test["cat"] -del X_test["geometry"] -del X_test["nn"] -# SAVING THE DATA -logging.info("Saving Output !") -X_train.to_csv(GEONAME_FN+"_train.csv") -X_test.to_csv(GEONAME_FN+"_test.csv") \ No newline at end of file
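Both train/test split scripts above (Geonames and Wikipedia co-occurrence) perform the same stratified-by-cell split through a `while`/`try` loop; the snippet below is a compact equivalent, sketched under the assumption of a DataFrame `filtered` that already carries the `cat` column holding each place's grid-cell index:

    from sklearn.model_selection import train_test_split
    import pandas as pd

    # Split every grid cell separately, skipping cells too small to be split
    parts = [train_test_split(group) for _, group in filtered.groupby("cat") if len(group) > 1]
    X_train = pd.concat([train for train, _ in parts])
    X_test = pd.concat([test for _, test in parts])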