diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..ddc1507dda5eebc7fbb67e9a3546f78022969b26
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,151 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+
+#### CUSTOM
+
+data/*
+deprecated/*
+*.ipynb_checkpoints
+notebooks/*
+outputs/*
+temp/*
+WikipediaExtract/*
+
+*.DS_Store
+test_comb.sh
+
+.vscode/*
+notes.md
+
+.idea/*
+.vscode/*
+other/*
+test*
\ No newline at end of file
diff --git a/README.md b/README.md
index 530b3d80c20df7912cbe3b5661ae793d9e74e714..3eee5842dba0dccf55dd541f5655c33ea92186ee 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,169 @@
-# Toponym Geocoding
+# Work on Place-embedding
-Use of Ngram representation and colocation of toponyms in geography and text for geocoding.
\ No newline at end of file
+This repo contains various approaches to geographic place embedding, and more precisely to its use for geocoding. So far, we have designed three approaches:
+
+ * Use of geographic places' Wikipedia pages to learn an embedding for toponyms
+ * Use of the Geonames place topology to produce an embedding using graph-embedding techniques
+ * Use of toponym co-location (combination) based on spatial relationships (inclusion, adjacency) for geocoding
+
+<hr>
+
+## Setup environment
+
+- Python 3.6+
+- OS independent (all dependencies work on Windows!)
+
+It is strongly advised to use Anaconda in a Windows environment!
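+
+For instance, you can first create a dedicated environment (the environment name below is just an example):
+
+    conda create -n place_embedding python=3.6
+    conda activate place_embedding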
+
+### Install dependencies
+
+    pip3 install -r requirements.txt
+
+For Anaconda users
+
+    while read requirement; do conda install --yes $requirement; done < requirements.txt
+
+<hr>
+
+## First approach: Embedding using places' Wikipedia pages
+
+<div style="text-align:center">
+<img src="documentation/imgs/first_approach.png"/>
+<p>Figure 1 : First approach general workflow</p>
+</div>
+
+In this first approach, the goal is to produce an embedding for each place name. To do so, we designed a neural network that takes:
+
+* **Input:** Text sequence (phrase)
+* **Output:** Latitude, longitude, and the place type
+
+Input texts are selected using Wikidata to filter Wikipedia pages about geographic places. Then, the filtered pages are retrieved from the Wikipedia corpus file. For each page, we keep:
+
+* Title
+* Introduction text
+* Coordinates of the place (latitude, longitude)
+* Place type (using a mapping between Wikidata and DBpedia Place subclasses)
+
+### Step 1: Parse Wikipedia data
+
+First, download the Wikipedia dump in the desired language, *e.g. enwiki-latest-pages-articles.xml.bz2*
+
+Then, use the `gensim` parser (doc [here](https://radimrehurek.com/gensim/scripts/segment_wiki.html)) with the following command:
+
+    python3 -m gensim.scripts.segment_wiki -i -f <wikipedia_dump_file> -o <output>
+
+### Step 2: Select and filter entities from Wikidata
+
+We use Wikidata to identify which Wikipedia pages concern a place. Simply run the following command:
+
+    python3 1_extractDataFromWikidata.py <Wikidata Dump (.gz)> <output_filename>
+
+### Step 3: Extract data from Wikipedia pages
+
+Using the previous output, we extract text data from the selected Wikipedia pages with the following command:
+
+    python3 2_extractLearningDataset.py <wikipedia_filename (output from step 1)> <wikidata_extract(output from step2)> <output_filename>
+
+### Step 4: Run the embedding extraction
+
+To learn the place embeddings, use the `embeddings_lat_lon_type.py` script.
+
+#### Available Parameters
+
+| Parameter | Description (default) |
+|------------------------|---------------------|
+| --max_sequence_length | Maximum sequence length (15) |
+| --embedding_dimension | Embedding vector size (100) |
+| --batch_size | Batch size used in the training (100) |
+| --epochs | Number of epochs (100) |
+| -v | Display the Keras verbose output |
+
+#### Output
+
+The different outputs (one for each neural network architecture) are written to the `outputs` directory:
+
+* outputs/Bi-GRU_100dim_20epoch_1000batch__coord.png : **coordinates accuracy plot**
+* outputs/Bi-GRU_100dim_20epoch_1000batch__place_type.png : **place type accuracy plot**
+* outputs/Bi-GRU_100dim_20epoch_1000batch.csv : **training history**
+* outputs/Bi-GRU_100dim_20epoch_1000batch.txt : **embeddings**
+
+<hr>
+
+## 2nd Approach: Geonames place embedding
+
+From this point on, we change our vantage point: our models rely heavily on spatial/geographical data, in this case a gazetteer. In this second approach, we propose to generate an embedding for places (not place toponyms) based on their topology.
+
+In order to do that, we use Geonames data to build a topology graph. This graph is generated from the intersections found between place buffers.
+
+(image here)
+
+Then, using this topology network, we apply node-embedding techniques to generate an embedding for each vertex (place); a minimal sketch is given below.
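+
+As an illustration, the node-embedding step can be reproduced with the `node2vec` package listed in `requirements.txt`. The snippet below is only a sketch: the graph construction and parameter values are assumptions, not the exact pipeline of `geonames_embedding.py`.
+
+    import networkx as nx
+    from node2vec import Node2Vec
+
+    # Toy topology graph: vertices are geonameids, edges are adjacency/inclusion
+    # relationships (the ids used here are purely illustrative)
+    graph = nx.Graph()
+    graph.add_edges_from([(1001, 1002), (1002, 1003), (1001, 1003)])
+
+    node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)
+    model = node2vec.fit(window=10, min_count=1)  # returns a gensim Word2Vec model
+    vector = model.wv["1001"]  # embedding of one place (node ids are handled as strings)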
+
+<div style="text-align:center">
+<img src="documentation/imgs/second_approach.png"/>
+<p><strong>Figure 2</strong> : Second approach general workflow</p>
+</div>
+
+### Generate the embedding
+
+First, download the Geonames dump [here](https://download.geonames.org/export/dump/)
+
+*N.B.* We advise you to take the data from one country only! The topology network can become really dense and large!
+
+    python3 geonames_embedding.py <geonames dump(*.txt)>
+
+### Available Parameters
+
+| Parameter | Description (default) |
+|------------------------|-------------------------------------------------------------------|
+| --nbcpu | Number of CPUs used during the learning phase |
+| --vector-size | Embedding size |
+| --walk-length | Generated walk length |
+| --num-walks | Number of walks for each vertex (place) |
+| --word2vec-window-size | Window size used in Word2vec |
+| --buffer-size | Buffer size used to detect adjacency relationships between places |
+| -d | Integrate distances between places in the topology graph |
+| --dist | Distance measure used if '-d' is set |
+
+### Output files
+
+The embeddings are saved in gensim word2vec format in the execution directory.
+
+<hr>
+
+## Embedding: train using concatenation of close places
+
+<div style="text-align:center">
+<img src="documentation/imgs/third_approach.png"/>
+<p><strong>Figure 3</strong> : Third approach general workflow</p>
+</div>
+
+
+### Prepare required data
+
+ * download the Geonames data used to train the network [here](https://download.geonames.org/export/dump/)
+ * download the hierarchy data [here](http://download.geonames.org/export/dump/hierarchy.zip)
+ * unzip both files in the directory of your choice
+ * run the script `train_test_split_geonames.py <geoname_filename>`
+
+### Train the network
+
+The script `combination_embeddings.py` is responsible for training the neural network.
+
+To train the network with default parameters, use the following command:
+
+    python3 combination_embeddings.py -a -i <geoname data filename> <hierarchy geonames data filename>
+
+### Available parameters
+
+
+| Parameter | Description |
+|----------------------|----------------------------------------------------------------------|
+| -i,--inclusion | Use inclusion relationships to train the network |
+| -a,--adjacency | Use adjacency relationships to train the network |
+| -w,--wikipedia-cooc | Use Wikipedia place co-occurrences to train the network |
+| -n,--ngram-size | n-gram size |
+| -t,--tolerance-value | K-value in the computation of the accuracy@k |
+| -e,--epochs | Number of epochs |
+| -d,--dimension | Size of the n-gram embeddings |
+| --admin_code_1 | (Optional) Train the network on a specific region only |
diff --git a/combination_embeddings.py b/combination_embeddings.py
new file mode 100644
index 0000000000000000000000000000000000000000..c147f8064042bea90976c809d7ca42fe5012369e
--- /dev/null
+++ b/combination_embeddings.py
@@ -0,0 +1,329 @@
+# Base module
+import re
+import os
+import json
+
+# Structure
+import pandas as pd
+import numpy as np
+import geopandas as gpd
+
+# DEEPL module
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM
+from keras.models import Model
+from keras import backend as K
+import tensorflow as tf
+
+# Geometry
+from shapely.geometry import Point
+
+# Custom module
+from helpers import read_geonames
+from utils import Grid
+from utils import zero_one_encoding, NgramIndex,ConfigurationReader
+from metrics import lat_accuracy,lon_accuracy
+
+# Logging
+from tqdm import tqdm
+import logging
+from helpers import Chronometer + + +def parse_title_wiki(title_wiki): + """ + Parse Wikipedia title + + Parameters + ---------- + title_wiki : str + wikipedia title + + Returns + ------- + str + parsed wikipedia title + """ + return re.sub("\(.*\)","",title_wiki).strip().lower() + +def get_new_ids(cooc_data,id_first_value): + """ + Return new ids from cooccurrence data + + Parameters + ---------- + cooc_data : pd.DataFrame + cooccurrence da + id_first_value : int + id beginning value + + Returns + ------- + dict + new ids for each toponyms + """ + topo_id = {} + id_ = id_first_value + for title in cooc_data.title.values: + if not title in topo_id: + id_+=1 + topo_id[id_]=title + for interlinks in cooc_data.interlinks.values: + for interlink in interlinks.split("|"): + if not interlink in topo_id: + id_+=1 + topo_id[id_]=interlink + return topo_id + + + +# LOGGING CONF +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) +chrono = Chronometer() + +args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\ + .parse_args()#("-n 4 -t 0.002 -e 20 -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split()) + +# Initialisee CONSTANTS +GEONAME_FN = args.geoname_input +GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input +NGRAM_SIZE = args.ngram_size +ACCURACY_TOLERANCE = args.tolerance_value +EPOCHS = args.epochs +ITER_ADJACENCY = args.adjacency_iteration +COOC_SAMPLING_NUMBER = 3 +WORDVEC_ITER = 50 + +# check for output dir +if not os.path.exists("outputs/"): + os.makedirs("outputs/") + +# LOAD Geonames DATA +logging.info("Load Geonames data...") +geoname_data = read_geonames(GEONAME_FN).fillna("") +hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("") + +train_indices,test_indices = pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values, pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values +train_indices,test_indices = set(train_indices),set(test_indices) + +logging.info("Geonames data loaded!") + +# SELECT ENTRY with class == to A and P (Areas and Populated Places) +filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places + +# IF REGION (ONLY FR for now !) +admin_id_authorised_auth = "1 2 3 4 5 6 11 24 27 28 32 44 52 53 75 76 84 93 94".split() +region_fn = "" if args.admin_code_1 == None else "_"+args.admin_code_1 +if args.admin_code_1 != None and args.admin_code_1 in admin_id_authorised_auth: + filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() + +# REDUCE DATA STORED +filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD + +# Geometry operation +filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) +filtered = gpd.GeoDataFrame(filtered) +filtered["i"]=1 +bounds = filtered.dissolve("i").bounds.values[0] # Required to get adjacency relationships + + +rel_store = [] + +if args.adjacency: + # RETRIEVE ADJACENCY REL + logging.info("Retrieve adjacency relationships ! 
") + fn = "data/geonamesData/{0}_{1}{2}adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY,region_fn) + if not os.path.exists(fn): + g = Grid(*bounds,[360,180]) + g.fit_data(filtered) + [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))] + rel_store.extend([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)]) + json.dump(rel_store,open(fn,'w')) + else: + logging.info("Open and load data from previous computation!") + rel_store=[[int(couple[0]),int(couple[1])] for couple in json.load(open(fn))] + logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) + +if args.inclusion: + # RETRIEVE INCLUSION RELATIONSHIPS + logging.info("Retrieve inclusion relationships ! ") + geonamesIDS = set(filtered.geonameid.values) + filter_mask = (hierarchy_data.childId.isin(geonamesIDS) & hierarchy_data.parentId.isin(geonamesIDS)) + rel_store.extend((hierarchy_data[filter_mask]["childId parentId".split()].values.tolist())) + logging.info("{0} inclusion relationships retrieved ! ".format(len(hierarchy_data[filter_mask]))) + +del filtered["geometry"] + +if args.wikipedia_cooc: + logging.info("Load Wikipedia Cooccurrence data and merge with geonames") + COOC_FN = "./data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1] + cooc_data = pd.read_csv(COOC_FN,sep="\t") + cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) + cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) + id_wikipediatitle = get_new_ids(cooc_data,geoname_data.geonameid.max()) + wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} + title_coord = {row.title: (row.longitude,row.latitude) for _,row in cooc_data.iterrows()} + cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) + filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) + + train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") + train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) + test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) + + logging.info("Merged with Geonames data !") + + # EXTRACT rel + logging.info("Extracting cooccurrence relationships") + cpt=0 + for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): + for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER): + cpt+=1 + rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) + logging.info("Extract {0} cooccurrence relationships !".format(cpt)) + + +# STORE ID to name +geoname2name = dict(filtered["geonameid name".split()].values) + +# ENCODING NAME USING N-GRAM SPLITTING +logging.info("Encoding toponyms to ngram...") +index = NgramIndex(NGRAM_SIZE) +filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all ngram available +if args.wikipedia_cooc: + [index.split_and_add(k) for k in wikipediatitle_id] +filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding +max_len = filtered.encode_name.apply(len).max() # Retrieve the encodings max length +if args.wikipedia_cooc: + extension = {v:index.encode(k) for k,v in wikipediatitle_id.items()} + +index.max_len = int(max_len) # For Index state dump + 
+filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Expend encodings with size < max_len +if args.wikipedia_cooc: + extension = {k:index.complete(v,max_len) for k,v in extension.items()} +geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) #init a dict with the 'geonameid' --> 'encoded toponym' association + +if args.wikipedia_cooc: + geoname2encodedname.update(extension) + + +logging.info("Done !") + +#CLEAR RAM +del hierarchy_data +del geoname_data + +# Encode each geonames entry coordinates +filtered["cell_vec"]=filtered.apply( + lambda x : zero_one_encoding(x.longitude,x.latitude), + axis=1 + ) +geoname_vec = dict(filtered["geonameid cell_vec".split()].values) +# CLEAR RAM +del filtered + + +embedding_dim = 256 +num_words = len(index.index_ngram) # necessary for the embedding matrix + +logging.info("Preparing Input and Output data...") + +X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] +X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] + +cpt=0 +for couple in rel_store: + geonameId_1,geonameId_2 = couple[0],couple[1] + if not geonameId_1 in geoname2encodedname: + cpt+=1 + continue + top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] + if geonameId_1 in train_indices: #and geonameId_2 in train_indices: + + X_1_train.append(top1) + X_2_train.append(top2) + + y_lon_train.append(geoname_vec[geonameId_1][0]) + y_lat_train.append(geoname_vec[geonameId_1][1]) + + else: + X_1_test.append(top1) + X_2_test.append(top2) + + y_lon_test.append(geoname_vec[geonameId_1][0]) + y_lat_test.append(geoname_vec[geonameId_1][1]) + +# NUMPYZE inputs and output lists +X_1_train = np.array(X_1_train) +X_2_train = np.array(X_2_train) +y_lat_train = np.array(y_lat_train) +y_lon_train = np.array(y_lon_train) + +X_1_test = np.array(X_1_test) +X_2_test = np.array(X_2_test) +y_lat_test = np.array(y_lat_test) +y_lon_test = np.array(y_lon_test) + +logging.info("Data prepared !") + + +# OUTPUT FN BASE +name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn) +if args.adjacency: + name += "_A" +if args.inclusion: + name += "_I" +if args.wikipedia_cooc: + name += "_C" + +index.save("outputs/"+name+"_index") + + +# NGRAM EMBDEDDING +logging.info("Generating N-GRAM Embedding...") +embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=WORDVEC_ITER) +logging.info("Embedding generated !") + +# DEEP MODEL +name = "LSTM_"+ name +input_1 = Input(shape=(max_len,)) +input_2 = Input(shape=(max_len,)) + +embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True) + +x1 = Bidirectional(LSTM(98))(embedding_layer(input_1)) +x2 = Bidirectional(LSTM(98))(embedding_layer(input_2)) + +x = concatenate([x1,x2])#,x3]) + +x1 = Dense(500,activation="relu")(x) +#x1 = Dropout(0.3)(x1) +x1 = Dense(500,activation="relu")(x1) +#x1 = Dropout(0.3)(x1) + +x2 = Dense(500,activation="relu")(x) +#x2 = Dropout(0.3)(x2) +x2 = Dense(500,activation="relu")(x2) +#x2 = Dropout(0.3)(x2) + +output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) +output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) + +model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 + +model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()}) +history = 
model.fit(x=[X_1_train,X_2_train], + y=[y_lon_train,y_lat_train], + verbose=True, batch_size=100, + epochs=EPOCHS, + validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test])) + + +hist_df = pd.DataFrame(history.history) +hist_df.to_csv("outputs/{0}.csv".format(name)) + +model.save("outputs/"+name+".h5") + diff --git a/metrics.py b/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..e82c54809aa2a6bece60cd74875140d3719c1ea6 --- /dev/null +++ b/metrics.py @@ -0,0 +1,37 @@ +import tensorflow as tf + +def lat_accuracy(LAT_TOL =1/180.): + def accuracy_at_k_lat(y_true, y_pred): + """ + Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible + task for neural network to obtain the exact coordinate. + + Parameters + ---------- + y_true : tf.Tensor + truth data + y_pred : tf.Tensor + predicted output + """ + diff = tf.abs(y_true - y_pred) + fit = tf.dtypes.cast(tf.less(diff,LAT_TOL),tf.int64) + return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64) + return accuracy_at_k_lat + +def lon_accuracy(LON_TOL=1/360.): + def accuracy_at_k_lon(y_true, y_pred): + """ + Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible + task for neural network to obtain the exact coordinate. + + Parameters + ---------- + y_true : tf.Tensor + truth data + y_pred : tf.Tensor + predicted output + """ + diff = tf.abs(y_true - y_pred) + fit = tf.dtypes.cast(tf.less(diff,LON_TOL),tf.int64) + return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64) + return accuracy_at_k_lon \ No newline at end of file diff --git a/parser_config/embeddings_lat_lon.json b/parser_config/embeddings_lat_lon.json new file mode 100644 index 0000000000000000000000000000000000000000..1a0c774c47b9a6294bf3f54936c79773fc7027a9 --- /dev/null +++ b/parser_config/embeddings_lat_lon.json @@ -0,0 +1,12 @@ +{ + "description": "Toponym Combination", + "args": [ + { "short": "input", "help": "Corpus used to learn the embeddings" }, + { "short": "-g", "long": "--glove__dir", "default": "data/glove" }, + {"long": "--max_sequence_length", "type":"int","default":15}, + {"long": "--max_num_words", "type":"int","default":400000}, + {"long": "--embedding_dimension", "type":"int","default":100}, + {"long": "--batch_size", "type":"int","default":100}, + { "short": "-e", "long": "--epochs", "type": "int", "default": 100 } + ] +} \ No newline at end of file diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json new file mode 100644 index 0000000000000000000000000000000000000000..a2fd9f120b3e791f17948eba7d02b8e2a34116e3 --- /dev/null +++ b/parser_config/toponym_combination_embedding.json @@ -0,0 +1,17 @@ +{ + "description": "Toponym Combination", + "args": [ + { "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." }, + { "short": "geoname_hierachy_input", "help": "Filepath of the Geonames file you want to use." 
}, + { "short": "-v", "long": "--verbose", "action": "store_true" }, + { "short": "-i", "long": "--inclusion", "action": "store_true" }, + { "short": "-a", "long": "--adjacency", "action": "store_true" }, + { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" }, + {"long": "--adjacency-iteration", "type":"int","default":1}, + { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 }, + { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, + { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, + { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, + { "long": "--admin_code_1", "default": "None" } + ] +} \ No newline at end of file diff --git a/predict_toponym_coordinates.py b/predict_toponym_coordinates.py new file mode 100644 index 0000000000000000000000000000000000000000..5dcdb7f81a8fbc28826131b5d1680f3647bf6e68 --- /dev/null +++ b/predict_toponym_coordinates.py @@ -0,0 +1,80 @@ +from keras.models import load_model +import tensorflow as tf +import keras.backend as K +from utils import NgramIndex + +from tensorflow.python.keras.backend import set_session +from tensorflow.python.keras.models import load_model + +sess = None +graph = None + +from metrics import lat_accuracy,lon_accuracy + +class Geocoder(object): + """ + >>>geocoder = Geocoder("LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","index_4gram_FR_backup.txt") + >>>lon,lat = geocoder.get_coord("Paris","New-York") + >>>lon,lat = geocoder.wgs_coord(lon,lat) + >>>geocoder.plot_coord("Paris,New-York",lat,lon) + + if you want an interactive map using leafletJS, set to True the `interactive_map` parameter of `Geocoder.plot_coord()` + """ + def __init__(self,keras_model_fn,ngram_index_file): + global sess + global graph + sess = tf.compat.v1.Session() + graph = tf.compat.v1.get_default_graph() + set_session(sess) + self.keras_model = load_model(keras_model_fn,custom_objects={"lat_accuracy":lat_accuracy,"lon_accuracy":lon_accuracy}) + self.ngram_encoder = NgramIndex.load(ngram_index_file) + + def get_coord(self,toponym,context_toponym): + global sess + global graph + p = self.ngram_encoder.complete(self.ngram_encoder.encode(toponym),self.ngram_encoder.max_len) + c = self.ngram_encoder.complete(self.ngram_encoder.encode(context_toponym),self.ngram_encoder.max_len) + with sess.as_default(): + with graph.as_default(): + lon,lat = self.keras_model.predict([[p],[c]]) + return lon[0][0],lat[0][0] + + def wgs_coord(self,lon,lat): + return ((lon*360)-180),((lat*180)-90) + + def plot_coord(self,toponym,lat,lon,interactive_map=False,**kwargs): + if interactive_map: + import folium + import tempfile + import webbrowser + fp = tempfile.NamedTemporaryFile(delete=False) + m = folium.Map() + folium.Marker([lat, lon], popup=toponym).add_to(m) + m.save(fp.name) + webbrowser.open('file://' + fp.name) + else: + import matplotlib.pyplot as plt + import geopandas + fig, ax = plt.subplots(1,**kwargs) + world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) + world.plot(color='white', edgecolor='black',ax=ax) + ax.plot(lon,lat,marker='o', color='red', markersize=5) + plt.show() + +if __name__ == "__main__": + from flask import Flask, escape, request, render_template + + app = Flask(__name__) + + + geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt") + + @app.route('/',methods=["GET"]) + def display(): + toponym = request.args.get("top", "Paris") + c_toponym = request.args.get("c_top", "Cherbourg") + 
lon,lat = geocoder.get_coord(toponym,c_toponym) + lon,lat = geocoder.wgs_coord(lon,lat) + return render_template("skeleton.html",lat=lat,lon=lon) + + app.run(host='0.0.0.0') \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4eb9e43ed0fb7aefddd5bdfe87ddf049a489394c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,21 @@ +#pyroutelib3 +node2vec +#osrm +geopandas +pandas +numpy +tqdm +networkx +matplotlib +joblib +gensim +sklearn +tensorflow +keras +ngram +shapely +sqlitedict +nltk +folium +flask +numba diff --git a/templates/cover.css b/templates/cover.css new file mode 100644 index 0000000000000000000000000000000000000000..7c6d33cdd58d82b8936fd0209c691184883d5e67 --- /dev/null +++ b/templates/cover.css @@ -0,0 +1,106 @@ +/* + * Globals + */ + +/* Links */ +a, +a:focus, +a:hover { + color: #fff; +} + +/* Custom default button */ +.btn-secondary, +.btn-secondary:hover, +.btn-secondary:focus { + color: #333; + text-shadow: none; /* Prevent inheritance from `body` */ + background-color: #fff; + border: .05rem solid #fff; +} + + +/* + * Base structure + */ + +html, +body { + height: 100%; + background-color: #333; +} + +body { + display: -ms-flexbox; + display: flex; + color: #fff; + text-shadow: 0 .05rem .1rem rgba(0, 0, 0, .5); + box-shadow: inset 0 0 5rem rgba(0, 0, 0, .5); +} + +.cover-container { + max-width: 42em; +} + + +/* + * Header + */ +.masthead { + margin-bottom: 2rem; +} + +.masthead-brand { + margin-bottom: 0; +} + +.nav-masthead .nav-link { + padding: .25rem 0; + font-weight: 700; + color: rgba(255, 255, 255, .5); + background-color: transparent; + border-bottom: .25rem solid transparent; +} + +.nav-masthead .nav-link:hover, +.nav-masthead .nav-link:focus { + border-bottom-color: rgba(255, 255, 255, .25); +} + +.nav-masthead .nav-link + .nav-link { + margin-left: 1rem; +} + +.nav-masthead .active { + color: #fff; + border-bottom-color: #fff; +} + +@media (min-width: 48em) { + .masthead-brand { + float: left; + } + .nav-masthead { + float: right; + } +} + + +/* + * Cover + */ +.cover { + padding: 0 1.5rem; +} +.cover .btn-lg { + padding: .75rem 1.25rem; + font-weight: 700; +} + + +/* + * Footer + */ +.mastfoot { + color: rgba(255, 255, 255, .5); +} diff --git a/templates/skeleton.html b/templates/skeleton.html new file mode 100644 index 0000000000000000000000000000000000000000..43fe21d207d3ebd53e955efdfc0ab9dedfa36081 --- /dev/null +++ b/templates/skeleton.html @@ -0,0 +1,88 @@ +<!DOCTYPE html> +<html lang="en"> + +<head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=auto, initial-scale=1.0"> + <meta http-equiv="X-UA-Compatible" content="ie=edge"> + <title>Geocoder Interface</title> + <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css" + integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" crossorigin="anonymous"> + + <!-- Load Leaflet --> + <link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" + integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" + crossorigin="" /> + <script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js" + integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA==" + crossorigin=""></script> +</head> + +<body> + <style> + body { + + } + + #mapid { + height: 400px; + width: 100%; + } + </style> + + <main class="container-fluid"> + <h1 
style="text-align: center;color:white;text-shadow: 1px 1px 2px black;background-color: #999;">Geocoder Demo</h1> + <div id="mapid"></div> + <div class="container" style="background-color: white;padding: 5px;"> + <h2>Input</h2> + <form action="/" method="get"> + <div class="form-group"> + <label for="formGroupExampleInput">Toponym</label> + <input type="text" class="form-control" name="top" + placeholder="Paris"> + </div> + <div class="form-group"> + <label for="formGroupExampleInput2">Context Toponym</label> + <input type="text" class="form-control" name="c_top" + placeholder="Cherbourg"> + </div> + <button type="submit" class="btn btn-primary">Get Coords !</button> + </form> + </div> + </main> + + <!-- JS SCRIPTS --> + <script src="https://code.jquery.com/jquery-3.4.1.slim.min.js" + integrity="sha384-J6qa4849blE2+poT4WnyKhv5vZF5SrPo0iEjwBvKU7imGFAV0wwj1yYfoRSJoZ+n" + crossorigin="anonymous"></script> + <script src="https://cdn.jsdelivr.net/npm/popper.js@1.16.0/dist/umd/popper.min.js" + integrity="sha384-Q6E9RHvbIyZFJoft+2mJbHaEWldlvI9IOYy5n3zV9zzTtmI3UksdQRVvoxMfooAo" + crossorigin="anonymous"></script> + <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/js/bootstrap.min.js" + integrity="sha384-wfSDF2E50Y2D1uUdj0O3uMBJnjuUD4Ih7YwaYd1iqfktj0Uod8GCExl3Og8ifwB6" + crossorigin="anonymous"></script> + + <script> + + // Initialize the map + // [50, -0.1] are the latitude and longitude + // 4 is the zoom + // mapid is the id of the div where the map will appear + var mymap = L + .map('mapid') + .setView([50, -0.1], 4); + + // Add a tile to the map = a background. Comes from OpenStreetmap + L.tileLayer( + 'http://tile.stamen.com/toner/{z}/{x}/{y}.png', { + attribution: 'Map data © <a href="https://www.openstreetmap.org/">OpenStreetMap</a>', + maxZoom: 6, + }).addTo(mymap); + + var marker = L.marker([{{lat}}, {{lon}}]).addTo(mymap); + + + </script> +</body> + +</html> \ No newline at end of file diff --git a/train_test_split_cooccurrence_data.py b/train_test_split_cooccurrence_data.py new file mode 100644 index 0000000000000000000000000000000000000000..4748f3edf1813f2dcebe90f5febc68a04490127b --- /dev/null +++ b/train_test_split_cooccurrence_data.py @@ -0,0 +1,85 @@ +import argparse + +import pandas as pd +import geopandas as gpd + +import logging +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) + +from sklearn.model_selection import train_test_split +from shapely.geometry import Point + +from utils import Grid + +from tqdm import tqdm + +parser = argparse.ArgumentParser() +parser.add_argument("cooccurrence_file") + +args = parser.parse_args("data/wikipedia/cooccurrence_FR.txt".split())#("data/geonamesData/FR.txt".split()) + +# LOAD DATAgeopandas +COOC_FN = args.cooccurrence_file + + + +logging.info("Load Cooc DATA data...") +cooc_data = pd.read_csv(COOC_FN,sep="\t").fillna("") +cooc_data["geometry"] = cooc_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) +cooc_data = gpd.GeoDataFrame(cooc_data) +logging.info("Cooc data loaded!") + +# World Shape bounds +world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) +world["nn"] = 1 +dissolved = world.dissolve(by="nn").iloc[0].geometry + +#Creating Grid +logging.info("Initializing Grid (360,180)...") +g = Grid(*dissolved.bounds,[360,180]) +logging.info("Fit Data to the Grid...") +g.fit_data(cooc_data) +logging.info("Placing place into the grid...") +[g+(row.title,row.latitude,row.longitude) for ix,row in 
tqdm(cooc_data.iterrows(),total=len(cooc_data))] + +#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME +logging.info("Associate a cell number to each place in the Geoname Dataframe") +def foo(g,id_): + for ix,cell in enumerate(g.cells): + if id_ in cell.list_object: + return ix + +cooc_data["cat"] = cooc_data.title.apply(lambda x:foo(g,x)) + +# TRAIN AND TEST SPLIT +logging.info("Split Between Train and Test") + +# Cell can be empty +i=0 +while 1: + if len(cooc_data[cooc_data.cat == i])> 1: + X_train,X_test = train_test_split(cooc_data[cooc_data.cat == i]) + break + i+=1 + +for i in range(i+1,len(g.cells)): + try: + x_train,x_test = train_test_split(cooc_data[cooc_data.cat == i]) + X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test)) + except Exception as e: + print(e) #print("Error",len(filtered[filtered.cat == i])) + +del X_train["geometry"] +del X_train["nn"] +del X_train["cat"] +del X_test["cat"] +del X_test["geometry"] +del X_test["nn"] +# SAVING THE DATA +logging.info("Saving Output !") +X_train.to_csv(COOC_FN+"_train.csv") +X_test.to_csv(COOC_FN+"_test.csv") \ No newline at end of file diff --git a/train_test_split_geonames.py b/train_test_split_geonames.py new file mode 100644 index 0000000000000000000000000000000000000000..ff87967ed111a34283b9ef6fd0623b9eb953e59b --- /dev/null +++ b/train_test_split_geonames.py @@ -0,0 +1,92 @@ +import argparse + +import numpy as np +import pandas as pd +import geopandas as gpd + +import logging +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) + +from sklearn.model_selection import train_test_split +from shapely.geometry import Point + +from utils import Grid +from helpers import read_geonames + +from tqdm import tqdm + +parser = argparse.ArgumentParser() +parser.add_argument("geoname_file") +parser.add_argument("--feature_classes",help="List of class",default="A P") + +args = parser.parse_args()#("data/geonamesData/FR.txt".split()) + +# LOAD DATAgeopandas +GEONAME_FN = args.geoname_file +FEATURE_CLASSES = args.feature_classes + + +logging.info("Load Geonames data...") +geoname_data = read_geonames(GEONAME_FN).fillna("") +geoname_data["geometry"] = geoname_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) +geoname_data = gpd.GeoDataFrame(geoname_data) +logging.info("Geonames data loaded!") + +# SELECT ENTRY with class == to A and P (Areas and Populated Places) +filtered = geoname_data[geoname_data.feature_class.isin(FEATURE_CLASSES.split())].copy() # Only take area and populated places + +# World Shape bounds +world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) +world["nn"] = 1 +dissolved = world.dissolve(by="nn").iloc[0].geometry + +#Creating Grid +logging.info("Initializing Grid (360,180)...") +g = Grid(*dissolved.bounds,[360,180]) +logging.info("Fit Data to the Grid...") +g.fit_data(filtered) +logging.info("Placing place into the grid...") +[g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered.iterrows(),total=len(filtered))] + +#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME +logging.info("Associate a cell number to each place in the Geoname Dataframe") +def foo(g,id_): + for ix,cell in enumerate(g.cells): + if id_ in cell.list_object: + return ix + +filtered["cat"] = filtered.geonameid.apply(lambda x:foo(g,x)) + +# TRAIN AND TEST SPLIT +logging.info("Split Between Train and Test") + +# Cell can be empty +i=0 +while 1: + if 
len(filtered[filtered.cat == i])> 1: + X_train,X_test = train_test_split(filtered[filtered.cat == i]) + break + i+=1 + +for i in range(i+1,len(g.cells)): + try: + x_train,x_test = train_test_split(filtered[filtered.cat == i]) + X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test)) + except: + pass #print("Error",len(filtered[filtered.cat == i])) + + +del X_train["geometry"] +del X_train["nn"] +del X_train["cat"] +del X_test["cat"] +del X_test["geometry"] +del X_test["nn"] +# SAVING THE DATA +logging.info("Saving Output !") +X_train.to_csv(GEONAME_FN+"_train.csv") +X_test.to_csv(GEONAME_FN+"_test.csv") \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..db250b77474f8e1a135a373b76461dad485f88c1 --- /dev/null +++ b/utils.py @@ -0,0 +1,614 @@ +# Basic import +import math +import argparse +import os +import json + +# Data Structure +import numpy as np +import geopandas as gpd +from shapely.geometry import Point,box + +# NLP +from nltk.tokenize import word_tokenize +from ngram import NGram + +# Machine learning +from gensim.models import Word2Vec + +# Visualisation and parallelisation +from tqdm import tqdm + + +class TokenizerCustom(): + def __init__(self,vocab): + self.word_index = {vocab[i]:i for i in range(len(vocab))} + self.index_word = {i:vocab[i] for i in range(len(vocab))} + self.N = len(self.index_word) + def texts_to_sequences(self,listText): + seqs = [] + for text in listText: + seqs.append([self.word_index[word] for word in word_tokenize(text) if word in self.word_index]) + return seqs + + +class CoordinatesEncoder: + """ + Will be replaced by Grid in grid2.py + """ + def __init__(self, cell_size_lat=0.5, cell_size_lon=0.5): + self.min_lon = -180 + self.max_lon = -(self.min_lon) #  Symetric + self.min_lat = -90 + self.max_lat = -(self.min_lat) # Symetric + + self.ecart_lat = self.max_lat - self.min_lat + self.ecart_lon = self.max_lon - self.min_lon + + self.cell_size_lat = cell_size_lat + self.cell_size_lon = cell_size_lon + + self.unit_size_lat = self.ecart_lat / self.cell_size_lat + self.unit_size_lon = self.ecart_lon / self.cell_size_lon + + def encode(self, lat, lon): + return ( + math.floor(((lat + self.max_lat) / self.ecart_lat) * self.unit_size_lat), + math.floor(((lon + self.max_lon) / self.ecart_lon) * (self.unit_size_lon)) + ) + + def number_lat_cell(self): + return int(self.unit_size_lat) + + def number_lon_cell(self): + return int(self.unit_size_lon) + + def oneDimensionOutputSize(self): + return self.number_lat_cell() * self.number_lon_cell() + + def vector(self, lat, lon): + lat_v, lon_v = np.zeros(self.number_lat_cell()), np.zeros(self.number_lon_cell()) + new_coords = self.encode(lat, lon) + lat_v[int(new_coords[0])] = 1 + lon_v[int(new_coords[1])] = 1 + return lat_v, lon_v + + def vector_flatten(self, lat, lon): + vec = np.zeros(self.oneDimensionOutputSize()) # 2D Dense softmax isn't possible + new_coords = self.encode(lat, lon) + pos = self.number_lat_cell() * (new_coords[0]) + new_coords[1] + vec[pos] = 1 # lon * lon size + return vec + + +class NgramIndex(): + """ + Class used for encoding words in ngram representation + """ + def __init__(self,n): + """ + Constructor + + Parameters + ---------- + n : int + ngram size + """ + self.ngram_gen = NGram(N=n) + + self.size = n + self.ngram_index = {"":0} + self.index_ngram = {0:""} + self.cpt = 0 + self.max_len = 0 + + def split_and_add(self,word): + """ + Split word in multiple ngram and add each one of them to the index + 
+ Parameters + ---------- + word : str + a word + """ + ngrams = word.lower().replace(" ","$") + ngrams = list(self.ngram_gen.split(ngrams)) + [self.add(ngram) for ngram in ngrams] + + def add(self,ngram): + """ + Add a ngram to the index + + Parameters + ---------- + ngram : str + ngram + """ + if not ngram in self.ngram_index: + self.cpt+=1 + self.ngram_index[ngram]=self.cpt + self.index_ngram[self.cpt]=ngram + + def encode(self,word): + """ + Return a ngram representation of a word + + Parameters + ---------- + word : str + a word + + Returns + ------- + list of int + listfrom shapely.geometry import Point,box + of ngram index + """ + ngrams = word.lower().replace(" ","$") + ngrams = list(self.ngram_gen.split(ngrams)) + [self.add(ng) for ng in ngrams if not ng in self.ngram_index] + return [self.ngram_index[ng] for ng in ngrams] + + def complete(self,ngram_encoding,MAX_LEN,filling_item=0): + """ + Complete a ngram encoded version of word with void ngram. It's necessary for neural network. + + Parameters + ---------- + ngram_encoding : list of int + first encoding of a word + MAX_LEN : int + desired length of the encoding + filling_item : int, optional + ngram index you wish to use, by default 0 + + Returns + ------- + list of int + list of ngram index + """ + assert len(ngram_encoding) <= MAX_LEN + diff = MAX_LEN - len(ngram_encoding) + ngram_encoding.extend([filling_item]*diff) + return ngram_encoding + + def get_embedding_layer(self,texts,dim=100,**kwargs): + """ + Return an embedding matrix for each ngram using encoded texts. Using gensim.Word2vec model. + + Parameters + ---------- + texts : list of [list of int] + list of encoded word + dim : int, optional + embedding dimension, by default 100 + + Returns + ------- + np.array + embedding matrix + """ + model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs) + N = len(self.ngram_index) + embedding_matrix = np.zeros((N,dim)) + for i in range(N): + embedding_matrix[i] = model.wv[str(i)] + return embedding_matrix + + def save(self,fn): + """ + + Save the NgramIndex + + Parameters + ---------- + fn : str + output filename + """ + data = { + "ngram_size": self.size, + "ngram_index": self.ngram_index, + "cpt_state": self.cpt, + "max_len_state": self.max_len + } + json.dump(data,open(fn,'w')) + + @staticmethod + def load(fn): + """ + + Load a NgramIndex state from a file. + + Parameters + ---------- + fn : str + input filename + + Returns + ------- + NgramIndex + ngram index + + Raises + ------ + KeyError + raised if a required field does not appear in the input file + """ + try: + data = json.load(open(fn)) + except json.JSONDecodeError: + print("Data file must be a JSON") + for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]: + if not key in data: + raise KeyError("{0} field cannot be found in given file".format(key)) + new_obj = NgramIndex(data["ngram_size"]) + new_obj.ngram_index = data["ngram_index"] + new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()} + new_obj.cpt = data["cpt_state"] + new_obj.max_len = data["max_len_state"] + return new_obj + + +def zero_one_encoding(long,lat): + """ + Encode coordinates (WGS84) between 0 and 1 + + Parameters + ---------- + long : float + longitude value + lat : float + latitude value + + Returns + ------- + float,float + longitude, latitude + """ + return ((long + 180.0 ) / 360.0), ((lat + 90.0 ) / 180.0) + +def _split(lst,n,complete_chunk_value): + """ + Split a list into chunk of n-size. 
+ + Parameters + ---------- + lst : list + input list + n : int + chunk size + complete_chunk_value : object + if last chunk size not equal to n, this value is used to complete it + + Returns + ------- + list + chunked list + """ + chunks = [lst[i:i + n] for i in range(0, len(lst), n)] + if not chunks:return chunks + if len(chunks[-1]) != n: + chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1]))) + return np.array(chunks) + +def generate_couple(object_list): + """ + Return a randomly selected couple from an object list. + + Parameters + ---------- + object_list : list + object list + + Returns + ------- + list + list of coupled object + """ + couples = [] + lst = np.arange(len(object_list)) + for _ in range(len(object_list)): + if len(lst) == 1: + break + idx = np.random.choice(np.arange(len(lst))) + idx2 = np.random.choice(np.arange(len(lst))) + while idx2 == idx: + idx2 = np.random.choice(np.arange(len(lst))) + couples.append([object_list[lst[idx]],object_list[lst[idx2]]]) + lst = np.delete(lst,idx) + return couples + +def _hash_couple(o1,o2): + """ + Return an hash for two object ids. + + Parameters + ---------- + o1 : str or int + id of the first objeeect + o2 : str of int + id of the second object + + Returns + ------- + str + hash + """ + return "|".join(map(str,sorted([int(o1),int(o2)]))) + + + +### GEO ADJAC BEGIN +class Cell(object): + """ + A cell is box placed in geeographical space. + """ + def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,x,y): + """ + Constructor + + Parameters + ---------- + object : [type] + [description] + upperleft_x : float + upperleft longitude + upperleft_y : float + upperleft latitude + bottomright_x : float + bottom right longitude + bottomright_y : float + bottom right latitude + x : int + cell x coordinates in the grid + y : int + cell y coordinates in the grid + """ + self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y + self.box_ = box(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y) + self.list_object={} # {id:Point(coord)} + + self.x,self.y = x, y + + def contains(self,lat,lon): + """ + Return true if the cell contains a point at given coordinates + + Parameters + ---------- + lat : float + latitude + lon : float + longitude + + Returns + ------- + bool + true if contains + """ + x,y = lon,lat + if x < self.upperleft_x or x > self.bottomright_x: + return False + if y < self.upperleft_y or y > self.bottomright_y: + return False + return True + + def add_object(self,id_,lat,lon): + """ + Connect an object to the cell + + Parameters + ---------- + id_ : int + id + lat : float + latitude + lon : float + longitude + """ + self.list_object[id_] = Point(lon,lat) + + def __repr__(self): + return "upperleft:{0}_{1}_;bottom_right:{2}_{3}".format(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y) + +class Grid(object): + """ + Define a grid + + """ + def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,cell_sub_div_index=[100,50]): + """ + Constructor + + Parameters + ---------- + upperleft_x : float + upperleft longitude + upperleft_y : float + upperleft latitude + bottomright_x : float + bottom right longitude + bottomright_y : float + bottom right latitude + cell_sub_div_index : list, optional + number of division in both latitude and longitude axis (longitude first), by default [100,50] + """ + self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = 
upperleft_x,upperleft_y,bottomright_x,bottomright_y + + self.x_r = abs(self.bottomright_x - self.upperleft_x)/cell_sub_div_index[0] + self.y_r = abs(self.upperleft_y - self.bottomright_y )/cell_sub_div_index[1] + + self.c_x_r = self.x_r/cell_sub_div_index[0] # Redivide + self.c_y_r = self.y_r/cell_sub_div_index[1] + + self.cells = [] + self.inter_cells = [] + for i in range(cell_sub_div_index[1]): + self.cells.append([]) + for j in range(cell_sub_div_index[0]): + self.cells[-1].append(Cell( + self.upperleft_x+j*self.x_r, + self.upperleft_y+i*self.y_r, + self.upperleft_x+((j+1)*self.x_r), + self.upperleft_y+((i+1)*self.y_r), + j,i) + ) + dec_y = 0 + for i in range(cell_sub_div_index[1]): + self.inter_cells.append([]) + dec_x = 0 + for j in range(cell_sub_div_index[0]): + self.inter_cells[-1].append(Cell( + self.upperleft_x+(j*self.x_r)-self.c_x_r, # TOP + self.upperleft_y+(i*self.y_r)-dec_y, + self.upperleft_x+((j+1)*self.x_r)-self.c_x_r,#(self.u_pos*self.c_x_r), + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, + j,i) + ) + self.inter_cells[-1].append(Cell( + self.upperleft_x+(j*self.x_r)-self.c_x_r, # CENTER + self.upperleft_y+(i*self.y_r)-self.c_y_r, + self.upperleft_x+((j+1)*self.x_r)+self.c_x_r, + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, + j,i) + ) + self.inter_cells[-1].append(Cell( + self.upperleft_x+(j*self.x_r)+dec_x, # CENTER + self.upperleft_y+(i*self.y_r)-self.c_y_r, + self.upperleft_x+((j+1)*self.x_r)-self.c_x_r, #LEFT + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, + j,i) + ) + dec_x = self.c_x_r + dec_y = self.c_y_r + + def fit_data(self,data = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))): + """ + + To avoid unnecessary check when connecting an entity to one or multiple cells, we + filter cells that does not appears in our geographic context (here countries surface). + + Parameters + ---------- + data : GeoDataFrame + geographic context + """ + world = data + world["nn"] = 1 + dissolved = world.dissolve(by="nn").iloc[0].geometry + new_cells= [] + new_inter_cells=[] + for i in tqdm(range(len(self.cells))): + for j in range(len(self.cells[i])): + if dissolved.intersects(self.cells[i][j].box_): + new_cells.append(self.cells[i][j]) + new_inter_cells.extend(self.inter_cells[i][j*3:(j+1)*3]) + + self.cells=new_cells + self.inter_cells = new_inter_cells + + + def __add__(self,a): + """ + Add an object to the grid + + Parameters + ---------- + a : tuple + (id, latitude, longitude) + """ + for c1 in range(len(self.cells)): + if self.cells[c1].contains(a[1],a[2]): + self.cells[c1].add_object(*a) + + for c1 in range(len(self.inter_cells)): + if self.inter_cells[c1].contains(a[1],a[2]): + self.inter_cells[c1].add_object(*a) + + def get_adjacent_relationships(self,random_iteration=10): + """ + Return a list of adjacent relationships founds in each cell. 
+ + Parameters + ---------- + random_iteration : int, optional + number of iteration for random selection of adjacency relationships, by default 10 + + Returns + ------- + list + adjacency relationships + """ + relationships = set([]) + for c1 in tqdm(range(len(self.cells))): + for i in range(random_iteration): + for t in generate_couple(list(self.cells[c1].list_object.keys())): + relationships.add(_hash_couple(t[0],t[1])) + + for c1 in tqdm(range(len(self.inter_cells))): + for i in range(random_iteration): + for t in generate_couple(list(self.inter_cells[c1].list_object.keys())): + relationships.add(_hash_couple(t[0],t[1])) + return relationships + + +### GEO ADJAC END + +class ConfigurationReader(object): + def __init__(self,configuration_file): + if not os.path.exists(configuration_file): + raise FileNotFoundError("'{0} file could not be found ! '".format(configuration_file)) + + self.configuration = json.load(open(configuration_file)) + + self.__argparser_desc = ("" if not "description" in self.configuration else self.configuration["description"]) + self.parser = argparse.ArgumentParser(description=self.__argparser_desc) + + self.parse_conf() + + def parse_conf(self): + if not "args" in self.configuration: + raise argparse.ArgumentError("","No args given in the configuration file") + + for dict_args in self.configuration["args"]: + if not isinstance(dict_args,dict): + raise ValueError("Args must be dictionnary") + + short_command = dict_args.get("short",None) + long_command = dict_args.get("long",None) + + if not short_command and not long_command: + raise ValueError("No command name was given !") + + add_func_dict_= {} + if "help" in dict_args: + add_func_dict_["help"]= dict_args["help"] + if "default" in dict_args: + add_func_dict_["default"]= dict_args["default"] + if "action" in dict_args: + add_func_dict_["action"]= dict_args["action"] + if "type" in dict_args: + add_func_dict_["type"]= eval(dict_args["type"]) + if "choices" in dict_args: + add_func_dict_["choices"]= dict_args["choices"] + + if not (short_command and long_command): + command = (short_command if not long_command else long_command) + self.parser.add_argument(command,**add_func_dict_) + + elif long_command and short_command: + self.parser.add_argument(short_command,long_command,**add_func_dict_) + + def parse_args(self,input_=None): + if not input_: + return self.parser.parse_args() + return self.parser.parse_args(input_) + + + +if __name__ == "__main__": + + index = NgramIndex(3) + index.split_and_add("J'aime le paté") + encoding = index.encode("xxxyyyy") + index.complete(encoding,10) \ No newline at end of file