Commit 637949fa authored by Jacques Fize


Add train/test split on geographic criteria + NgramIndex state can now be saved + add documentation to utils classes and functions + add multilingual support to the Wikidata extraction + update README and requirements
parent b77a89a5
@@ -143,3 +143,4 @@ WikipediaExtract/*
test_comb.sh
.vscode/*
notes.md
@@ -34,17 +34,20 @@ def job(line):
try:
data = json.loads(line.strip(",\n"))
if "sitelinks" in data and "claims" in data:
if "enwiki" in data["sitelinks"] or "frwiki" in data["sitelinks"]:
page_available = [i for i in ["en","fr"] if i+"wiki" in data["sitelinks"]]
for site in page_available:
site = 'en' if 'enwiki' in data["sitelinks"] else 'fr'
id_ = data["id"]
coords_data = data["claims"]["P625"][0]["mainsnak"]["datavalue"]["value"]
title = data["sitelinks"]["{0}wiki".format(site)]["title"]
url = "https://{1}.wikipedia.org/wiki/{0}".format(title.replace(" ","_"),site)
lat = coords_data["latitude"]
lon = coords_data["longitude"]
classes_ = ""
for claimP31 in data["claims"]["P31"]:
classes_ = classes_ + "_"+ str(claimP31["mainsnak"]["datavalue"]["value"]["id"])
output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(id_,title,url,lat,lon,classes_.strip("_")))
except Exception: # First Line is "['" and last line is "]'"
pass
...
@@ -88,19 +88,32 @@ Gensim word2vec format is saved in the execution directory.

## Embedding : train using concatenation of close places

### Prepare required data

* download the Geonames data used to train the network [here](http://download.geonames.org/export/dump/)
* download the hierarchy data [here](http://download.geonames.org/export/dump/hierarchy.zip)
* unzip both files in the directory of your choice
* run the script `train_test_split_geonames.py <geoname_filename>` (an example invocation is shown below)
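For example, assuming the France Geonames extract has been unzipped to `data/geonamesData/FR.txt` (an illustrative path, any country dump works the same way):

    python3 train_test_split_geonames.py data/geonamesData/FR.txt

This writes `FR.txt_train.csv` and `FR.txt_test.csv` next to the input file, splitting the places of each grid cell between train and test.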
### Train the network

The script `combination_embeddings.py` is responsible for training the neural network.

To train the network with the default parameters, use the following command:

    python3 combination_embeddings.py -a -i <geoname data filename> <hierarchy geonames data filename>

### Available parameters

| Parameter            | Description                                                          |
|----------------------|----------------------------------------------------------------------|
| -i,--inclusion       | Use inclusion relationships to train the network                     |
| -a,--adjacency       | Use adjacency relationships to train the network                     |
| -w,--wikipedia-cooc  | Use Wikipedia place co-occurrences to train the network              |
| -n,--ngram-size      | ngram size                                                           |
| -t,--tolerance-value | K-value in the computation of the accuracy@k                         |
| -e,--epochs          | number of epochs                                                     |
| -d,--dimension       | size of the ngram embeddings                                         |
| -m,--model           | Neural network architecture used                                     |
| --admin_code_1       | (Optional) Train the network on a specific region only               |
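For instance, a run combining inclusion, adjacency and Wikipedia co-occurrences on the French data could look like the following (paths and the admin1 code are illustrative, and `-w` assumes the Wikipedia co-occurrence file for this Geonames dump has already been generated):

    python3 combination_embeddings.py -i -a -w -n 2 -t 0.002 -e 100 -m LSTM --admin_code_1 94 data/geonamesData/FR.txt data/geonamesData/hierarchy.txt

The fitted ngram index, the training history CSV and the trained model are written to the `outputs/` directory.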
# Base module
import re
import os
import sys
from argparse import ArgumentParser
@@ -40,6 +41,25 @@ def tqdm(*args, **kwargs):
for instance in list(tqdm_base._instances):
tqdm_base._decr_instances(instance)
return tqdm_base(*args, **kwargs)
def parse_title_wiki(title_wiki):
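"""Normalize a Wikipedia title: remove any parenthesised part, strip whitespace and lower-case it."""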
return re.sub("\(.*\)","",title_wiki).strip().lower()
def get_new_ids(cooc_data,id_first_value):
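"""Assign a new integer id (starting after id_first_value) to every title and interlink found in the co-occurrence data. Returns a dict mapping id -> toponym."""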
topo_id = {}
id_ = id_first_value
for title in cooc_data.title.values:
if not title in topo_id:
id_+=1
topo_id[id_]=title
for interlinks in cooc_data.interlinks.values:
for interlink in interlinks.split("|"):
if not interlink in topo_id:
id_+=1
topo_id[id_]=interlink
return topo_id
# Logging
import logging
from chrono import Chronometer
@@ -50,7 +70,7 @@ logging.basicConfig(
)
chrono = Chronometer()
args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()#("--admin_code_1 94 -n 2 -t 0.002 -e 100 -m LSTM -a -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
GEONAME_FN = args.geoname_input
GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
@@ -65,6 +85,11 @@ if args.model == "CNN":
else:
LSTM_train = True
# check for output dir
if not os.path.exists("outputs/"):
os.makedirs("outputs/")
# LOAD DATA
logging.info("Load Geonames data...")
geoname_data = read_geonames(GEONAME_FN).fillna("")
@@ -78,45 +103,74 @@ logging.info("Geonames data loaded!")
# SELECT ENTRY with class == to A and P (Areas and Populated Places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
admin_id_authorised_auth = "1 2 3 4 5 6 11 24 27 28 32 44 52 53 75 76 84 93 94".split()
region_fn = "" if args.admin_code_1 == None else "_"+args.admin_code_1
if args.admin_code_1 != None and args.admin_code_1 in admin_id_authorised_auth:
filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()
# Geometry operation
filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
filtered = gpd.GeoDataFrame(filtered)
filtered["i"]=1
bounds = filtered.dissolve("i").bounds.values[0] # Required to get adjacency relationships
rel_store = []
if args.adjacency:
# RETRIEVE ADJACENCY REL
logging.info("Retrieve adjacency relationships ! ")
fn = "data/geonamesData/{0}_{1}{2}adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY,region_fn)
if not os.path.exists(fn):
g = Grid(*bounds,[360,180])
g.fit_data(filtered)
[g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))]
rel_store.extend([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)])
json.dump(rel_store,open(fn,'w'))
else:
logging.info("Open and load data from previous computation!")
rel_store=[[int(couple[0]),int(couple[1])] for couple in json.load(open(fn))]
logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store)))
if args.inclusion:
# RETRIEVE INCLUSION RELATIONSHIPS
logging.info("Retrieve inclusion relationships ! ")
geoname2name = dict(filtered["geonameid name".split()].values)
filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name))
rel_store.extend((hierarchy_data[filter_mask]["childId parentId".split()].values.tolist()))
logging.info("{0} inclusion relationships retrieved ! ".format(len(hierarchy_data[filter_mask])))
if args.wikipedia_cooc:
cooc_data = pd.read_csv("./data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1],sep="\t")
cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
id_wikipediatitle = get_new_ids(cooc_data,geoname_data.geonameid.max())
wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
title_coord = {row.title: (row.longitude,row.latitude) for _,row in cooc_data.iterrows()}
# ENCODING NAME USING N-GRAM SPLITTING
logging.info("Encoding toponyms to ngram...")
index = NgramIndex(NGRAM_SIZE)
filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all ngram available
filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding
max_len = filtered.encode_name.apply(len).max() # Retrieve the encodings max length
if args.wikipedia_cooc:
[index.split_and_add(x) for x in id_wikipediatitle.values()]
idwiki_encoded = {id_: index.encode(toponym) for id_,toponym in id_wikipediatitle.items()}
max_len = max(max_len,max([len(enc) for _,enc in idwiki_encoded.items()]))
index.max_len = int(max_len) # For Index state dump
filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Extend encodings with size < max_len
geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) #init a dict with the 'geonameid' --> 'encoded toponym' association
if args.wikipedia_cooc:
idwiki_encoded = {id_: index.complete(enc,max_len) for id_,enc in idwiki_encoded.items()}
index.save("outputs/index_{0}gram_{1}".format(NGRAM_SIZE,GEONAME_FN.split("/")[-1]))
logging.info("Done !")
#CLEAR RAM
@@ -129,6 +183,8 @@ filtered["cell_vec"]=filtered.apply(
axis=1
)
geoname_vec = dict(filtered["geonameid cell_vec".split()].values)
if args.wikipedia_cooc:
wikipediaid_vec = {wikipediatitle_id[title]: zero_one_encoding(*title_coord[title]) for title in cooc_data.title.values}
# CLEAR RAM
del filtered
@@ -142,9 +198,8 @@ logging.info("Preparing Input and Output data...")
X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[]
X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[]
for couple in rel_store:
geonameId_1,geonameId_2 = couple[0],couple[1]
top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
@@ -190,7 +245,7 @@ def accuracy_at_k(y_true, y_pred):
fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE))
return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred)
name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn)
if args.adjacency:
name+="_A"
if args.inclusion:
@@ -263,4 +318,7 @@ if CONV :
history = model.fit(x=[X_1_train,X_2_train], y=[y_lon_train,y_lat_train], verbose=True, batch_size=100, epochs=EPOCHS,validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]))
hist_df = pd.DataFrame(history.history)
hist_df.to_csv("outputs/{0}.csv".format(name))
model.save("outputs/"+name+".h5")
@@ -6,11 +6,13 @@
{ "short": "-v", "long": "--verbose", "action": "store_true" },
{ "short": "-i", "long": "--inclusion", "action": "store_true" },
{ "short": "-a", "long": "--adjacency", "action": "store_true" },
{ "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" },
{"long": "--adjacency-iteration", "type":"int","default":50},
{ "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
{ "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
{ "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
{ "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
{ "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" },
{ "long": "--admin_code_1", "default": "None" }
]
}
pyroutelib3
node2vec
osrm
geopandas
pandas
numpy
@@ -15,3 +15,4 @@ keras
ngram
shapely
sqlitedict
nltk
import argparse
import numpy as np
import pandas as pd
import geopandas as gpd
import logging
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
from sklearn.model_selection import train_test_split
from shapely.geometry import Point
from utils import Grid
from helpers import read_geonames
from tqdm import tqdm
parser = argparse.ArgumentParser()
parser.add_argument("geoname_file")
parser.add_argument("--feature_classes",help="List of class",default="A P")
args = parser.parse_args()#("data/geonamesData/FR.txt".split())
# LOAD DATA
GEONAME_FN = args.geoname_file
FEATURE_CLASSES = args.feature_classes
logging.info("Load Geonames data...")
geoname_data = read_geonames(GEONAME_FN).fillna("")
logging.info("Geonames data loaded!")
# SELECT ENTRY with class == to A and P (Areas and Populated Places)
filtered = geoname_data[geoname_data.feature_class.isin(FEATURE_CLASSES.split())].copy() # Only take area and populated places
# World Shape bounds
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world["nn"] = 1
dissolved = world.dissolve(by="nn").iloc[0].geometry
#Creating Grid
logging.info("Initializing Grid (360,180)...")
g = Grid(*dissolved.bounds,[360,180])
logging.info("Fit Data to the Grid...")
g.fit_data(filtered)
logging.info("Placing place into the grid...")
[g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered.iterrows(),total=len(filtered))]
#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME
logging.info("Associate a cell number to each place in the Geoname Dataframe")
def foo(g,id_):
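"""Return the index of the grid cell whose object list contains the place id_ (None if the place was not assigned to any cell)."""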
for ix,cell in enumerate(g.cells):
if id_ in cell.list_object:
return ix
filtered["cat"] = filtered.geonameid.apply(lambda x:foo(g,x))
# TRAIN AND TEST SPLIT
logging.info("Split Between Train and Test")
# Cells can be empty: find the first cell with more than one place to initialise the train/test split
i=0
while 1:
if len(filtered[filtered.cat == i])> 1:
X_train,X_test = train_test_split(filtered[filtered.cat == i])
break
i+=1
for i in range(i+1,len(g.cells)):
try:
x_train,x_test = train_test_split(filtered[filtered.cat == i])
X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test))
except:
pass #print("Error",len(filtered[filtered.cat == i]))
# SAVING THE DATA
logging.info("Saving Output !")
X_train.to_csv(GEONAME_FN+"_train.csv")
X_test.to_csv(GEONAME_FN+"_test.csv")
import math
import numpy as np
from stop_words import get_stop_words
from nltk.tokenize import word_tokenize
import textwrap
from ngram import NGram
import argparse
import os
import json
from tqdm import tqdm
import geopandas as gpd
from keras.layers import Embedding
from gensim.models import Word2Vec
from joblib import Parallel,delayed
from shapely.geometry import Point,box
class TokenizerCustom():
@@ -25,40 +33,113 @@ class TokenizerCustom():
return seqs
class NgramIndex():
"""
Class used for encoding words in ngram representation
"""
def __init__(self,n):
"""
Constructor
Parameters
----------
n : int
ngram size
"""
self.ngram_gen = NGram(N=n)
self.size = n
self.ngram_index = {}
self.index_ngram = {}
self.cpt = 0
self.max_len = 0
def split_and_add(self,word):
"""
Split word in multiple ngram and add each one of them to the index
Parameters
----------
word : str
a word
"""
ngrams = word.lower().replace(" ","$")
ngrams = list(self.ngram_gen.split(ngrams))
[self.add(ngram) for ngram in ngrams]
def add(self,ngram):
"""
Add a ngram to the index
Parameters
----------
ngram : str
ngram
"""
if not ngram in self.ngram_index:
self.cpt+=1
self.ngram_index[ngram]=self.cpt
self.index_ngram[self.cpt]=ngram
def encode(self,word):
"""
Return a ngram representation of a word
Parameters
----------
word : str
a word
Returns
-------
list of int
list of ngram index
"""
ngrams = word.lower().replace(" ","$")
ngrams = list(self.ngram_gen.split(ngrams))
[self.add(ng) for ng in ngrams if not ng in self.ngram_index]
return [self.ngram_index[ng] for ng in ngrams]
def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
"""
Complete a ngram-encoded version of a word with padding ngrams, as the neural network expects fixed-length inputs.
Parameters
----------
ngram_encoding : list of int
first encoding of a word
MAX_LEN : int
desired length of the encoding
filling_item : int, optional
ngram index you wish to use, by default 0
Returns
-------
list of int
list of ngram index
"""
assert len(ngram_encoding) <= MAX_LEN
diff = MAX_LEN - len(ngram_encoding)
ngram_encoding.extend([filling_item]*diff)
return ngram_encoding
def get_embedding_layer(self,texts,dim=100,**kwargs):
"""
Return an embedding matrix for the ngrams, learned from the encoded texts with a gensim Word2Vec model.
Parameters
----------
texts : list of [list of int]
list of encoded word
dim : int, optional
embedding dimension, by default 100
Returns
-------
np.array
embedding matrix
"""
model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
N = len(self.ngram_index)
embedding_matrix = np.zeros((N,dim))
@@ -66,19 +147,116 @@ class NgramIndex():
embedding_matrix[i] = model.wv[str(i)]
return embedding_matrix
def save(self,fn):
"""
Save the NgramIndex
Parameters
----------
fn : str
output filename
"""
data = {
"ngram_size": self.size,
"ngram_index": self.ngram_index,
"cpt_state": self.cpt,
"max_len_state": self.max_len
}
json.dump(data,open(fn,'w'))
@staticmethod
def load(fn):
"""
Load a NgramIndex state from a file.
Parameters
----------
fn : str
input filename
Returns
-------
NgramIndex
ngram index
Raises
------
KeyError
raised if a required field does not appear in the input file
"""
try:
data = json.load(open(fn))
except json.JSONDecodeError:
print("Data file must be a JSON")
for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]:
if not key in data:
raise KeyError("{0} field cannot be found in given file".format(key))
new_obj = NgramIndex(data["ngram_size"])
new_obj.ngram_index = data["ngram_index"]
new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()}
new_obj.cpt = data["cpt_state"]
new_obj.max_len = data["max_len_state"]
return new_obj
def zero_one_encoding(long,lat):
"""
Encode coordinates (WGS84) between 0 and 1
Parameters
----------
long : float
longitude value
lat : float
latitude value
Returns
-------
float,float
longitude, latitude
"""
return ((long + 180.0 ) / 360.0), ((lat + 90.0 ) / 180.0)
def _split(lst,n,complete_chunk_value):
"""
Split a list into chunks of size n.
Parameters
----------
lst : list
input list
n : int
chunk size
complete_chunk_value : object
if last chunk size not equal to n, this value is used to complete it
Returns
-------
list
chunked list
"""
chunks = [lst[i:i + n] for i in range(0, len(lst), n)]
if not chunks:return chunks
if len(chunks[-1]) != n:
chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1])))
return np.array(chunks)
def generate_couple(object_list):
"""
Return a list of randomly selected couples from an object list.
Parameters
----------
object_list : list
object list
Returns
-------
list
list of coupled object
"""
couples = []
lst = np.arange(len(object_list))
for _ in range(len(object_list)):
@@ -93,22 +271,73 @@ def generate_couple(object_list):
return couples
def _hash_couple(o1,o2):
"""
Return a hash for two object ids.
Parameters
----------
o1 : str or int
id of the first object
o2 : str of int
id of the second object
Returns
-------
str
hash
"""
return "|".join(map(str,sorted([int(o1),int(o2)])))
### GEO ADJAC BEGIN
class Cell(object):
"""
A cell is a box placed in geographical space.
"""
def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,x,y):
"""
Constructor
Parameters
----------
upperleft_x : float
upperleft longitude
upperleft_y : float
upperleft latitude
bottomright_x : float
bottom right longitude
bottomright_y : float
bottom right latitude
x : int
cell x coordinates in the grid
y : int
cell y coordinates in the grid
"""
self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y
self.box_ = box(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y)
self.list_object={} # {id:Point(coord)}
self.x,self.y = x, y
def contains(self,lat,lon):
"""
Return true if the cell contains a point at given coordinates
Parameters
----------
lat : float
latitude
lon : float
longitude
Returns
-------
bool
true if contains
"""
x,y = lon,lat
if x < self.upperleft_x or x > self.bottomright_x:
return False
@@ -117,13 +346,45 @@ class Cell(object):
return True
def add_object(self,id_,lat,lon):
"""
Connect an object to the cell
Parameters
----------
id_ : int
id
lat : float
latitude
lon : float
longitude
"""
self.list_object[id_] = Point(lon,lat)
def __repr__(self):
return "upperleft:{0}_{1}_;bottom_right:{2}_{3}".format(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y)
class Grid(object):
"""
Define a grid
"""
def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,cell_sub_div_index=[100,50]):
"""
Constructor
Parameters
----------
upperleft_x : float
upperleft longitude
upperleft_y : float
upperleft latitude
bottomright_x : float
bottom right longitude
bottomright_y : float
bottom right latitude
cell_sub_div_index : list, optional
number of division in both latitude and longitude axis (longitude first), by default [100,50]
"""
self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y
self.x_r = abs(self.bottomright_x - self.upperleft_x)/cell_sub_div_index[0]
@@ -142,7 +403,7 @@ class Grid(object):
self.upperleft_y+i*self.y_r,
self.upperleft_x+((j+1)*self.x_r),
self.upperleft_y+((i+1)*self.y_r),
j,i)
)
dec_y = 0
for i in range(cell_sub_div_index[1]):
@@ -153,34 +414,45 @@ class Grid(object):
self.upperleft_x+(j*self.x_r)-self.c_x_r, # TOP
self.upperleft_y+(i*self.y_r)-dec_y,
self.upperleft_x+((j+1)*self.x_r)-self.c_x_r,#(self.u_pos*self.c_x_r),
self.upperleft_y+((i+1)*self.y_r)+self.c_y_r,
j,i)
)
self.inter_cells[-1].append(Cell(
self.upperleft_x+(j*self.x_r)-self.c_x_r, # CENTER
self.upperleft_y+(i*self.y_r)-self.c_y_r,
self.upperleft_x+((j+1)*self.x_r)+self.c_x_r,
self.upperleft_y+((i+1)*self.y_r)+self.c_y_r,
j,i)
)
self.inter_cells[-1].append(Cell(
self.upperleft_x+(j*self.x_r)+dec_x, # CENTER
self.upperleft_y+(i*self.y_r)-self.c_y_r,
self.upperleft_x+((j+1)*self.x_r)-self.c_x_r, #LEFT
self.upperleft_y+((i+1)*self.y_r)+self.c_y_r,
j,i)
)
dec_x = self.c_x_r
dec_y = self.c_y_r
def fit_data(self,data = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))):
"""
To avoid unnecessary checks when connecting an entity to one or multiple cells, we
filter out the cells that do not appear in our geographic context (here, country surfaces).
Parameters
----------
data : GeoDataFrame
geographic context
"""
world = data
world["nn"] = 1
dissolved = world.dissolve(by="nn").iloc[0].geometry
new_cells= []
new_inter_cells=[]
for i in tqdm(range(len(self.cells))):
for j in range(len(self.cells[i])):
if dissolved.intersects(self.cells[i][j].box_):
new_cells.append(self.cells[i][j])
new_inter_cells.extend(self.inter_cells[i][j*3:(j+1)*3])
@@ -188,7 +460,15 @@ class Grid(object):
self.inter_cells = new_inter_cells
def __add__(self,a):
"""
Add an object to the grid
Parameters
----------
a : tuple
(id, latitude, longitude)
"""
for c1 in range(len(self.cells)):
if self.cells[c1].contains(a[1],a[2]):
self.cells[c1].add_object(*a)
@@ -199,6 +479,19 @@ class Grid(object):
break
def get_adjacent_relationships(self,random_iteration=10):
"""
Return a list of adjacency relationships found in each cell.
Parameters
----------
random_iteration : int, optional
number of iteration for random selection of adjacency relationships, by default 10
Returns
-------
list
adjacency relationships
"""
relationships = set([])
for c1 in tqdm(range(len(self.cells))):
for i in range(random_iteration):
@@ -214,12 +507,6 @@ class Grid(object):
### GEO ADJAC END
class ConfigurationReader(object):
def __init__(self,configuration_file):
if not os.path.exists(configuration_file):
@@ -273,10 +560,6 @@ class ConfigurationReader(object):
if __name__ == "__main__":
index = NgramIndex(3)
index.split_and_add("J'aime le paté")
...