Commit 35f9959b authored by Fize Jacques

Initial Commit

parent 76f085e4
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
#### CUSTOM
data/*
deprecated/*
*.ipynb_checkpoints
notebooks/*
outputs/*
temp/*
WikipediaExtract/*
*.DS_Store
import json
import gzip
import argparse
import re
import pandas as pd
from joblib import Parallel, delayed
# To avoid progressbar issue
from tqdm import tqdm as tqdm_base
def tqdm(*args, **kwargs):
if hasattr(tqdm_base, '_instances'):
for instance in list(tqdm_base._instances):
tqdm_base._decr_instances(instance)
return tqdm_base(*args, **kwargs)
parser = argparse.ArgumentParser()
parser.add_argument("wikidata_json_dump_filename",help="Wikipedia JSON dump compressed with gzip (*.gz)")
parser.add_argument("output_filename")
args = parser.parse_args()
# Prepare the output file (worker processes inherit this handle and write to it concurrently)
output = open(args.output_filename,'w')
output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format("ID_WIKIDATA","title","url","latitude","longitude","classes"))
def job(line):
line = line.decode("utf-8")
if not "\"P625\"" in line or not "\"P31\"" in line:
return
try:
data = json.loads(line.strip(",\n"))
if "sitelinks" in data and "claims" in data:
if "enwiki" in data["sitelinks"]:
id_ = data["id"]
coords_data = data["claims"]["P625"][0]["mainsnak"]["datavalue"]["value"]
title = data["sitelinks"]["enwiki"]["title"]
url = "https://en.wikipedia.org/wiki/{0}".format(title.replace(" ","_"))
lat = coords_data["latitude"]
lon = coords_data["longitude"]
classes_ = ""
for claimP31 in data["claims"]["P31"]:
classes_ = classes_ + "_"+ str(claimP31["mainsnak"]["datavalue"]["value"]["id"])
output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(id_,title,url,lat,lon,classes_.strip("_")))
    except Exception:  # the dump's first line ("[") and last line ("]") are not valid JSON objects and are skipped
pass
Parallel(n_jobs=8, backend="multiprocessing")(delayed(job)(line) for line in tqdm(gzip.GzipFile(args.wikidata_json_dump_filename), unit_scale=True, unit_divisor=1000))
"""
grep -v "ID_WIKIDATA\ttitle\turl\tlatitude\tlongitude\tclasses" selectedPages.csv > selectedPages2.csv
{ echo -n 'ID_WIKIDATA\ttitle\turl\tlatitude\tlongitude\tclasses\n'; cat selectedPages2.csv; } > selectedPages3.csv
import pandas as pd
df = pd.read_csv("test.txt.new",sep="\t")
df
df.latitude
df
df.columns
nano test.txt.new
!nano test.txt.new
!nano test.txt.new
df = pd.read_csv("test.txt.new",sep="\t")
df.latitude
import geopandas as gpd
gdf = gpd.read_file("data/france/france_metro.geojson")
from shapely.geometry import Point
df["latitude longitude".split()].apply(lambda x : Point(x.longitude,x.latitude),axis=1)
df["geom"]=df["latitude longitude".split()].apply(lambda x : Point(x.longitude,x.latitude),axis=1)
gdf
gdf.iloc[0].geometry
france = gdf.iloc[0].geometry
from tqdm import tqdm
tqdm.pandas()
df.geom.progress_apply(lambda x : france.contains(x))
france.convex_hull
ff =france.convex_hull
df.geom.progress_apply(lambda x : ff.contains(x))
is_in_france = df.geom.progress_apply(lambda x : ff.contains(x))
df_new = df[is_in_france].copy()
df_new
del df_new["geom"]
df_new.to_csv("data/wikidata/sample/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv_FRANCE")
!cp test.txt.new data/wikidata/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv
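
A condensed sketch of the session above (same file paths and logic), kept here for reference:

    import pandas as pd
    import geopandas as gpd
    from shapely.geometry import Point

    df = pd.read_csv("test.txt.new", sep="\t")
    france = gpd.read_file("data/france/france_metro.geojson").iloc[0].geometry
    hull = france.convex_hull  # cheaper containment test than the full polygon
    in_france = df.apply(lambda row: hull.contains(Point(row.longitude, row.latitude)), axis=1)
    df[in_france].to_csv("data/wikidata/sample/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv_FRANCE")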
"""
# A full run takes ~1.18 hours (i7 2.8 GHz, 16 GB RAM)
import gzip
import json
import re
import argparse
import pandas as pd
from joblib import Parallel,delayed
from tqdm import tqdm
parser = argparse.ArgumentParser()
parser.add_argument("wikipedia_archive_filename",help="Filename of the Wikipedia corpus parsed with gensim")
parser.add_argument("wikidata_extraction",help="Output from the previous step")
parser.add_argument("output_file")
args = parser.parse_args()
try:
    df_extract_wikidata = pd.read_csv(args.wikidata_extraction)
except Exception:
    # fall back to the tab-separated format produced by the previous step
    # (expected columns: ID_WIKIDATA, title, url, latitude, longitude, classes)
    df_extract_wikidata = pd.read_csv(args.wikidata_extraction, sep="\t")
titles = set(df_extract_wikidata.title.values)
coords_lat = dict(df_extract_wikidata["title latitude".split()].values)
coords_lon = dict(df_extract_wikidata["title longitude".split()].values)
class_ = dict(df_extract_wikidata["title classes".split()].values)
output = open(args.output_file,'w')
def job(line):
line = line.decode("utf-8")
data = json.loads(line)
if data["title"] in titles:
title = data["title"]
data["lat"] = coords_lat[title]
data["lon"] = coords_lon[title]
data["classes"] = class_[title]
output.write(json.dumps(data)+"\n")
Parallel(n_jobs = 8,backend="multiprocessing")(delayed(job)(line) for line in tqdm(gzip.GzipFile(args.wikipedia_archive_filename,'rb'),total=5980533))
import subprocess,os,json
import numpy as np
import time
import logging
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
from tqdm import tqdm
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("corpus_filename")
parser.add_argument("sampling_size",type=int)
parser.add_argument("output_filename")
args= parser.parse_args()
CORPUS_FILENAME = args.corpus_filename
SAMPLE_SIZE = args.sampling_size
# Compute the size of input corpus
logging.info("Computing the corpus size...")
wc_l = subprocess.check_output(["wc","-l", CORPUS_FILENAME ])
NUMBER_OF_INPUT=int(wc_l.strip().split()[-2])
logging.info("The corpus is composed of {0} entries".format(NUMBER_OF_INPUT))
# Sampling
logging.info("Sampling...")
arr_ = np.arange(NUMBER_OF_INPUT)
sample = set(np.random.choice(arr_, SAMPLE_SIZE, replace=False))  # without replacement, so exactly SAMPLE_SIZE lines; a set gives O(1) membership tests below
# Prepare Output file
output = open(args.output_filename,'w')
# Writing in the output file
logging.info("Writing the output...")
for ix,line in tqdm(enumerate(open(CORPUS_FILENAME)),total=NUMBER_OF_INPUT):
if ix in sample:
output.write(line)
logging.info("Done !")
# Basic module
import time
import random
import json
import os
import sys
import argparse
# Data module
import numpy as np
import pandas as pd
from joblib import Parallel,delayed
# Keras basic
import keras
from keras import backend as K
from keras.initializers import Constant
# preprocessing
from sklearn import preprocessing
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.preprocessing.text import text_to_word_sequence
# Neural Network Model and layers class
from keras.layers import Dense, Input, GlobalAveragePooling1D, Embedding, LSTM, Bidirectional, Conv1D, GRU
from keras.models import Model
# Neural network model and visualisation function
from models import getModel,BI_GRU_model, BI_LSTM_model, MPC_WEAverage_model, WEAverage_model
from helpers import plot_accuracy_from_history, save_embedding
# Utils
from utils import CoordinatesEncoder,TokenizerCustom,_split
# Logging
import logging
from chrono import Chronometer
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
chrono = Chronometer()
# Visualisation
import matplotlib.pyplot as plt
from tqdm import tqdm
parser = argparse.ArgumentParser()
parser.add_argument("input")
parser.add_argument("--glove_dir",default="data/glove")
parser.add_argument("--max_sequence_length",type=int, default=15)
parser.add_argument("--max_num_words",type=int, default=400000)
parser.add_argument("--embedding_dimension",type=int, default=100)
parser.add_argument("--batch_size",type=int, default=100)
parser.add_argument("--epochs",type=int, default=100)
parser.add_argument("-v",action="store_true",help="Display Keras training verbose")
def clean(x):
return x.lower().replace("\n","").replace("\'\'\'","").replace("\'\'","")
def split_data(input_data):
"""
Split the corpus into different list, each one corresponding to a feature(coordinates, type, textual data)
Parameters
----------
input_data : _io.TextIOWrapper
File instance
Returns
-------
tuple
lists of locations : name, coordinates, types and text data
"""
listLocations, listText, listCoords, listTypes = [], [], [], []
for line in input_data:
data = json.loads(line.strip("\n"))
listLocations.append(data["title"])
listText.append(clean(data["section_texts"][0])) # get intro
# listTypes.append(data["classes"] if data["classes"] else "Q5624766") # Default Populated Places
listTypes.append(dict_class_translate[data["classes"]]
if data["classes"] in dict_class_translate else "populated place")
listCoords.append([float(data["lat"]), float(data["lon"])])
return listLocations, listCoords, listTypes, listText
# PARSE ARGS
args = parser.parse_args()#("./WikipediaExtract/WikipediaExtract_filtered_300.json".split())
GLOVE_DIR = args.glove_dir
MAX_SEQUENCE_LENGTH = args.max_sequence_length  # Length of the token sub-sequences fed to the network
MAX_NUM_WORDS = args.max_num_words # Number of words in the vocabulary
EMBEDDING_DIM = args.embedding_dimension # Dimensionality for the word embeddings
BATCH_SIZE = args.batch_size # Size of the training batches of instances
EPOCHS = args.epochs
CORPUS_FILENAME = args.input
# SEARCH FOR UNIQUE TYPES
logging.info("Collecting class name")
# For class translation
dict_class_translate = json.load(open("data/wikidata/class_data/WikiClass2DPClass.json"))
classlabels = set([])
for line in tqdm(open(CORPUS_FILENAME), desc="Reading the line "):
data = json.loads(line.strip("\n"))
classlabels.add(dict_class_translate[data["classes"]]
if data["classes"] in dict_class_translate else "populated place")
#classlabels.add(data["classes"] if data["classes"] else "Q5624766")
classlabels = list(classlabels)
# LOAD DATA
logging.info("Loading the data...")
chrono.start("data_loading")
_, listCoords, listTypes, listText = split_data(open(CORPUS_FILENAME))
logging.info("Data Loaded in {0} seconds!".format(chrono.stop("data_loading")))
logging.info("Extract Vocab")
# Extract Vocab
vocab_ = set([" "])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(listText)
tokenizer.word_index[" "] = np.max(list(tokenizer.index_word.keys()))+1
vocab_ = vocab_.union(tokenizer.word_index.keys())
logging.info("The vocabulary contains {0} words".format(len(list(vocab_))))
logging.info("Initialize Tokenizer/ClassEncoder/CoordinateEncoder...")
# Tokenizer
#tokenizer = TokenizerCustom(list(vocab_))
max_key_tokenizer = np.max(list(tokenizer.index_word.keys()))
num_words = min(MAX_NUM_WORDS, len(tokenizer.word_index)) + 1
# Coordinate Encoder
coordinate_encoder = CoordinatesEncoder(2,2)
# CLASS ENCODER type-->int
type_encoder = preprocessing.LabelEncoder()
type_encoder.fit(classlabels)
logging.info("Parsing data for the neural network...")
chrono.start("parse_data")
X = tokenizer.texts_to_sequences(listText)
#X = X[:500] # Sample for tests
listCoords = np.array(listCoords)
y_lat = listCoords[:, 0]
y_lon = listCoords[:, 1]
new_X,Y_type,Y_coord = [],[],[]
for ix,x in tqdm(enumerate(X),total=len(X)):
text_sequence_splited = _split(x,MAX_SEQUENCE_LENGTH,0)
# Sub-Sequences vectors
new_X.extend(text_sequence_splited)
# coordinate vectors
new_coordinates=coordinate_encoder.vector_flatten(y_lat[ix],y_lon[ix])
Y_coord.extend([new_coordinates]*len(text_sequence_splited))
#type vectors
type_code = type_encoder.transform([listTypes[ix]])[0]
arr_ = np.zeros(len(type_encoder.classes_))
arr_[type_code]=1
Y_type.extend([arr_]*len(text_sequence_splited))
Y_coord = np.array(Y_coord)
Y_type = np.array(Y_type)
new_X = np.array(new_X)
logging.info("Data Parsed in {0} seconds!".format(chrono.stop("parse_data")))
for model_f in [BI_GRU_model, BI_LSTM_model, MPC_WEAverage_model, WEAverage_model]:
name, model = getModel(model_f,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,num_words,type_encoder,coordinate_encoder)
logging.info("Training the {0} model...".format(name))
chrono.start("model_training")
name = name + \
"_{0}dim_{1}epoch_{2}batch".format(EMBEDDING_DIM, EPOCHS, BATCH_SIZE)
history = model.fit(x=new_X,
y=[Y_type,Y_coord],#Y_lat,Y_lon],
validation_split=0.33,
epochs=EPOCHS,
batch_size=BATCH_SIZE,
verbose=(1 if args.v else 0),
workers = 4,
#use_multiprocessing=True
)
logging.info("Model {0} have been trained in {1} seconds!".format(name,chrono.stop("model_training")))
hist_df = pd.DataFrame(history.history)
hist_df.to_csv("outputs/{0}.csv".format(name))
for layer_name in "place_type coord".split():
plt.clf()
plot_accuracy_from_history(
name, hist_df, layer_name, "outputs/{0}.png".format(name), "")
save_embedding(model, tokenizer, 0, "outputs/{0}.txt".format(name))
# Place Embedding
# INSTALL BASEMAP
```bash
brew install geos
pip3 install https://github.com/matplotlib/basemap/archive/master.zip
```
# GET DATA
## Process Wikipedia
```bash
python3 -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz
```
## Process Wikidata
```bash
python3 extractInfoWikidata.py <wikidata_json_dump_filename> <output_filename>
```
## Fuse Data for training
```bash
python3 extractsubWikipedia.py <wikipedia_archive_filename> <wikidata_extraction> <output_file>
```
import time
class Chronometer():
def __init__(self):
self.__task_begin_timestamp = {}
def start(self,task_name):
"""
Start a new task chronometer
Parameters
----------
task_name : str
task id
Raises
------
ValueError
if a running task already exists with that name
"""
if task_name in self.__task_begin_timestamp:
raise ValueError("A running task exists with the name {0}!".format(task_name))
self.__task_begin_timestamp[task_name] = time.time()
def stop(self,task_name):
"""
Stop and return the duration of the task
Parameters
----------
task_name : str
task id
Returns
-------
float
duration of the task in seconds
Raises
------
ValueError
if no task exist with the id `task_name`
"""
if not task_name in self.__task_begin_timestamp:
raise ValueError("The {0} task does not exist!".format(task_name))
duration = time.time() - self.__task_begin_timestamp[task_name]
del self.__task_begin_timestamp[task_name]
return duration
if __name__ == "__main__":
chrono = Chronometer()
chrono.start("test")
chrono.start("test2")
time.sleep(3)
print(chrono.stop("test"))
time.sleep(3)
print(chrono.stop("test2"))
# Evaluation process
import gensim
import glob
import re
import matplotlib.pyplot as plt
import random
from helpers import *
from scipy.spatial.distance import cosine
from shapely.geometry import Point
from scipy.stats.stats import pearsonr
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
NPAIR = 100000
fns = glob.glob("data/embeddings/*.bin")
def get_data(fn):
    """Parse the embedding hyperparameters encoded in the filename (4 integers, 5 when noise was added)."""
    data = [int(x) for x in re.findall(r"\d+", fn)]
    if len(data) == 5:  # the fifth integer is the noise size
        return {"embedding_size": data[0],
                "walk_length": data[1],
                "number_of_walks": data[2],
                "word2vec_window_size": data[3],
                "filepath": fn,
                "noise": data[4]}
    return {"embedding_size": data[0],
            "walk_length": data[1],
            "number_of_walks": data[2],
            "word2vec_window_size": data[3],
            "filepath": fn}
df = read_geonames("./data/geonamesData/FR.txt")
df["geometry"] = df["latitude longitude".split()].apply(lambda x:Point(x.longitude,x.latitude),axis=1)
# Create GeoDataFrame for faster spatial comparison operations
gdf = gpd.GeoDataFrame(df)
# Select a sample that concerns the departement "La Manche"
manche_gdf = gdf[gdf.admin2_code == "50"]
df =pd.DataFrame([get_data(fn) for fn in fns])
def get_pearsons(model):
manche_gdf.loc[:,"geometry_centroid"]=manche_gdf.centroid
coords = dict(manche_gdf.loc[:,"geonameid geometry_centroid".split()].values)
places = list(coords.keys())
geodesic_d = []
embeddings_d = []
for i in tqdm(range(NPAIR),disable=True):
placeA=random.choice(places)
placeB=random.choice(places)
        geodesic_d.append(coords[placeA].distance(coords[placeB]))  # planar distance in degrees between the two centroids
embeddings_d.append(cosine(model.wv[str(placeA)],model.wv[str(placeB)]))
return pearsonr(geodesic_d , embeddings_d) # Compute Pearson correlation and associated p-value
df["pearson"] = df.filepath.apply(lambda x : get_pearsons(gensim.models.KeyedVectors.load(x))[0])
df.plot.scatter(x="walk_length", y="pearson")
plt.show()
df.plot.scatter(x="number_of_walks", y="pearson")
plt.show()
df.plot.scatter(x="word2vec_window_size", y="pearson")
plt.show()
# PYTHON MODULE
import math
import random
from argparse import ArgumentParser
from multiprocessing import cpu_count
from argparse import RawTextHelpFormatter
# COMMON DATA STRUCTURE MODULE
import pandas as pd
import numpy as np
import networkx as nx
# SPATIAL DATA MANIPULATION
import geopandas as gpd
import osrm
osrm.RequestConfig.host = "jacquesfize.com:5000"
from shapely.geometry import Point
# DISTANCE MODULE
from scipy.spatial.distance import cosine
from scipy.stats.stats import pearsonr
# Machine Learning MODULE
from node2vec import Node2Vec
import gensim
# VISUALISATION MODULE
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()
# PERSONAL FUNCTION
from helpers import *
parser = ArgumentParser(description='Generate a spatial embedding of places using Geonames data', formatter_class=RawTextHelpFormatter)
parser.add_argument("input")
parser.add_argument("--nbcpu",type=int,default=cpu_count())
parser.add_argument("--vector-size",type=int,default=64,help="Output Vector Dimension")
parser.add_argument("--walk-length",type=int,default=30, help="Size of the walk generated during the Node2vec algorithm")
parser.add_argument("--num-walks",type=int,default=200, help="Number of walk generated during the Node2vec algorithm")
parser.add_argument("--word2vec-window-size",type=int,default=30, help="Window size used in the Word2vec algorithm")
parser.add_argument("--buffer-size",type=float,default=0.03,help="Buffer size to transform Point in Polygon. Used for adjacency matrix computation.")
parser.add_argument("-d",action="store_true",help="Integrate the distance weight between vertices")
parser.add_argument("--dist",choices=["euclidean","itinerary"],default="itinerary",help="""Two distance functions are available:
- Euclidean : Euclidean distance between the two places centroids
- Itinerary : Compute the itinerary distance between two places using an OSRM service
""")
parser.add_argument("--noise",action="store_true")
parser.add_argument("--noise-size",type=int,default=500)
args = parser.parse_args()
# INPUT DATA
GEONAMES_FN = args.input
# PARALLELISM OPTION
NUMBER_OF_CPU_USED = args.nbcpu
# Graph Embedding parameter
VECTOR_SIZE = args.vector_size
WALK_LENGTH = args.walk_length
NUMBER_OF_WALK = args.num_walks
WORD2VEC_WINDOW = args.word2vec_window_size
# GRAPH WEIGHT PARAMETER
IS_DISTANCE = args.d
DISTANCE = args.dist
# if simulation of new toponyms
GEO_DISTANCE_COEF = 0.5
EMBEDDING_DISTANCE_COEF = 0.5
# New toponym simulation
IS_NOISE = args.noise
NUMBER_OF_NODE_DESPATIALIZED = args.noise_size
# DISTANCE CACHE STORAGE
from sqlitedict import SqliteDict
distance_dict = SqliteDict('./data/distance_dict.sqlite', autocommit=True)
# LOAD GEONAMES DATA
df = read_geonames(GEONAMES_FN)
df["geometry"] = df["latitude longitude".split()].progress_apply(lambda x:Point(x.longitude,x.latitude),axis=1)
# Create GeoDataFrame for faster spatial comparison operations
gdf = gpd.GeoDataFrame(df)
# Select a sample that concerns the departement "La Manche"
manche_gdf = gdf[gdf.admin2_code == "50"]
manche_gdf["geometry"]=manche_gdf.geometry.buffer(0.03)
manche_gdf.plot()
# plt.show()
# Build an adjacency matrix used to generate the graph on which the embedding is computed
N = len(manche_gdf)
adjacency_matrix = np.zeros((N,N))
geometries = manche_gdf.geometry.values
for i in tqdm(range(N)):
for j in range(i,N):
adjacency_matrix[i,j] = geometries[i].intersects(geometries[j])
adjacency_matrix[j,i] = adjacency_matrix[i,j]
plt.clf()
plt.imshow(adjacency_matrix);plt.colorbar()
# plt.show()
# Mapping id between matrix and geonameid
manche_gdf["code_matrix"]=np.arange(N)
geoname_id2idxmat = dict(manche_gdf["geonameid code_matrix".split()].values)
idxmat2geoname_id = {v:k for k,v in geoname_id2idxmat.items()}
# Add adjacent entity found in the Geodataframe
def get_adjacent_entity(x):
idxs = np.nonzero(adjacency_matrix[geoname_id2idxmat[x]])[0]
    return [idxmat2geoname_id[idx] for idx in idxs if not idxmat2geoname_id[idx] == x]  # exclude the entity itself
manche_gdf["adjacent_entity"]=manche_gdf.geonameid.apply(get_adjacent_entity)
# Code for getting the distance using the road network (not euclidean) PART 1
manche_gdf["geometry_centroid"]=manche_gdf.centroid
coords = dict(manche_gdf["geonameid geometry_centroid".split()].values)
# Code for getting the distance using the road network (not euclidean) PART 2
# Run ORSM SERVER
#https://hub.docker.com/r/osrm/osrm-backend/
#docker run -t -p 5000:5000 -v $(pwd):/data osrm/osrm-backend osrm-extract -p /opt/car.lua /data/road.pbf
#docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-partition /data/road.osrm
#docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-customize /data/road.osrm
#docker run -t -i -p 5000:5000 -v "${PWD}:/data" osrm/osrm-backend osrm-routed --algorithm mld /data/road.osrm
# Check Also : https://github.com/ustroetz/python-osrm#route
# Test: curl 'http://<yourserver>:5000/route/v1/driving/49.38,-1.37;49,-1.37?steps=true'
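# Hedged illustration (not part of the original pipeline): the same route query can be sent to the
# OSRM HTTP API directly, without the python-osrm wrapper. The helper name, host and port below are
# assumptions; OSRM expects coordinates in lon,lat order and returns distances in metres.
def osrm_route_distance(lon1, lat1, lon2, lat2, host="http://localhost:5000"):
    import requests  # local import so this optional sketch does not add a hard dependency
    url = "{0}/route/v1/driving/{1},{2};{3},{4}".format(host, lon1, lat1, lon2, lat2)
    response = requests.get(url, params={"overview": "false"}).json()
    return response["routes"][0]["distance"]  # raises KeyError/IndexError if no route is found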
def getTupCoords(id_):
return [coords[id_].x,coords[id_].y]
def getDistance(id_1,id_2):
try:
return osrm.simple_route(getTupCoords(id_1), getTupCoords(id_2), output="route", overview="full",steps=False,geometry="wkt")[0]["distance"]
except IndexError:
return -1
def signature(id_1, id_2):
    return "_".join([str(id_) for id_ in sorted([id_1, id_2])])
def getDistanceSDict(id_1,id_2,sqlite_dict):
hash_ = signature(id_1,id_2)
if not hash_ in sqlite_dict:
sqlite_dict[hash_]=getDistance(id_1,id_2)
return sqlite_dict[hash_]
from joblib import Parallel,delayed # for parallel job computation
def job(G, row, adjacent):
    new_edge = (row.geonameid, adjacent)
    if not G.has_edge(*new_edge):
        if IS_DISTANCE and DISTANCE == "itinerary":
            return (*new_edge, getDistanceSDict(new_edge[0], new_edge[1], distance_dict))
        elif IS_DISTANCE and DISTANCE == "euclidean":
            raise NotImplementedError()
        else:
            return (*new_edge, 1)  # unweighted edge; returning a bare 1 would break the edge unpacking below
# Using Route Distance
G = nx.Graph()
for ix,row in tqdm(manche_gdf["geonameid adjacent_entity".split()].iterrows(),total=len(manche_gdf)):
new_edges = Parallel(n_jobs=4,backend="threading")(delayed(job)(G,row,adjacent) for adjacent in row.adjacent_entity)
for edge in new_edges:
if edge:
G.add_edge(edge[0],edge[1],weight=edge[2])
# Data for graph projection
lon_dict= dict(manche_gdf["geonameid longitude".split()].values)
lat_dict= dict(manche_gdf["geonameid latitude".split()].values)
pos= {n:[lon_dict[n],lat_dict[n]]for n in G.nodes()}
nx.draw(G,pos=pos,node_size=1)
# plt.show()
for ed in list(G.edges()):
G[ed[0]][ed[1]]["weight"]+=1 # problem when G[ed[0]][ed[1]]["weight"]==0:
if IS_NOISE:
H = G.copy()
edges,weights = zip(*nx.get_edge_attributes(H,'weight').items())
sample = random.sample(list(H.nodes()),NUMBER_OF_NODE_DESPATIALIZED)
H.remove_nodes_from(sample)
pos= {n:[lon_dict[n],lat_dict[n]] for n in H.nodes()}
nx.draw(H,pos=pos,node_size=1)
# plt.show()
label_dict = dict(manche_gdf["geonameid name".split()].values)
embeddings = dict(pd.read_msgpack("data/embeddings/geonamesFRWithEmbeddings.msg")["geonameid embedding".split()].values)
ids,emb = zip(*embeddings.items())
id2geonameid = dict(enumerate(ids))
geonameid2id = {id_:ix for ix, id_ in enumerate(ids) }
emb_matrix = np.asarray(emb)
from sklearn.metrics.pairwise import cosine_similarity
sim_matrix = cosine_similarity(emb)
top_n = np.argsort(sim_matrix)[:,-3:]
for ix,n in enumerate(sample):
top_i = [id2geonameid[top] for top in top_n[geonameid2id[n]]]
weights_i = [sim_matrix[geonameid2id[n]][top]for top in top_n[geonameid2id[n]]]
for ij,top_ij in enumerate(top_i):
H.add_edge(n,top_ij,weight=weights_i[ij])
G = H.copy()
node2vec = Node2Vec(G, dimensions=VECTOR_SIZE, walk_length=WALK_LENGTH, num_walks=NUMBER_OF_WALK, workers=NUMBER_OF_CPU_USED,temp_folder="temp") # Use temp_folder for big graphs
model = node2vec.fit(window=WORD2VEC_WINDOW, min_count=1, batch_words=NUMBER_OF_CPU_USED)
# Saving the embedding model
if not IS_NOISE:
model.save("manche_{dim}_{walk_l}_{num_walk}_{window}.bin".format(dim = VECTOR_SIZE,
walk_l = WALK_LENGTH,
num_walk = NUMBER_OF_WALK,
window = WORD2VEC_WINDOW))#,noise = NUMBER_OF_NODE_DESPATIALIZED))
else:
model.save("manche_{dim}_{walk_l}_{num_walk}_{window}_{noise}.bin".format(dim = VECTOR_SIZE,
walk_l = WALK_LENGTH,
num_walk = NUMBER_OF_WALK,
window = WORD2VEC_WINDOW,noise = NUMBER_OF_NODE_DESPATIALIZED))
import pandas as pd
import matplotlib.pyplot as plt
import os
def read_geonames(file):
"""
Return a dataframe that contains Geonames data.
Parameters
----------
file : str
path of the Geonames Csv file
Returns
-------
pd.DataFrame
geonames data
"""
dtypes_dict = {
0: int, # geonameid
1: str, # name
2: str, # asciiname
3: str, # alternatenames
4: float, # latitude
5: float, # longitude
6: str, # feature class
7: str, # feature code
8: str, # country code
9: str, # cc2
10: str, # admin1 code
11: str, # admin2 code
12: str, # admin3 code
13: str, # admin4 code
14: int, # population
15: str, # elevation
16: int, # dem (digital elevation model)
17: str, # timezone
18: str # modification date yyyy-MM-dd
}
rename_cols = {
0:"geonameid", # geonameid
1:"name", # name
2:"asciiname", # asciiname
3:"alternatenames", # alternatenames
4:"latitude", # latitude
5:"longitude", # longitude
6:"feature_class", # feature class
7:"feature_class", # feature code
8:"country_code", # country code
9:"cc2", # cc2
10:"admin1_code", # admin1 code
11:"admin2_code", # admin2 code
12:"admin3_code", # admin3 code
13:"admin4_code", # admin4 code
14:"population", # population
15:"elevation", # elevation
16:"dem", # dem (digital elevation model)
17:"timezone", # timezone
18:"modification_date" # modification date yyyy-MM-dd
}
data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False)
data.rename(columns=rename_cols,inplace=True)
return data
def plot_accuracy_from_history(model_name, history_data, output_layer_name, output_filename, parameter_string, output_dirname="outputs", validation=True, show=False):
# Plot training & validation loss values
plt.gcf()
plt.gca()
plt.plot(history_data['{0}_accuracy'.format(output_layer_name)].values,label="Train Data")
if validation:
plt.plot(history_data['val_{0}_accuracy'.format(output_layer_name)].values,label = "Test Data")
plt.title('Layer {0} accuracy'.format(output_layer_name))
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.ylim((0,1.1)) #1.1 if accuracy = 1
plt.legend()
plt.savefig("outputs/{0}_{1}_{2}.png".format(model_name,parameter_string,output_layer_name,))
if show :
plt.show()
def save_embedding(model, tokenizer, layer_idx, fn):
    embedding_matrix = model.get_weights()[layer_idx]
    with open(os.path.join(fn), 'w') as f:
        for word, i in tokenizer.word_index.items():
            f.write(word)
            for value in embedding_matrix[i]:
                f.write(' ' + repr(value))
            f.write('\n')
from keras import Model
from keras.layers import Input, Dense, Bidirectional, LSTM, Embedding, GRU, GlobalAveragePooling1D, Dropout
# name,model_2=MPC_model(MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,num_words,type_encoder)
# model_2.fit(x=new_X,y=[Y_type,Y_lat,Y_lon],validation_split=0.33,epochs=EPOCHS,batch_size=BATCH_SIZE,verbose=1)
def getModel(model_func,max_sequence_length,embedding_dim,num_words,class_encoder,coordinate_encoder):
sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
embedding_layer = Embedding(num_words, embedding_dim, input_length=max_sequence_length, trainable=True)
x = embedding_layer(sequence_input)
name,x = model_func(x)
placetype_layer = Dense(len(class_encoder.classes_), activation='softmax',name="place_type")(x) # From the transformation, attempt to predict the place type
coordinates_layer = Dense(coordinate_encoder.oneDimensionOutputSize(), activation='softmax',name="coord")(x)
model = Model(sequence_input, [ placetype_layer , coordinates_layer] )
model.compile(loss=['categorical_crossentropy','categorical_crossentropy'], optimizer='adam',metrics=["accuracy"])
return name,model
def WEAverage_model(x):
name = "WordEmb_Average"
x = GlobalAveragePooling1D()(x)
return name,x
def MPC_WEAverage_model(x):
name = "MPC_WordEmb_Average"
x = GlobalAveragePooling1D()(x)
x = Dense(2000)(x)
x = Dense(2000)(x)
return name,x
def BI_LSTM_model(x):
name = "Bi-LSTM"
x = (Bidirectional(LSTM(64)))(x)
return name,x
def BI_GRU_model(x):
name = "Bi-GRU"
x = (Bidirectional(GRU(64)))(x)
return name, x
import pandas as pd
df= pd.read_csv("dbpediaPlaceClassification.csv")
import numpy as np
def loadGloveModel(gloveFile):
print("Loading Glove Model")
f = open(gloveFile,'r')
model = {}
for line in f:
splitLine = line.split()
word = splitLine[0]
embedding = np.array([float(val) for val in splitLine[1:]])
model[word] = embedding
print("Done.",len(model)," words loaded!")
return model
model = loadGloveModel("data/glove/glove.6B.100d.txt")
def getEmb(x,model):
emb = np.zeros(100)
for word in x.split():
word =word.lower()
if word in model:
emb+=model[word]
return emb
df["embeddings"] = df["Place"].apply(lambda x : getEmb(x,model))
df.to_msgpack("dbpediaPlaceEmbedding.msg")
import json
data = json.load(open("classname.json"))
df2 = pd.DataFrame(data.items(),columns="WID label".split())
df2["embeddings"] = df2["label"].apply(lambda x:getEmb(x,model))
df2.to_msgpack("classnameEmbedding.msg")
import os
import random
import numpy as np
from owlready2 import *
from vincenty import vincenty
from scipy.stats.stats import pearsonr
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, '.')
NUM_PAIRS = 100000
print('Indexing word vectors downloaded from http://nlp.stanford.edu/data/glove.6B.zip.')
embeddings_index = {}
listText = []
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.new.txt')) as f:
for line in f:
word, coefs = line.split(maxsplit=1)
coefs = np.fromstring(coefs, 'f', sep=' ')
embeddings_index[word] = coefs
listText.append(word)
print('Found %s word vectors.' % len(embeddings_index))
print('Collecting data from DBPedia ontology, downloaded from http://downloads.dbpedia.org/2014/dbpedia_2014.owl.bz2.')
onto = get_ontology("dbpedia_2014.owl")
onto.load()
def retrive_desc( concept , old_names=[] ):
desc = list( concept.descendants( include_self=False) )
names = list( [ re.sub(r'.+\.', '', repr(concept)) + "/" + re.sub(r'.+\.', '', repr(c)) for c in desc ] )
names = [x for x in names if x not in old_names]
desc = [ desc[x] for x in range(len(names)) if names[x] not in old_names ]
new_desc = list(desc)
for i in desc:
n1, d1 = retrive_desc(i, names + old_names)
for j in range(len(n1)):
new_desc.append( d1[j] )
names.append( re.sub(r'.+\.', '', repr(concept)) + "/" + n1[j] )
return names, new_desc
names, _ = retrive_desc(onto.Place)
names = [ n.lower() for n in set(names) if re.sub(r'.+/', '', n.lower()) in embeddings_index ] # check if the name of the place type exists in the embeddings matrix
print('Generating pairs of place names.')
name_pairs = []
similarity_pairs = []
distance_pairs = []
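# Example of the similarity measure used below (actual class names depend on the ontology version):
# "place/populatedplace/city" vs "place/populatedplace/town" share the path prefix
# ["place", "populatedplace"], giving a similarity of 2.0, while names from disjoint
# branches only share the root "place" and get 1.0.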
for num in range(NUM_PAIRS):
name1 = random.choice(names)
name2 = random.choice(names)
if name1 == name2: continue
name_pairs.append( (name1,name2) )
dist = 0.0
n1 = name1.split('/')
n2 = name2.split('/')
for i in range(min(len(n1),len(n2))):
if n1[i] == n2[i]: dist += 1.0
else: break
similarity_pairs.append(dist) # Similarity between the place types, given by the number of ancestors in common
distance_pairs.append( np.sqrt(np.sum((embeddings_index[re.sub(r'.+/', '', name1)] - embeddings_index[re.sub(r'.+/', '', name2)])**2)) ) # Euclidean distance between the embeddings
result = pearsonr( distance_pairs , similarity_pairs) # Compute Pearson correlation and associated p-value
print(result)
import os
import random
import numpy as np
from vincenty import vincenty
from scipy.stats.stats import pearsonr
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'data/glove')
NUM_PAIRS = 100000
print('Indexing word vectors downloaded from http://nlp.stanford.edu/data/glove.6B.zip.')
embeddings_index = {}
listText = []
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.new.txt')) as f:
for line in f:
word, coefs = line.split(maxsplit=1)
coefs = np.fromstring(coefs, 'f', sep=' ')
embeddings_index[word] = coefs
listText.append(word)
print('Found %s word vectors.' % len(embeddings_index))
print('Collecting data from geonames downloaded from http://download.geonames.org/export/dump/allCountries.zip.')
file = open("data/geonamesData/allCountries.txt", "r")
placenames = { }
for line in file:
line = line.split("\t")
name = line[1].lower()
if " " in name or not(name in embeddings_index): # check if the main name exists in the embeddings matrix
names = line[3].split(",")
for n in names:
n = n.strip().lower()
if not(" " in n) and (n in embeddings_index): # if not, check if any of the alternative names exists in the embeddings matrix
name = n
break
if " " in name or not(name in embeddings_index): continue
placenames.update( { name : (float(line[4]), float(line[5])) } )
from scipy.spatial.distance import cosine
from tqdm import tqdm
print('Generating pairs of place names.')
NUM_PAIRS = 1000
name_pairs = []
geo_distance_pairs = []
distance_pairs = []
for num in tqdm(range(NUM_PAIRS)):
name1 = random.choice(list(placenames.keys()))
name2 = random.choice(list(placenames.keys()))
if name1 == name2: continue
name_pairs.append( (name1,name2) )
    try:
        distance_pairs.append(cosine(embeddings_index[name1], embeddings_index[name2]))  # Cosine distance between the embeddings
        geo_distance_pairs.append(vincenty(placenames[name1], placenames[name2]))  # Geospatial distance between the place names, given by Vincenty's geodetic formulae
    except Exception:
        pass
geo_distance_pairs= np.array(geo_distance_pairs).astype(float)
distance_pairs = np.nan_to_num(distance_pairs,nan=np.nanmax(distance_pairs))
geo_distance_pairs = np.nan_to_num(geo_distance_pairs,nan=1)
result = pearsonr( geo_distance_pairs , distance_pairs) # Compute Pearson correlation and associated p-value
print(result)
import json
import argparse
import time
from SPARQLWrapper import SPARQLWrapper,JSON
from urllib.error import HTTPError
from tqdm import tqdm
parser = argparse.ArgumentParser()
parser.add_argument("available_class_filename",help="JSON file that contains an array of string. Each string is a Wikidata id (e.g. Q30)")
parser.add_argument("output_filename")
args = parser.parse_args()
ids= json.load(open(args.available_class_filename))
def get_label(id_wikidata):
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
select *
where {
wd:"""+id_wikidata+ """ rdfs:label ?label .
FILTER (langMatches( lang(?label), "EN" ) )
}
LIMIT 1""" )
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
time.sleep(0.1)
try:
return results["results"]["bindings"][0]["label"]["value"]
    except (IndexError, KeyError):  # no English label returned
return ""
t = 0
dict_results = {}
progress_bar = tqdm(total=len(ids))
while t<len(ids):
try:
dict_results[ids[t]]=get_label(ids[t])
except HTTPError as e:
time.sleep(1)
continue
progress_bar.update(1)
t+=1
progress_bar.close()
json.dump(dict_results,open(args.output_filename,'w'))
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
TOPN = 100
df = pd.read_csv("data/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv",sep="\t",names="ID title url latitude longitude instance_of".split())[1:]
df = df.fillna("")
df["instance_of"] = df.instance_of.apply(lambda x: str(x).split("_"))
count = {}
for list_type in df.instance_of.values:
#print(list_type)
for type_ in list_type:
if not type_ in count: count[type_]=0
count[type_]+=1
# Boost the count of classes that are important but infrequent because of their high granularity
to_increase = [
"Q6256",#country
"Q5119",#capital
"Q27554677",#former capital
"Q10864048", # ADM. DIV. 1
"Q13220204", # ADM. DIV. 2
"Q13220204", # ADM. DIV. 3
"Q14757767", # ADM. DIV. 4
"Q82794" # geographic region
]
inf_ = np.max(list(count.values()))
for type_ in to_increase:
count[type_] = inf_+1
print("Dataframe contains",len(df),"entities")
count_df = pd.DataFrame.from_dict(count, orient="index").reset_index().sort_values(0, ascending=False)
class_filtered = set(count_df.head(TOPN)["index"].values)
#Q15640612 #5
#Q22927291 #6
df = df[df.instance_of.apply(lambda x: sum(True for i in x if i in class_filtered)>0)]
def getMostFrequentClass(x):
idx = np.argsort([count[i] for i in x])[-1]
return x[idx]
df["type"] = df.instance_of.apply(getMostFrequentClass)
df.to_csv("data/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv_filteredTop{0}class".format(TOPN))
import fasttext
print("Load Model Fasttext FR")
model = fasttext.load_model("./data/fasttext_FR/wiki.fr.bin")
print("Model Loaded !")
import pandas as pd
def read_geonames(file):
dtypes_dict = {
0: int, # geonameid
1: str, # name
2: str, # asciiname
3: str, # alternatenames
4: float, # latitude
5: float, # longitude
6: str, # feature class
7: str, # feature code
8: str, # country code
9: str, # cc2
10: str, # admin1 code
11: str, # admin2 code
12: str, # admin3 code
13: str, # admin4 code
14: int, # population
15: str, # elevation
16: int, # dem (digital elevation model)
17: str, # timezone
18: str # modification date yyyy-MM-dd
}
rename_cols = {
0:"geonameid", # geonameid
1:"name", # name
2:"asciiname", # asciiname
3:"alternatenames", # alternatenames
4:"latitude", # latitude
5:"longitude", # longitude
6:"feature_class", # feature class
7:"feature_class", # feature code
8:"country_code", # country code
9:"cc2", # cc2
10:"admin1_code", # admin1 code
11:"admin2_code", # admin2 code
12:"admin3_code", # admin3 code
13:"admin4_code", # admin4 code
14:"population", # population
15:"elevation", # elevation
16:"dem", # dem (digital elevation model)
17:"timezone", # timezone
18:"modification_date" # modification date yyyy-MM-dd
}
data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False)
data.rename(columns=rename_cols,inplace=True)
return data
data = read_geonames("./data/geonamesData/FR.txt")
data= data.fillna("")
data = data[data.admin2_code == "50"]
data["embedding"] = data["name"].apply(lambda x : model[x])
print(data)
data.to_msgpack("geonamesFRWithEmbeddings.msg")
import subprocess,os,json
import numpy as np
import time
import json
ids= json.load(open("classavailable.json"))
from SPARQLWrapper import SPARQLWrapper,JSON
def get_label(id_wikidata):
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
select *
where {
wd:"""+id_wikidata+ """ rdfs:label ?label .
FILTER (langMatches( lang(?label), "EN" ) )
}
LIMIT 1""" )
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
time.sleep(0.1)
try:
return results["results"]["bindings"][0]["label"]["value"]
    except (IndexError, KeyError):  # no English label returned
return ""
from urllib.error import HTTPError
from tqdm import tqdm
t = 0
dict_results = {}
pbar = tqdm(total=len(ids))
while t<len(ids):
try:
dict_results[ids[t]]=get_label(ids[t])
except HTTPError as e:
time.sleep(1)
continue
pbar.update(1)
t+=1
pbar.close()
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 100 --walk-length 30 --num-walks 200 --word2vec-window-size 30
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 200 --walk-length 30 --num-walks 200 --word2vec-window-size 30
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 300 --walk-length 30 --num-walks 200 --word2vec-window-size 30
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 400 --walk-length 30 --num-walks 200 --word2vec-window-size 30
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 500 --walk-length 30 --num-walks 200 --word2vec-window-size 30
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 600 --walk-length 30 --num-walks 200 --word2vec-window-size 30
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 700 --walk-length 30 --num-walks 200 --word2vec-window-size 30