Skip to content
Snippets Groups Projects
Commit c23a6cce authored by Fize Jacques's avatar Fize Jacques
Browse files

debug and start to write Readme

parent 635dffad
No related branches found
No related tags found
No related merge requests found
# linkprediction_depo
# Which model enables a better understanding of a graph's entity interactions through link prediction?
This repository contains the source code for our research on link prediction.
# Installation
To use the code in this repo, you first need to install the Python requirements using the following command:
```shell
pip install -r requirements.txt
```
Then, install OpenNE and our custom version of EvalNE using pip:
```shell
pip install git+https://github.com/thunlp/OpenNE.git
pip install git+https://github.com/Jacobe2169/EvalNE
```
Finally, install the Python library [graph-tool](https://graph-tool.skewed.de).
# First experiment: Is a graph model still the best at predicting itself?
First of all, it is important to explain why we want to do this. To better understand a network, we
need to know which model structured it. One way of doing that is through link prediction. Our hypothesis is that
if a specific model is able to capture how most of the vertices are linked, then the graph is structured by it. One way
to verify this hypothesis is to evaluate the model in a link prediction task.
In this experiment, we evaluate different link prediction methods, from heuristics to deep learning methods. To do so,
we use a custom version of EvalNE for evaluating link prediction methods. We developed a series of functions for generating
graphs based on the following models:
* Spatial model
* Stochastic Block Model
* Configuration Model
* Random (ER)
## Generate a graph using the library
All graph generators can be found in the module `lib.random`. For every graph generator, you can set the number of edges and nodes
in the resulting graph.
For example, if you want to generate a graph following the stochastic block model, use the following code:
```python
from lib.random import stochastic_block_model_graph
G = stochastic_block_model_graph(nb_nodes=300,nb_edges=1200,nb_com=5,percentage_edge_betw=0.01)
```
If you wish to generate a dataset containing generated graphs with different configurations, you can use the script
`generate_theoric_random_graph.py` with the following command:
```shell
python generate_theoric_random_graph.py <output_dir>
```
You can modify the parameters of each configuration for each model in the script source code.
## Evaluation
To run the evaluation of link prediction methods on a dataset, use the `run_eval.py` script:
```shell
python run_eval.py <graph_dataset_dir> <output_dir> [-f <graph_file_format(gexf, gml or txt)> ][-t <train_frac>] [-v verbose]
```
# Second experiment: What about mixed-model graphs?
## Generate a graph using a mixture of models
```python
from lib.random import mixed_model_spat_sbm
G = mixed_model_spat_sbm(nb_nodes=300,nb_edges=1600,nb_com=3,alpha=0.3)
# Here, alpha is the fraction of the edges selected using the SBM model
```
## Evaluate our link prediction method on mixed-model graphs
(TODO Explain the erosion model)
```shell
python eval_mixed_model.py <graph_dataset_dir> <output_dir> [-f <graph_file_format(gexf, gml or txt)> ][-t <train_frac>] [-v verbose]
```
# Authors
Jacques Fize, Rémy Cazabet
File moved
......@@ -18,7 +18,7 @@ args = parser.parse_args()
GRAPH_SIZE = [80,800,5000]
EDGE_SIZE = [2,4,5,10]
EDGE_FACTOR = [2,4,5,10] # the number of edges is computed by multiplying the edge factor by the number of nodes.
sample_per_params = 10
OUTPUT_DIR = args.output_dir
......@@ -29,28 +29,28 @@ if not os.path.exists(OUTPUT_DIR):
parameters = {
"stochastic_block_model_graph": {
"nb_nodes":GRAPH_SIZE,
"nb_edges":EDGE_SIZE,
"nb_edges":EDGE_FACTOR,
"nb_com" :[2,5,8,16,10,25],
"percentage_edge_betw":[0.1,0.01]
},
"ER_graph": {
"nb_nodes":GRAPH_SIZE,
"nb_edges":EDGE_SIZE
"nb_edges":EDGE_FACTOR
},
"powerlaw_graph": { # configuration_model
"nb_nodes":GRAPH_SIZE,
"nb_edges":EDGE_SIZE,
"nb_edges":EDGE_FACTOR,
"exponent":[2,3],
"tries":[100]
},
"spatial_graph":{
"nb_nodes":GRAPH_SIZE,
"nb_edges":EDGE_SIZE,
"nb_edges":EDGE_FACTOR,
"coords":["random","country"],
},
"mixed_model_spat_sbm":{
"nb_nodes":GRAPH_SIZE,
"nb_edges":EDGE_SIZE,
"nb_edges":EDGE_FACTOR,
"nb_com":[2,4,8,16],
"alpha":[0,0.01,0.1,0.5,0.7,1]
......
# coding = utf-8
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from .link_prediction_eval import get_auc_heuristics, split_train_test, get_all_possible_edges
from .random import get_spat_probs, get_sbm_probs
......@@ -20,6 +21,18 @@ def log(x):
if VERBOSE:
print(x)
def probs_computation_based_on_weight(weights, n=100000):
    """
    Estimate a probability vector empirically by sampling from `weights`.

    Indices ``0 .. len(weights) - 1`` are drawn ``n`` times (with replacement)
    using `weights` as the sampling distribution; the result is the fraction
    of draws that landed on each index.

    Parameters
    ----------
    weights : array_like
        1-D sequence passed as ``p`` to ``np.random.choice``; it therefore has
        to be a valid probability vector (non-negative, summing to 1).
    n : int
        Number of draws. Expected to be positive; the counts are divided by it.

    Returns
    -------
    np.ndarray
        Array with the same length as `weights` holding the observed
        frequencies. Every entry is a multiple of ``1 / n``, so an index that
        was never drawn gets exactly 0.
    """
    p = np.asarray(weights)
    nb_vals = len(p)
    # One vectorised draw of n samples instead of n separate single-sample
    # calls: each call rebuilt the cumulative distribution from scratch.
    draws = np.random.choice(np.arange(nb_vals), n, p=p)
    # minlength keeps a slot (count 0) for indices that were never drawn.
    counts = np.bincount(draws, minlength=nb_vals)
    return counts / n
class ErosionModel():
def __init__(self, G):
self.G = G
......@@ -43,7 +56,7 @@ class ErosionModel():
old_probs = dict(self.probs_df["hash_ p_{0}".format(self.nb_of_erosion - 1).split()].values)
auc_sbm, auc_spatial = get_auc_heuristics(self.H, 60)
if VERBOSE:print(auc_sbm,auc_spatial)
if VERBOSE:print("SBM AUC",auc_sbm,"SPATIAL AUC",auc_spatial)
edges = get_all_possible_edges(self.H)
if auc_sbm > auc_spatial:
probs = stochastic_block_model(self.H, edges)
......@@ -52,8 +65,12 @@ class ErosionModel():
edges = np.asarray(edges)
probs_dom = np.asarray(probs)
probs_dom = probs_computation_based_on_weight(probs_dom/probs_dom.sum())
sum_prob_dom = probs_dom.sum()
sum_prob_dom_H = sum([probs[ix] for ix, ed in enumerate(edges) if self.H.has_edge(*ed)])
#store the model
probs_dom /= sum_prob_dom
edge_prob = dict(zip([hash_func(ed) for ed in edges], probs_dom))
......@@ -61,19 +78,37 @@ class ErosionModel():
lambda x: edge_prob[hash_func([int(x.u), int(x.v)])] if hash_func([int(x.u), int(x.v)]) in edge_prob else 0,
axis=1)
# Compute new edges
hhh = np.asarray(
[(1 / self.H.size()) - ((probs_dom[ix]*sum_prob_dom)/sum_prob_dom_H) for ix, ed in enumerate(edges) if self.H.has_edge(*ed)])
hhh[hhh < 0] = 0
new_nb_edges = hhh.sum() * self.H.size()
# Compute prob erosion
probs_erosion = np.asarray([old_probs[hash_func(ed)] - probs_dom[ix] for ix, ed in enumerate(edges)])
probs_erosion[probs_erosion <= 0] = float_epsilon
print("probs_erosion",probs_erosion)
probs_erosion[probs_erosion <= 0] = 0
print("probs erosion after filter negative value",probs_erosion)
probs_erosion /= probs_erosion.sum()
print("probserosion at ",self.nb_of_erosion,"with ",np.count_nonzero(probs_erosion),"of non zero values")
# Generate new graph
edges = edges[probs_erosion > 0]
probs_erosion=probs_erosion[probs_erosion > 0]
print("EDGES for erosion", edges)
print("|E| with erosion and len(probs_ero)",len(edges),len(probs_erosion))
print("new_edges_len",round(new_nb_edges))
if new_nb_edges > len(edges):
return False
final_edges = []
index_selected_pairs = np.random.choice(np.arange(len(edges)), round(new_nb_edges), p=probs_erosion,
replace=False) # round(0.7*H.size())
replace=False) # round(0.7*H.size()) round(new_nb_edges)
final_edges.extend(edges[index_selected_pairs])
G2 = nx.from_edgelist(final_edges)
......@@ -148,7 +183,7 @@ class ErosionModel():
def position_str_process(G):
def foo(x):
return [eval(f) for f in re.findall("[-]?\d+.[-]?[\de+-]+", x)]
return np.array([eval(f) for f in re.findall("[-]?\d+.[-]?[\de+-]+", x)])
is_pos=True
H = G.copy()
......
......@@ -155,7 +155,7 @@ def powerlaw_graph(nb_nodes, nb_edges, exponent=2, tries=1000, min_deg=0):
return G
def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b) ** 2,
def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b),
self_link=False, weighted=False):
"""
Generate a spatial graph with a specific number of vertices and edges
......@@ -196,7 +196,7 @@ def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: n
for j in range(nb_nodes):
if i == j and not self_link:
continue
data.append([i, j, 1 / (float_epsilon+(dist_func(coords[i], coords[j])))])
data.append([i, j, 1 / (float_epsilon+(dist_func(coords[i], coords[j])**4))])
df = pd.DataFrame(data, columns="src tar weight".split()).astype({"src": int, "tar": int})
df["hash"] = df.apply(lambda x: "_".join(sorted([str(int(x.src)), str(int(x.tar))])), axis=1)
df = df.drop_duplicates(subset="hash")
......@@ -563,4 +563,8 @@ def get_spat_probs(G,dist = lambda a,b : np.linalg.norm(a-b)**2):
probs.append(spat_model(n1, n2))
register.add(hash_func((n1, n2)))
return edges, probs
\ No newline at end of file
return edges, probs
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment