diff --git a/README.md b/README.md
index e218ade50d3100f7bfa0435d6cbbf0d847a56498..b5900fe300dc4934518c046b1775f00ac21ac91c 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,88 @@
-# linkprediction_depo
+# Which model best explains the interactions between a graph's entities? A link prediction study
+
+This repository contains the source code of our research on link prediction.
+
+# Installation
+
+To use the code in this repository, first install the Python requirements with the following command:
+
+```shell
+pip install -r requirements.txt
+```
+
+Then, install OpenNE and our custom version of EvalNE using pip:
+
+```shell
+pip install git+https://github.com/thunlp/OpenNE.git
+pip install git+https://github.com/Jacobe2169/EvalNE
+```
+
+Finally, install the Python library [graph-tool](https://graph-tool.skewed.de).
+
+
+# First experiment: Is the model that generated a graph still the best at predicting its links?
+
+First of all, it is worth explaining why we want to do this. To understand a network better, we
+need to know which model structured it. One way of finding out is through link prediction: our hypothesis is that
+if a specific model captures how most vertices are linked, then the graph is structured by that model. We can
+verify this hypothesis by evaluating each model on a link prediction task.
+
+In this experiment, we evaluate different link prediction methods, from heuristics to deep learning approaches,
+using a custom version of EvalNE. We also developed a series of functions for generating
+graphs based on the following models:
+
+* Spatial model
+* Stochastic Block Model
+* Configuration Model
+* Random (ER)
+
+## Generate a graph using the library
+
+All graph generators can be found in the module `lib.random`. For every generator, you can set the number of nodes and edges
+in the resulting graph.
+
+For example, if you want to generate a graph following the stochastic block model, use the following code:
+```python
+from lib.random import stochastic_block_model_graph
+G = stochastic_block_model_graph(nb_nodes=300,nb_edges=1200,nb_com=5,percentage_edge_betw=0.01)
+```
+
+If you wish to generate a dataset of graphs with different configurations, you can use the script
+`generate_theoric_random_graph.py` with the following command:
+
+```shell
+python generate_theoric_random_graph.py <output_dir>
+```
+
+You can modify the parameters of each model's configurations in the script's source code.
+
+## Evaluation
+
+To evaluate the link prediction methods on a dataset, use the `run_eval.py` script:
+
+```shell
+python run_eval.py <graph_dataset_dir> <output_dir> [-f <graph_file_format (gexf, gml or txt)>] [-t <train_frac>] [-v verbose]
+```
+
+# Second experiment: What about mixed model graphs?
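+
+Before evaluating anything on mixed graphs, it helps to see what the mixture is. The sketch below is only an
+illustration and rests on two assumptions: that `alpha` gives the fraction of edges selected with the SBM
+component (as the inline comment in the next section suggests), the remaining edges following the spatial model,
+and that `mixed_model_spat_sbm` accepts the parameters shown in that section.
+
+```python
+from lib.random import mixed_model_spat_sbm
+
+# Sweep alpha from a (presumably) purely spatial graph (alpha=0) to a purely SBM graph (alpha=1);
+# the grid mirrors the alpha values used in generate_theoric_random_graph.py.
+graphs = {alpha: mixed_model_spat_sbm(nb_nodes=300, nb_edges=1600, nb_com=3, alpha=alpha)
+          for alpha in [0, 0.01, 0.1, 0.5, 0.7, 1]}
+# With alpha=0.3 and nb_edges=1600, roughly 0.3 * 1600 = 480 edges would come from the SBM component.
+```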
+
+## Generate a graph using a mixture of models
+
+```python
+from lib.random import mixed_model_spat_sbm
+G = mixed_model_spat_sbm(nb_nodes=300,nb_edges=1600,nb_com=3,alpha=0.3)
+# Here, alpha quantifies the fraction of edges selected using the SBM model
+```
+
+## Evaluate our link prediction method on mixed models
+(TODO: explain the erosion model)
+
+
+```shell
+python eval_mixed_model.py <graph_dataset_dir> <output_dir> [-f <graph_file_format (gexf, gml or txt)>] [-t <train_frac>] [-v verbose]
+```
+
+# Authors
+Jacques Fize, Rémy Cazabet
+
diff --git a/generate_random_graph.py b/deprecated/generate_random_graph.py
similarity index 100%
rename from generate_random_graph.py
rename to deprecated/generate_random_graph.py
diff --git a/generate_theoric_random_graph.py b/generate_theoric_random_graph.py
index a413ef69e15fe90217815a8db7abd0958b497b0f..885b99785dbdf883ca940f98d0802ca86a4990f5 100644
--- a/generate_theoric_random_graph.py
+++ b/generate_theoric_random_graph.py
@@ -18,7 +18,7 @@ args = parser.parse_args()
 GRAPH_SIZE = [80,800,5000]
-EDGE_SIZE = [2,4,5,10]
+EDGE_FACTOR = [2,4,5,10] # the number of edges is computed by multiplying the edge factor by the number of nodes.
 sample_per_params = 10
 OUTPUT_DIR = args.output_dir
@@ -29,28 +29,28 @@ if not os.path.exists(OUTPUT_DIR):
 parameters = {
     "stochastic_block_model_graph": {
         "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE,
+        "nb_edges":EDGE_FACTOR,
         "nb_com" :[2,5,8,16,10,25],
         "percentage_edge_betw":[0.1,0.01]
     },
     "ER_graph": {
         "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE
+        "nb_edges":EDGE_FACTOR
     },
     "powerlaw_graph": {  # configuration_model
         "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE,
+        "nb_edges":EDGE_FACTOR,
         "exponent":[2,3],
         "tries":[100]
     },
     "spatial_graph":{
         "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE,
+        "nb_edges":EDGE_FACTOR,
         "coords":["random","country"],
     },
     "mixed_model_spat_sbm":{
         "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE,
+        "nb_edges":EDGE_FACTOR,
         "nb_com":[2,4,8,16],
         "alpha":[0,0.01,0.1,0.5,0.7,1]
diff --git a/lib/erosion_model.py b/lib/erosion_model.py
index 19b71c5c42a123e83ee0256746e070e6fad7eed1..ef37b8c1eefb58b48790b229ab7d41de20f50324 100644
--- a/lib/erosion_model.py
+++ b/lib/erosion_model.py
@@ -1,6 +1,7 @@
 # coding = utf-8
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import roc_auc_score
+from tqdm import tqdm
 from .link_prediction_eval import get_auc_heuristics, split_train_test, get_all_possible_edges
 from .random import get_spat_probs, get_sbm_probs
@@ -20,6 +21,18 @@ def log(x):
     if VERBOSE:
         print(x)
+def probs_computation_based_on_weight(weights,n=100000):
+    a = np.copy(weights)
+    idx_vals = np.arange(len(a))
+    res = np.zeros(len(a))
+    for i in range(n):
+        idxrand = np.random.choice(idx_vals,1,p=a)
+        res[idxrand] = res[idxrand] + 1
+
+    res/=n
+    return res
+
+
 class ErosionModel():
     def __init__(self, G):
         self.G = G
@@ -43,7 +56,7 @@ class ErosionModel():
         old_probs = dict(self.probs_df["hash_ p_{0}".format(self.nb_of_erosion - 1).split()].values)
         auc_sbm, auc_spatial = get_auc_heuristics(self.H, 60)
-        if VERBOSE:print(auc_sbm,auc_spatial)
+        if VERBOSE:print("SBM AUC",auc_sbm,"SPATIAL AUC",auc_spatial)
         edges = get_all_possible_edges(self.H)
         if auc_sbm > auc_spatial:
             probs = stochastic_block_model(self.H, edges)
@@ -52,8 +65,12 @@
         edges = np.asarray(edges)
         probs_dom = np.asarray(probs)
+
+        probs_dom = probs_computation_based_on_weight(probs_dom/probs_dom.sum())
         sum_prob_dom = probs_dom.sum()
         sum_prob_dom_H = sum([probs[ix] for ix, ed in enumerate(edges)
                               if self.H.has_edge(*ed)])
+
+        #store the model
         probs_dom /= sum_prob_dom
         edge_prob = dict(zip([hash_func(ed) for ed in edges], probs_dom))
@@ -61,19 +78,37 @@
             lambda x: edge_prob[hash_func([int(x.u), int(x.v)])] if hash_func([int(x.u), int(x.v)]) in edge_prob else 0, axis=1)
+        # Compute new edges
         hhh = np.asarray(
             [(1 / self.H.size()) - ((probs_dom[ix]*sum_prob_dom)/sum_prob_dom_H) for ix, ed in enumerate(edges) if
              self.H.has_edge(*ed)])
         hhh[hhh < 0] = 0
         new_nb_edges = hhh.sum() * self.H.size()
+
+
+
+
+
+        # Compute prob erosion
         probs_erosion = np.asarray([old_probs[hash_func(ed)] - probs_dom[ix] for ix, ed in enumerate(edges)])
-        probs_erosion[probs_erosion <= 0] = float_epsilon
+        print("probs_erosion",probs_erosion)
+        probs_erosion[probs_erosion <= 0] = 0
+        print("probs erosion after filter negative value",probs_erosion)
         probs_erosion /= probs_erosion.sum()
+        print("probserosion at ",self.nb_of_erosion,"with ",np.count_nonzero(probs_erosion),"of non zero values")
+        # Generate new graph
+        edges = edges[probs_erosion > 0]
+        probs_erosion=probs_erosion[probs_erosion > 0]
+        print("EDGES for erosion", edges)
+        print("|E| with erosion and len(probs_ero)",len(edges),len(probs_erosion))
+        print("new_edges_len",round(new_nb_edges))
+        if new_nb_edges > len(edges):
+            return False
         final_edges = []
         index_selected_pairs = np.random.choice(np.arange(len(edges)), round(new_nb_edges), p=probs_erosion,
-                                                 replace=False)  # round(0.7*H.size())
+                                                 replace=False)  # round(0.7*H.size()) round(new_nb_edges)
         final_edges.extend(edges[index_selected_pairs])
         G2 = nx.from_edgelist(final_edges)
@@ -148,7 +183,7 @@ class ErosionModel():
 def position_str_process(G):
     def foo(x):
-        return [eval(f) for f in re.findall("[-]?\d+.[-]?[\de+-]+", x)]
+        return np.array([eval(f) for f in re.findall("[-]?\d+.[-]?[\de+-]+", x)])
     is_pos=True
     H = G.copy()
diff --git a/lib/random.py b/lib/random.py
index 9347d95acd5c7089df5eea2a03a708da48e6120b..2f439a512fd172e5b4982fbd073a4c5a41fa92f2 100644
--- a/lib/random.py
+++ b/lib/random.py
@@ -155,7 +155,7 @@ def powerlaw_graph(nb_nodes, nb_edges, exponent=2, tries=1000, min_deg=0):
     return G
-def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b) ** 2,
+def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b),
                   self_link=False, weighted=False):
     """
     Generate a spatial graph with a specific number of vertices and edges
@@ -196,7 +196,7 @@ def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: n
         for j in range(nb_nodes):
             if i == j and not self_link:
                 continue
-            data.append([i, j, 1 / (float_epsilon+(dist_func(coords[i], coords[j])))])
+            data.append([i, j, 1 / (float_epsilon+(dist_func(coords[i], coords[j])**4))])
     df = pd.DataFrame(data, columns="src tar weight".split()).astype({"src": int, "tar": int})
     df["hash"] = df.apply(lambda x: "_".join(sorted([str(int(x.src)), str(int(x.tar))])), axis=1)
     df = df.drop_duplicates(subset="hash")
@@ -563,4 +563,8 @@ def get_spat_probs(G,dist = lambda a,b : np.linalg.norm(a-b)**2):
             probs.append(spat_model(n1, n2))
             register.add(hash_func((n1, n2)))
-    return edges, probs
\ No newline at end of file
+    return edges, probs
+
+
+