debug and start to write Readme

c23a6cce · Fize Jacques · 635dffad · c23a6cce · c23a6cce · c23a6cce
Commit c23a6cce authored 4 years ago by Fize Jacques
--- a/README.md
+++ b/README.md
-# linkprediction_depo
+# Which model helps to better understand the interactions between a graph's entities through link prediction?
+
+This repository contains the source code for our research on link prediction.
+
+# Installation
+
+To use the code in this repo, you first need to install the Python requirements using the following command:
+
+```shell
+pip install -r requirements.txt
+```
+
+Then, install OpenNE and our custom version of EvalNE using pip:
+
+```shell
+pip install git+https://github.com/thunlp/OpenNE.git
+pip install git+https://github.com/Jacobe2169/EvalNE
+```
+
+Finally, install the Python library [graph-tool](https://graph-tool.skewed.de).
+
+
+# First experiment: Is a graph model still the best at predicting itself?
+
+First of all, it is important to explain why we want to do this! To better understand a network, we
+need to know which model structured it. One way of doing that is through link prediction. Our hypothesis is that
+if a specific model is able to capture how most of the vertices are linked, then the graph is structured by it. One way
+to verify this hypothesis is to evaluate the model on a link prediction task.
+
+In this experiment, we evaluate different link prediction methods, from heuristics to deep learning methods. For that,
+we use a custom version of EvalNE to evaluate the link prediction methods. We also developed a series of functions for generating
+graphs based on the following models:
+
+* Spatial model 
+* Stochastic Block Model
+* Configuration Model
+* Random (ER)
+
+## Generate a graph using the library
+
+All graph generators can be found in the module `lib.random`. For every graph generator, you can set the number of edges and nodes
+in the resulting graph.
+
+For example, if you want to generate a graph following the stochastic block model, use the following code:
+```python
+from lib.random import stochastic_block_model_graph
+G = stochastic_block_model_graph(nb_nodes=300,nb_edges=1200,nb_com=5,percentage_edge_betw=0.01)
+```
+
+If you wish to generate a dataset containing generated graphs with different configurations, you can use the script
+`generate_theoric_random_graph.py` with the following command:
+
+```shell
+python generate_theoric_random_graph.py <output_dir>
+```
+
+You can modify the parameters of each configuration for each model in the script source code.
+
+## Evaluation
+
+To run the evaluation of link prediction methods on a dataset, use the `run_eval.py` script:
+
+```shell
+python run_eval.py <graph_dataset_dir> <output_dir> [-f <graph_file_format(gexf, gml or txt)> ][-t <train_frac>] [-v verbose]
+```
+
+# Second experiment: What about mixed-model graphs?
+
+## Generate a graph using a mixture of models
+
+```python
+from lib.random import mixed_model_spat_sbm
+G = mixed_model_spat_sbm(nb_nodes=300,nb_edges=1600,nb_com=3,alpha=0.3) 
+# Here, alpha quantifies the fraction of the edges selected using the SBM model
+```
+
+## Evaluate our link prediction method on mixed model
+(TODO Explain the erosion model) 
+
+
+```shell
+python eval_mixed_model.py <graph_dataset_dir> <output_dir> [-f <graph_file_format(gexf, gml or txt)> ][-t <train_frac>] [-v verbose]
+```
+
+# Authors
+Jacques Fize, Rémy Cazabet
+

--- a/generate_random_graph.py
+++ b/generate_random_graph.py
--- a/generate_theoric_random_graph.py
+++ b/generate_theoric_random_graph.py
@@ -18,7 +18,7 @@ args = parser.parse_args()


 GRAPH_SIZE = [80,800,5000]
-EDGE_SIZE = [2,4,5,10]
+EDGE_FACTOR = [2,4,5,10] # the number of edges is computed by multiplicating the edge factor with the number of nodes.
 sample_per_params  = 10

 OUTPUT_DIR = args.output_dir
@@ -29,28 +29,28 @@ if not os.path.exists(OUTPUT_DIR):
 parameters = {
    "stochastic_block_model_graph": {
        "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE,
+        "nb_edges":EDGE_FACTOR,
        "nb_com" :[2,5,8,16,10,25],
        "percentage_edge_betw":[0.1,0.01]
    },
    "ER_graph": {
        "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE
+        "nb_edges":EDGE_FACTOR
    },
    "powerlaw_graph": {  # configuration_model
        "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE,
+        "nb_edges":EDGE_FACTOR,
        "exponent":[2,3],
        "tries":[100]
    },
    "spatial_graph":{
        "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE,
+        "nb_edges":EDGE_FACTOR,
        "coords":["random","country"],
    },
    "mixed_model_spat_sbm":{
        "nb_nodes":GRAPH_SIZE,
-        "nb_edges":EDGE_SIZE,
+        "nb_edges":EDGE_FACTOR,
        "nb_com":[2,4,8,16],
        "alpha":[0,0.01,0.1,0.5,0.7,1]


--- a/lib/erosion_model.py
+++ b/lib/erosion_model.py
 # coding = utf-8
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import roc_auc_score
+from tqdm import tqdm

 from .link_prediction_eval import get_auc_heuristics, split_train_test, get_all_possible_edges
 from .random import get_spat_probs, get_sbm_probs
@@ -20,6 +21,18 @@ def log(x):
    if VERBOSE:
        print(x)

+def probs_computation_based_on_weight(weights,n=100000):
+    a = np.copy(weights)
+    idx_vals = np.arange(len(a))
+    res = np.zeros(len(a))
+    for i in range(n):
+        idxrand = np.random.choice(idx_vals,1,p=a)
+        res[idxrand] = res[idxrand] + 1
+
+    res/=n
+    return res
+
+
 class ErosionModel():
    def __init__(self, G):
        self.G = G
@@ -43,7 +56,7 @@ class ErosionModel():
        old_probs = dict(self.probs_df["hash_ p_{0}".format(self.nb_of_erosion - 1).split()].values)

        auc_sbm, auc_spatial = get_auc_heuristics(self.H, 60)
-        if VERBOSE:print(auc_sbm,auc_spatial)
+        if VERBOSE:print("SBM AUC",auc_sbm,"SPATIAL AUC",auc_spatial)
        edges = get_all_possible_edges(self.H)
        if auc_sbm > auc_spatial:
            probs = stochastic_block_model(self.H, edges)
@@ -52,8 +65,12 @@ class ErosionModel():

        edges = np.asarray(edges)
        probs_dom = np.asarray(probs)
+
+        probs_dom = probs_computation_based_on_weight(probs_dom/probs_dom.sum())
        sum_prob_dom = probs_dom.sum()
        sum_prob_dom_H = sum([probs[ix] for ix, ed in enumerate(edges) if self.H.has_edge(*ed)])
+
+        #store the model
        probs_dom /= sum_prob_dom

        edge_prob = dict(zip([hash_func(ed) for ed in edges], probs_dom))
@@ -61,19 +78,37 @@ class ErosionModel():
            lambda x: edge_prob[hash_func([int(x.u), int(x.v)])] if hash_func([int(x.u), int(x.v)]) in edge_prob else 0,
            axis=1)

+        # Compute new edges
        hhh = np.asarray(
            [(1 / self.H.size()) - ((probs_dom[ix]*sum_prob_dom)/sum_prob_dom_H) for ix, ed in enumerate(edges) if self.H.has_edge(*ed)])
        hhh[hhh < 0] = 0
        new_nb_edges = hhh.sum() * self.H.size()

+
+
+
+
+
+        # Compute prob erosion
        probs_erosion = np.asarray([old_probs[hash_func(ed)] - probs_dom[ix] for ix, ed in enumerate(edges)])
-        probs_erosion[probs_erosion <= 0] = float_epsilon
+        print("probs_erosion",probs_erosion)
+        probs_erosion[probs_erosion <= 0] = 0
+        print("probs erosion after filter negative value",probs_erosion)
        probs_erosion /= probs_erosion.sum()
+        print("probserosion at ",self.nb_of_erosion,"with ",np.count_nonzero(probs_erosion),"of non zero values")

+        # Generate new graph
+        edges = edges[probs_erosion > 0]
+        probs_erosion=probs_erosion[probs_erosion > 0]
+        print("EDGES for erosion", edges)
+        print("|E| with erosion and len(probs_ero)",len(edges),len(probs_erosion))
+        print("new_edges_len",round(new_nb_edges))

+        if new_nb_edges > len(edges):
+            return False
        final_edges = []
        index_selected_pairs = np.random.choice(np.arange(len(edges)), round(new_nb_edges), p=probs_erosion,
-                                                replace=False)  # round(0.7*H.size())
+                                                replace=False)  # round(0.7*H.size()) round(new_nb_edges)
        final_edges.extend(edges[index_selected_pairs])

        G2 = nx.from_edgelist(final_edges)
@@ -148,7 +183,7 @@ class ErosionModel():

 def position_str_process(G):
    def foo(x):
-        return [eval(f) for f in re.findall("[-]?\d+.[-]?[\de+-]+", x)]
+        return np.array([eval(f) for f in re.findall("[-]?\d+.[-]?[\de+-]+", x)])

    is_pos=True
    H = G.copy()

--- a/lib/random.py
+++ b/lib/random.py
@@ -155,7 +155,7 @@ def powerlaw_graph(nb_nodes, nb_edges, exponent=2, tries=1000, min_deg=0):
    return G


-def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b) ** 2,
+def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: np.linalg.norm(a - b),
                  self_link=False, weighted=False):
    """
    Generate a spatial graph with a specific number of vertices and edges
@@ -196,7 +196,7 @@ def spatial_graph(nb_nodes, nb_edges, coords="country", dist_func=lambda a, b: n
        for j in range(nb_nodes):
            if i == j and not self_link:
                continue
-            data.append([i, j, 1 / (float_epsilon+(dist_func(coords[i], coords[j])))])
+            data.append([i, j, 1 / (float_epsilon+(dist_func(coords[i], coords[j])**4))])
    df = pd.DataFrame(data, columns="src tar weight".split()).astype({"src": int, "tar": int})
    df["hash"] = df.apply(lambda x: "_".join(sorted([str(int(x.src)), str(int(x.tar))])), axis=1)
    df = df.drop_duplicates(subset="hash")
@@ -563,4 +563,8 @@ def get_spat_probs(G,dist = lambda a,b : np.linalg.norm(a-b)**2):
                probs.append(spat_model(n1, n2))
                register.add(hash_func((n1, n2)))

-    return edges, probs
\ No newline at end of file
+    return edges, probs
+
+
+
+