# coding = utf-8
"""Run ``evalNE_script.py`` on every graph of a directory and compile the results.

The script works in two phases:

1. every file ``<dataset_dir>/*.<format>`` is handed to ``evalNE_script.py``
   in a separate subprocess, several of them running at the same time;
2. for every graph that has a ``<graph file>_results_lp`` file next to it
   (presumably written by ``evalNE_script.py`` -- confirm against that script),
   the file is parsed, a few graph statistics are added, and all rows are
   written to ``output_filename`` as a tab-separated table.

Usage::

    python run_eval_par.py DATASET_DIR OUTPUT_FILENAME [-f {gexf,gml,txt}] [-j N]
"""

# Provenance -- this source was recovered from a whitespace-collapsed patch:
#
#   From 0ccb0c4979221ebddeb9d0c604eb7561f4cf842f Mon Sep 17 00:00:00 2001
#   From: Fize Jacques <jacques.fize@cirad.fr>
#   Date: Tue, 2 Feb 2021 10:44:03 +0100
#   Subject: [PATCH] add multithread run_eval script + debug split graph error
#
#   evalNE_script.py |  5 +++-
#   run_eval_par.py  | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
#   2 files changed, 63 insertions(+), 1 deletion(-)
#
# The same commit also changed evalNE_script.py (hunk "@@ -40,7 +40,10 @@") so
# that the train/test split falls back to the "spanning_tree" algorithm when
# the "fast" one raises ValueError:
#
#     traintest_split = LPEvalSplit()
#     try:
#         traintest_split.compute_splits(G,split_alg="fast",train_frac=0.6,fe_ratio=1)
#     except ValueError:
#         traintest_split.compute_splits(G, split_alg="spanning_tree", train_frac=0.6, fe_ratio=1)
#     nee = LPEvaluator(traintest_split)

import glob
import subprocess
from lib.helpers import parse_evalne_output
from lib.utils import load_edgelist
import os
import pandas as pd
from tqdm import tqdm
import networkx as nx
from joblib import Parallel, delayed

import argparse


def run_eval(fn, fmt="gexf"):
    """Run ``evalNE_script.py`` on one graph file in a subprocess.

    Parameters
    ----------
    fn : str
        Path of the graph file to evaluate.
    fmt : str
        Value forwarded to the ``-f`` option of ``evalNE_script.py``.

    A non-zero exit status is reported on stdout; it is not raised, so one
    failing graph does not abort the evaluation of the others.
    """
    # Build argv as a list: formatting a string and calling .split() on it
    # breaks as soon as the path contains whitespace.
    command = ["python", "evalNE_script.py", fn, "-f", fmt, "-n"]
    output = subprocess.run(command)
    if output.returncode != 0:
        # NOTE(review): the separator after "Error! " was lost when the patch
        # was flattened; a newline is assumed -- confirm against the repository.
        print("Error! \nfor the command :", " ".join(command))


def _load_graph(fn, fmt):
    """Read the graph stored in ``fn`` according to the format ``fmt``."""
    # The command line offers "txt", not "edgelist". Testing only for
    # "edgelist" sent every txt file to the GEXF reader.
    if fmt in ("txt", "edgelist"):
        return load_edgelist(path=fn)
    if fmt == "gml":
        return nx.read_gml(fn)
    return nx.read_gexf(fn)


def _compile_result(fn, fmt):
    """Parse the results file of ``fn`` and add statistics of its graph.

    Returns the object produced by ``parse_evalne_output`` with the extra
    columns ``nb_edge``, ``transitivity``, ``density``, ``top10_node``,
    ``size`` and ``filename`` set.
    """
    with open(fn + "_results_lp") as results_file:
        df_results = parse_evalne_output(results_file.read())

    G = _load_graph(fn, fmt)

    degrees = pd.DataFrame(list(G.degree()), columns=["node", "degree"])
    top10node = degrees.sort_values("degree", ascending=False).head(10)["node"].values

    df_results["nb_edge"] = G.size()
    df_results["transitivity"] = nx.transitivity(G)
    df_results["density"] = nx.density(G)
    # Node identifiers are not guaranteed to be strings, and str.join
    # rejects anything else.
    df_results["top10_node"] = "|".join(str(node) for node in top10node)
    df_results["size"] = len(G)
    df_results["filename"] = os.path.basename(fn)
    return df_results


def main():
    """Parse the command line, run the evaluations, write the compiled table."""
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset_dir")
    parser.add_argument("output_filename")
    parser.add_argument("-f", "--format", default="gexf", choices=["gexf", "gml", "txt"])
    parser.add_argument("-j", "--n-jobs", type=int, default=4,
                        help="number of evaluations run at the same time")

    args = parser.parse_args()
    fns = sorted(glob.glob(args.dataset_dir + "/*." + args.format))

    # Run link prediction
    Parallel(n_jobs=args.n_jobs, backend="multiprocessing")(
        delayed(run_eval)(fn, args.format) for fn in tqdm(fns)
    )

    all_res = []
    pbar = tqdm(fns)
    for fn in pbar:
        pbar.set_description("compile eval from "+ fn)

        # Graphs without a results file are skipped.
        if os.path.exists(fn + "_results_lp"):
            all_res.append(_compile_result(fn, args.format))

    if not all_res:
        # pd.concat raises an uninformative ValueError on an empty list.
        raise SystemExit("No '*_results_lp' file found for the graphs in " + args.dataset_dir)

    pd.concat(all_res).to_csv(args.output_filename, sep="\t", index=False)


# Guard the entry point: the multiprocessing backend may re-import this module
# in its worker processes, which must not start the whole run again.
if __name__ == "__main__":
    main()