debug and add helpers and utils

8376d2c5 · Fize Jacques · b9802aa3 · 8376d2c5 · 8376d2c5 · 8376d2c5
Commit 8376d2c5 authored 4 years ago by Fize Jacques
--- a/.gitignore
+++ b/.gitignore
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+*.DS_Store*
+.idea*
\ No newline at end of file
--- a/draw_graph.ipynb
+++ b/draw_graph.ipynb
--- a/draw_graph_script.py
+++ b/draw_graph_script.py
@@ -3,25 +3,26 @@
 import argparse
 import networkx as nx
 import pandas as pd
-import joblib,json
+import joblib
+import json

 from lib.draw import draw

 parser = argparse.ArgumentParser()
 parser.add_argument("input_file",help="edgelist format (sep = \",\" )")
+parser.add_argument("output_file")
 parser.add_argument("--encoder-file",help="LabelEncoder instance that allows to obtain a label for each node")
 parser.add_argument("--country",help="if country node",action="store_true")
 parser.add_argument("-w",action="store_true")
-parser.add_argument("output_file")

 args = parser.parse_args()

 if args.w:
    df = pd.read_csv(args.input_file,header=None,names="source target weight".split())
-    G = nx.from_pandas_edgelist(df,edge_attr="weight")
+    G = nx.from_pandas_edgelist(df,edge_attr="weight",create_using=nx.DiGraph())
 else:
-    df = pd.read_csv(args.input_file, header=None, names="source target weight".split())
-    G = nx.from_pandas_edgelist(df, edge_attr="weight")
+    df = pd.read_csv(args.input_file, header=None, names="source target".split())
+    G = nx.from_pandas_edgelist(df,create_using=nx.DiGraph())

 encoder = None
 labels_dict = {}
@@ -37,4 +38,4 @@ if args.encoder_file:
            labels_dict[node] = encoder.inverse_transform([node])[0]

 fig, ax = draw(G,labels_dict)
-fig.savefig("test.pdf")
\ No newline at end of file
+fig.savefig(args.output_file)
\ No newline at end of file
--- a/evalNE_script.py
+++ b/evalNE_script.py
@@ -3,15 +3,18 @@ from evalne.evaluation.split import EvalSplit as LPEvalSplit
 from evalne.evaluation.score import Scoresheet
 from evalne.utils import preprocess as pp

+from lib.utils import load_edgelist
+
 import argparse

 parser = argparse.ArgumentParser()
 parser.add_argument("edgelist_graph_filename")
+parser.add_argument("-v","--verbose",action="store_true")

 args = parser.parse_args()#("data/fb_country_country_sample_6_size1000.txt".split())

 # Load and preprocess the network
-G = pp.load_graph(args.edgelist_graph_filename,directed=True)
+G = load_edgelist(args.edgelist_graph_filename,is_directed=True,weighted=True)
 G, _ = pp.prep_graph(G,maincc=True)

 # Create an evaluator and generate train/test edge split
@@ -33,7 +36,7 @@ methods = ['random_prediction',

 # Evaluate baselines
 for method in methods:
-    result = nee.evaluate_baseline(method=method)
+    result = nee.evaluate_baseline(method=method, )
    scoresheet.log_results(result)

 try:
@@ -58,7 +61,7 @@ try:
    for i in range(len(methods)):
        command = commands[i] + " --input {} --output {} --representation-size {}"
        results = nee.evaluate_cmd(method_name=methods[i], method_type='ne', command=command,
-                                   edge_embedding_methods=edge_emb, input_delim=' ', output_delim=' ')
+                                   edge_embedding_methods=edge_emb, input_delim=' ', output_delim=' ',  verbose=args.verbose)
        scoresheet.log_results(results)

 except ImportError:
@@ -66,5 +69,6 @@ except ImportError:
    pass

 # Get output
-scoresheet.print_tabular()
+if args.verbose:
+    scoresheet.print_tabular()
 scoresheet.write_all(args.edgelist_graph_filename+"_results_lp")
\ No newline at end of file
--- a/lib/draw.py
+++ b/lib/draw.py
 import matplotlib.pyplot as plt
 import matplotlib.patheffects as path_effects
+import seaborn as sns
 import networkx as nx
 import pandas as pd
 import numpy as np
+from glob import glob

 from fa2 import ForceAtlas2


 def get_force_atlas(weight_influence=0, scaling_ratio=3.0, gravity=5):
+    """
+    Return an instance of ForceAtlas with a specific configuration
+    Parameters
+    ----------
+    weight_influence: float
+        between 0 and 1 (default 0)
+    scaling_ratio : float or int
+        see fa2 documentation(default 3)
+    gravity : float or int
+        see fa2 documentation (default 5)
+
+    Returns
+    -------
+    ForceAtlas2
+        instance of ForceAtlas2
+    """
    forceatlas2 = ForceAtlas2(
        # Behavior alternatives
        outboundAttractionDistribution=True,  # Dissuade hubs
@@ -32,20 +50,29 @@ def get_force_atlas(weight_influence=0, scaling_ratio=3.0, gravity=5):


 def draw(G, labels_dict={}, iteration_force_atlase=2000, figsize=(40, 20), font_size=12, stroke_width=3,
-         stroke_color="black", font_color="white", edge_cmap=plt.cm.viridis, weight = True):
+         stroke_color="black", font_color="white", edge_cmap=plt.cm.viridis, weight=True):
    """
-    Return a figure of the current graph
+    Return a figure of a NetworkX graph
    Parameters
    ----------
-    G
-    labels_dict
-    iteration_force_atlase
-    figsize
-    font_size
-    stroke_width
-    stroke_color
-    font_color
-    edge_cmap
+    G : nx.Graph
+        graph instance
+    labels_dict: dict
+        label for each node id
+    iteration_force_atlase: int
+        nb of iteration for the Force Atlas algorithm
+    figsize: tuple
+        figure size (matplotlib)
+    font_size: int
+        font size
+    stroke_width : int
+        text contour size
+    stroke_color: str
+        text contour color
+    font_color : str
+        text color
+    edge_cmap: matplotlib.pyplot.cm
+        Matplotlib Colormap instance used when edges are associated with a weight

    Returns
    -------
@@ -63,8 +90,8 @@ def draw(G, labels_dict={}, iteration_force_atlase=2000, figsize=(40, 20), font_
    fig, ax = plt.subplots(1, figsize=figsize)

    # Draw nodes
-    nx.draw_networkx_nodes(G, positions, node_color='#999', ax=ax)
-
+    nodes = nx.draw_networkx_nodes(G, positions, node_color='#999', ax=ax)
+    edges = None
    # Draw edges
    if weight:
        weights_width = [G[u][v]['weight'] * 200 for u, v in list(G.edges())]
@@ -72,7 +99,7 @@ def draw(G, labels_dict={}, iteration_force_atlase=2000, figsize=(40, 20), font_
        edges = nx.draw_networkx_edges(G, positions, edge_color=colors, width=weights_width,
                                       edge_cmap=edge_cmap, ax=ax)
    else:
-        edges = nx.draw_networkx_edges(G, positions, ax=ax,edge_color="#999")
+        edges = nx.draw_networkx_edges(G, positions, ax=ax, edge_color="#999")

    # Plot nodes label
    for node, pos in positions.items():
@@ -87,6 +114,38 @@ def draw(G, labels_dict={}, iteration_force_atlase=2000, figsize=(40, 20), font_
                               path_effects.Normal()])  # effet de style
    # Plot colorbar
    if weight:
-        plt.colorbar(edges)
+        sm = plt.cm.ScalarMappable(cmap=edge_cmap, norm=plt.Normalize(vmin=min(colors), vmax=max(colors)))
+        sm.set_array([])
+        fig.colorbar(sm)
    plt.axis("off")
+    plt.tight_layout()
+    return fig, ax
+
+
+def average_degree(graph_dir, ext=".txt"):
+    """
+    Produce a figure that shows the average degree per number of edges in a graph dataset.
+
+    Every file matching ``graph_dir/*ext`` is read as a header-less,
+    comma-separated edgelist and loaded as a directed graph; each graph
+    contributes one point (number of edges, mean node degree) to the plot.
+
+    Parameters
+    ----------
+    graph_dir: str
+        graph dataset directory path
+    ext : str
+        extension of the graph file (must be edgelist format)
+
+    Returns
+    -------
+        Figure, AxesSubplot
+    """
+    data = []
+    for fn in glob(graph_dir + "/*" + ext):
+        # usecols pins source/target to the first two columns: without it, a
+        # file holding extra columns (e.g. a weight) makes pandas use the
+        # leading column as index, which silently shifts source/target
+        df = pd.read_csv(fn, header=None, names="source target".split(), usecols=[0, 1])
+        G = nx.from_pandas_edgelist(df, create_using=nx.DiGraph())
+        # Read the degrees straight from the (node, degree) pairs so that
+        # non-numeric node ids cannot coerce the degrees to strings
+        degree_values = [degree for _, degree in G.degree()]
+        data.append([G.number_of_edges(), np.mean(degree_values)])
+    df = pd.DataFrame(data, columns="nb_edges avg_degree".split())
+    fig, ax = plt.subplots(1, figsize=(10, 5))
+    ax = sns.scatterplot(data=df, x="nb_edges", y="avg_degree", hue="nb_edges", legend=False, ax=ax)
+    ax.set(xlabel="Number of edges", ylabel="Average Degree")
    return fig, ax
--- a/lib/helpers.py
+++ b/lib/helpers.py
 import pandas as pd
-import geopandas as gpd
 import numpy as np
 import networkx as nx
-import graph_tool as gt
+import os
+
+try:
+    import graph_tool as gt
+except:
+    pass
+
+def parse_evalne_output(string):
+    """
+    Parse a textual results log into a DataFrame with one row per logged entry.
+
+    Only the text after the last "---------------------------" separator is
+    read. Entries are separated by a line holding a single space; the first
+    line of an entry is its name and each following line is a tab-separated
+    key/value pair (surrounding whitespace and ":" are stripped from both).
+
+    Parameters
+    ----------
+    string : str
+        raw log text (presumably the output printed by EvalNE -- TODO confirm)
+
+    Returns
+    -------
+    pd.DataFrame
+        one row per entry, one column per key plus a "name" column
+    """
+    import ast
+
+    def parse_value(raw):
+        # literal_eval only accepts Python literals, so unlike eval() it can
+        # neither run code found in the log nor resolve a word to a builtin;
+        # anything that is not a literal is kept as the original string
+        try:
+            return ast.literal_eval(raw)
+        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
+            return raw
+
+    body = string.split("---------------------------")[-1]
+    records = []
+    for entry in body.split("\n \n"):
+        lines = entry.strip().split("\n")
+        record = {}
+        for line in lines[1:]:
+            fields = [field.strip().strip(":") for field in line.split("\t")]
+            if len(fields) < 2:
+                # no tab-separated value on this line: nothing to record
+                continue
+            record[fields[0]] = parse_value(fields[1])
+        record["name"] = lines[0].strip(":")
+        records.append(record)
+    return pd.DataFrame.from_records(records)


 def get_centroid(gdf,key_id):

--- a/lib/utils.py
+++ b/lib/utils.py
 import pandas as pd
 from sklearn.preprocessing import LabelEncoder
 import numpy as np
+import networkx as nx


 def load_country_country_data(filename, self_link=False):
@@ -70,3 +71,16 @@ def to_edgelist(sample, encoder, weight=False):
    if not weight:
        del new_df["weight"]
    return new_df
+
+def load_edgelist(path, weighted=False, is_directed=False, sep=","):
+    """
+    Load a header-less edgelist file into a NetworkX graph.
+
+    Parameters
+    ----------
+    path : str
+        path of the edgelist file (no header row)
+    weighted : bool
+        if True, a third column is read and stored as the "weight" edge attribute
+    is_directed : bool
+        if True the graph is a nx.DiGraph, otherwise a nx.Graph
+    sep : str
+        column separator (default ",")
+
+    Returns
+    -------
+    nx.Graph or nx.DiGraph
+        graph built from the edgelist
+    """
+    template = nx.DiGraph() if is_directed else nx.Graph()
+
+    if weighted:
+        df = pd.read_csv(path, header=None, names="source target weight".split(), sep=sep)
+        return nx.from_pandas_edgelist(df, edge_attr="weight", create_using=template)
+
+    # usecols pins source/target to the first two columns: without it, a file
+    # that also carries a weight column makes pandas use the leading column as
+    # index, which silently shifts source/target
+    df = pd.read_csv(path, header=None, names="source target".split(), usecols=[0, 1], sep=sep)
+    return nx.from_pandas_edgelist(df, create_using=template)
--- a/requirements.txt
+++ b/requirements.txt
 pandas
 numpy
 sklearn
+seaborn