# generate_random_graph.py

import numpy as np

from joblib import dump
import networkx as nx
from sklearn.preprocessing import LabelEncoder

from lib.utils import load_country_country_data, sample_with_pandas, to_edgelist

import argparse
import os

def _parse_dimensions(text):
    """Convert a comma-separated string such as ``"50,100"`` to a list of ints.

    Raises:
        argparse.ArgumentTypeError: if any item is not a valid integer, so
            argparse prints a readable message instead of the default
            ``invalid <lambda> value`` produced by an anonymous converter.
    """
    try:
        return [int(item) for item in text.split(',')]
    except ValueError:
        raise argparse.ArgumentTypeError(
            "expected comma-separated integers, got {0!r}".format(text))


# Command-line interface: two positional paths plus sampling options.
parser = argparse.ArgumentParser()

parser.add_argument("input_tsv",
    help="Path handed to load_country_country_data")
parser.add_argument("output_dir",
    help="Existing directory where the .gml files are written")
parser.add_argument('-d', '--dimensions', help='Size of generated graph',
    type=_parse_dimensions, default=[50,100,200,500,1000])
parser.add_argument("-n",type=int,help="Number of graph generated per size",default=6)
parser.add_argument("-s","--self-link",action="store_true",
    help="Load the data with self_link=True")

# Parsed once at import time; the rest of the script reads from `args`.
args = parser.parse_args()

# Stop immediately when the output directory is unusable. Merely printing a
# warning would let the script load and sample the data with nowhere to put
# the results. isdir() (rather than exists()) also rejects a regular file
# sitting at that path.
if not os.path.isdir(args.output_dir):
    raise SystemExit("Output Dir does not exists !")

# Read the input table; the --self-link flag is forwarded to the loader.
df = load_country_country_data(args.input_tsv, self_link=args.self_link)

# Build an order-independent key per row so that (A, B) and (B, A) collide,
# then keep only the first row seen for each key.
pair_key = df.apply(
    lambda rec: "_".join(sorted((rec.user_loc, rec.fr_loc))),
    axis=1,
)
df["hash"] = pair_key
df = df.drop_duplicates(subset=["hash"])

# Normalise the sci index: divide every value by the column total.
sci_total = df.scaled_sci.sum()
df["norm_scaled_sci"] = df.scaled_sci / sci_total

# Fit a label encoder on every value found in user_loc or fr_loc.
all_locations = np.concatenate((df.user_loc.values, df.fr_loc.values))
encoder = LabelEncoder()
encoder.fit(all_locations)

# Produce args.n graphs for every requested size. Pairs are visited run by
# run, with each run walking the sizes in the order given on the command line.
run_size_pairs = ((run, dim) for run in range(args.n) for dim in args.dimensions)
for i, size in run_size_pairs:
    # Sample edges using the normalised FB social interconnectedness index.
    sampled_edges = sample_with_pandas(df, size)
    graph = nx.from_pandas_edgelist(
        sampled_edges,
        source="user_loc",
        target="fr_loc",
        edge_attr="weight",
        create_using=nx.Graph(),
    )
    gml_name = "/fb_country_country_sample_{0}_size{1}.gml".format(i, size)
    nx.write_gml(graph, args.output_dir + gml_name)
    # Alternative edgelist export, currently disabled:
    #output_df = to_edgelist(sampled_edges,encoder,weight=True) # Parse to edgelist format
    #output_df.to_csv(args.output_dir + "/fb_country_country_sample_{0}_size{1}.txt".format(i,size),index=False,header= False,sep=",") # Save the output

# Save encoder to reverse the label transformation
#dump(encoder,args.output_dir + "/encoder.joblib")