Newer
Older
import argparse
import os

import networkx as nx
import numpy as np
from sklearn.preprocessing import LabelEncoder

from lib.utils import load_country_country_data, sample_with_pandas, to_edgelist
# Command-line script: sample random sub-graphs from a country-to-country
# table and write each sample as a GML file into an existing directory.


def _parse_dimensions(value):
    """Parse a comma-separated list of integers, e.g. ``"50,100,200"``.

    Raises ``ValueError`` on a non-integer item, which argparse reports as
    an invalid argument value.
    """
    return [int(item) for item in value.split(",")]


parser = argparse.ArgumentParser()
parser.add_argument("input_tsv")
parser.add_argument("output_dir")
parser.add_argument(
    "-d",
    "--dimensions",
    help='Size of generated graph',
    type=_parse_dimensions,
    default=[50, 100, 200, 500, 1000],
)
parser.add_argument(
    "-n", type=int, help="Number of graph generated per size", default=6
)
parser.add_argument("-s", "--self-link", action="store_true")
args = parser.parse_args()

# Fail fast: without this the script would load and sample the data and only
# then crash on the first write into the missing directory.
if not os.path.isdir(args.output_dir):
    parser.error("Output Dir does not exists !")

# Load the data
df = load_country_country_data(args.input_tsv, self_link=args.self_link)

# Build an order-independent key for each pair so that (A, B) and (B, A)
# collapse to a single row; only the first occurrence of each pair is kept.
df["hash"] = df.apply(
    lambda row: "_".join(sorted([row.user_loc, row.fr_loc])), axis=1
)
df = df.drop_duplicates(subset=["hash"])

# Normalise the sci index so that the column sums to 1.
df["norm_scaled_sci"] = df.scaled_sci / df.scaled_sci.sum()

# Fit the label encoder on every location code seen on either side of a pair.
# NOTE(review): the encoder is only consumed by the commented-out edgelist
# export below; it has no effect on the GML output.
encoder = LabelEncoder()
encoder.fit(np.concatenate((df.user_loc.values, df.fr_loc.values)))

for i in range(args.n):  # For a number of graph
    for size in args.dimensions:  # Per size
        # Sample edges using the normalised FB social interconnectedness index
        # (presumably driven by "norm_scaled_sci"; verify in lib.utils).
        test = sample_with_pandas(df, size)
        # NOTE(review): assumes the sampled frame carries a "weight" column;
        # confirm against sample_with_pandas.
        G = nx.from_pandas_edgelist(
            test,
            source="user_loc",
            target="fr_loc",
            edge_attr="weight",
            create_using=nx.Graph(),
        )
        output_path = os.path.join(
            args.output_dir,
            "fb_country_country_sample_{0}_size{1}.gml".format(i, size),
        )
        nx.write_gml(G, output_path)
        #output_df = to_edgelist(test,encoder,weight=True) # Parse to edgelist format
        #output_df.to_csv(args.output_dir + "/fb_country_country_sample_{0}_size{1}.txt".format(i,size),index=False,header= False,sep=",") # Save the output
# Save encoder to reverse the label transformation