-
Fize Jacques authored
Add graph generator for stochastic block model, configuration model that takes nb of nodes and nb of edges in parameter + Debug
b720b5e5
utils.py 2.28 KiB
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import networkx as nx
def load_country_country_data(filename, self_link=False):
    """
    Load and preprocess the country-to-country connection table.

    The file is read as tab-separated values and every missing cell is
    replaced by the string "NA". Rows whose ``user_loc`` or ``fr_loc`` is one
    of the ignored country codes are dropped, and, unless ``self_link`` is
    set, so are rows linking a country to itself.

    Parameters
    ----------
    filename: str
        input filename (anything accepted by ``pandas.read_csv``)
    self_link: bool
        keep (True) or drop (False) rows where ``user_loc == fr_loc``

    Returns
    -------
    pandas.Dataframe
        filtered data, with the original columns and original row index
    """
    # pandas parses the literal text "NA" as a missing value by default.
    # NOTE(review): the fillna presumably restores a country code "NA" that
    # was read as NaN -- confirm; it also fills any other missing cell.
    df = pd.read_csv(filename, sep="\t").fillna("NA")
    ignored = ["CW", "XK"]  # No coords for these two countries ... got to investigate!
    keep = ~df["user_loc"].isin(ignored) & ~df["fr_loc"].isin(ignored)
    if not self_link:
        # Vectorised comparison: avoids a Python-level call per row and keeps
        # a proper boolean mask even when the frame is empty.
        keep &= df["user_loc"] != df["fr_loc"]
    return df[keep]
def sample_with_pandas(df, N):
    """
    Draw a weighted sample of the available connections with
    ``pandas.DataFrame.sample``.

    Rows are weighted by the ``norm_scaled_sci`` column. When that column is
    missing it is computed as ``scaled_sci / scaled_sci.sum()`` and stored on
    ``df`` itself, so the input frame is modified in place on first use and
    the stored column is reused on later calls.

    Parameters
    ----------
    df : pandas.Dataframe
        input; must have a ``scaled_sci`` column unless ``norm_scaled_sci``
        is already present
    N : int
        number of rows to draw

    Returns
    -------
    pd.DataFrame
        Selected edges, with ``norm_scaled_sci`` renamed to ``weight``
    """
    weight_col = "norm_scaled_sci"
    if weight_col not in df.columns:
        total_sci = df.scaled_sci.sum()
        # Stored on the caller's frame so it is not recomputed next time.
        df[weight_col] = df.scaled_sci / total_sci
    drawn = df.sample(n=N, weights=weight_col)
    return drawn.rename(columns={weight_col: "weight"})
def to_edgelist(sample, encoder, weight=False):
    """
    Parse FB SCI dataframe to edgelist format

    Works on a copy of ``sample``: the ``fr_loc`` and ``user_loc`` columns are
    replaced by ``encoder.transform`` of their values, the ``scaled_sci``
    column is removed, and the ``weight`` column is removed too unless
    ``weight`` is True. Columns that are already absent are ignored.

    Parameters
    ----------
    sample : pandas.Dataframe
        dataframe with ``fr_loc`` and ``user_loc`` columns
    encoder : sklearn.preprocessing.LabelEncoder
        encoder; only its ``transform`` method is used, so it must already
        know every label found in ``fr_loc`` and ``user_loc``
    weight : bool
        include (or not) FB SC index in output

    Returns
    -------
    pandas.Dataframe
        new dataframe with encoded endpoints; ``sample`` is left untouched
    """
    edges = sample.copy()
    edges["fr_loc"] = encoder.transform(edges["fr_loc"].values)
    edges["user_loc"] = encoder.transform(edges["user_loc"].values)
    unwanted = ["scaled_sci"] if weight else ["scaled_sci", "weight"]
    # errors="ignore": a frame that never went through sampling has no
    # "weight" column; asking to leave it out must not raise KeyError.
    return edges.drop(columns=unwanted, errors="ignore")
def load_edgelist(path, weighted=False, is_directed=False, sep=","):
    """
    Load a header-less edgelist file into a networkx graph.

    The first two columns of the file are read as ``source`` and ``target``;
    when ``weighted`` is True the third column is read as ``weight`` and
    attached to each edge as the ``weight`` attribute. Any further columns
    are ignored.

    Parameters
    ----------
    path : str
        edgelist filename (anything accepted by ``pandas.read_csv``)
    weighted : bool
        read the third column as edge weight
    is_directed : bool
        build a ``networkx.DiGraph`` instead of a ``networkx.Graph``
    sep : str
        column separator used in the file

    Returns
    -------
    networkx.Graph or networkx.DiGraph
        graph built from the edgelist
    """
    columns = ["source", "target", "weight"] if weighted else ["source", "target"]
    # usecols pins the names to the leading columns. Without it, a file with
    # more columns than names makes pandas use the extra leading column(s) as
    # the index, so a weighted file read with weighted=False would silently
    # build edges from columns 2 and 3.
    edges = pd.read_csv(
        path,
        header=None,
        names=columns,
        usecols=list(range(len(columns))),
        sep=sep,
    )
    graph = nx.DiGraph() if is_directed else nx.Graph()
    return nx.from_pandas_edgelist(
        edges,
        edge_attr="weight" if weighted else None,
        create_using=graph,
    )