import networkx as nx
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def load_country_country_data(filename: str, self_link: bool = False) -> pd.DataFrame:
    """
    Load and preprocess the country-to-country data.

    The file is read as tab-separated values, missing values are replaced by
    the string ``"NA"``, rows involving an ignored location are dropped and,
    unless ``self_link`` is set, rows whose ``user_loc`` equals ``fr_loc`` are
    dropped as well.

    Parameters
    ----------
    filename : str
        input filename (tab-separated, with ``user_loc`` and ``fr_loc`` columns)
    self_link : bool
        keep (True) or drop (False) rows linking a location to itself

    Returns
    -------
    pandas.DataFrame
        filtered data
    """
    # NOTE(review): presumably the fillna restores the location code "NA",
    # which read_csv parses as a missing value -- confirm against the data.
    df = pd.read_csv(filename, sep="\t").fillna("NA")

    # TODO: no coords for these two countries ... got to investigate!
    ignored = ["CW", "XK"]
    keep = ~df["user_loc"].isin(ignored) & ~df["fr_loc"].isin(ignored)
    df = df[keep]

    if not self_link:
        # Vectorized comparison: same result as a row-wise apply, but it also
        # yields a proper boolean mask when the frame is empty.
        df = df[df["user_loc"] != df["fr_loc"]]
    return df


def sample_with_pandas(df: pd.DataFrame, N: int, random_state=None) -> pd.DataFrame:
    """
    Return a sample of the available connections using the pandas
    ``DataFrame.sample()`` method, weighted by the normalized ``scaled_sci``.

    The input dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        input; must have a ``scaled_sci`` column unless it already carries a
        ``norm_scaled_sci`` column, which is then used as-is
    N : int
        number of rows to draw (without replacement)
    random_state : optional
        forwarded to ``DataFrame.sample`` for reproducible draws; the default
        ``None`` keeps the non-seeded behaviour

    Returns
    -------
    pandas.DataFrame
        selected edges, with the normalized weight in a ``weight`` column
    """
    if "norm_scaled_sci" not in df.columns:
        # assign() returns a new frame, so the caller's dataframe is untouched.
        df = df.assign(norm_scaled_sci=df["scaled_sci"] / df["scaled_sci"].sum())
    picked = df.sample(n=N, weights="norm_scaled_sci", random_state=random_state)
    return picked.rename(columns={"norm_scaled_sci": "weight"})


def to_edgelist(sample: pd.DataFrame, encoder, weight: bool = False) -> pd.DataFrame:
    """
    Parse a FB SCI dataframe to edgelist format.

    Parameters
    ----------
    sample : pandas.DataFrame
        dataframe with ``fr_loc``, ``user_loc``, ``scaled_sci`` and ``weight``
        columns
    encoder : sklearn.preprocessing.LabelEncoder
        fitted encoder; only its ``transform`` method is used
    weight : bool
        include (or not) the ``weight`` column in the output

    Returns
    -------
    pandas.DataFrame
        copy of ``sample`` with encoded ``fr_loc`` / ``user_loc``, without
        ``scaled_sci`` and, unless ``weight`` is True, without ``weight``
    """
    new_df = sample.copy()
    new_df["fr_loc"] = encoder.transform(new_df.fr_loc.values)
    new_df["user_loc"] = encoder.transform(new_df.user_loc.values)

    dropped = ["scaled_sci"]
    if not weight:
        dropped.append("weight")
    return new_df.drop(columns=dropped)


def load_edgelist(path, weighted: bool = False, is_directed: bool = False, sep: str = ","):
    """
    Load a header-less edgelist file into a networkx graph.

    Parameters
    ----------
    path : str
        input filename; columns are read as source, target and, when
        ``weighted`` is set, weight
    weighted : bool
        read a third column and store it as the ``weight`` edge attribute
    is_directed : bool
        build a ``networkx.DiGraph`` instead of a ``networkx.Graph``
    sep : str
        column separator

    Returns
    -------
    networkx.Graph or networkx.DiGraph
        graph built from the edgelist
    """
    template = nx.DiGraph() if is_directed else nx.Graph()

    columns = ["source", "target"]
    if weighted:
        columns.append("weight")

    df = pd.read_csv(path, header=None, names=columns, sep=sep)
    return nx.from_pandas_edgelist(
        df,
        edge_attr="weight" if weighted else None,
        create_using=template,
    )