# utils.py
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import networkx as nx


def load_country_country_data(filename, self_link=False):
    """
    Load and preprocess the country-to-country connection data.

    Parameters
    ----------
    filename: str
        path of a tab-separated file with a header row; the columns
        ``user_loc`` and ``fr_loc`` are required
    self_link: bool
        if False (default), rows whose ``user_loc`` equals ``fr_loc`` are
        removed; if True they are kept

    Returns
    -------
    pandas.Dataframe
        the filtered rows (original index labels are preserved)
    """
    # pandas parses the literal text "NA" as a missing value by default;
    # fillna("NA") turns those cells (and any other missing cell) back into
    # the string "NA" (presumably the "NA" country code -- confirm).
    df = pd.read_csv(filename, sep="\t").fillna("NA")
    ign = ["CW", "XK"]  # No coords for these two countries ... got to investigate!
    df = df[~df.user_loc.isin(ign) & ~df.fr_loc.isin(ign)]
    if not self_link:
        # Vectorised comparison: no per-row Python call, and it always yields
        # a boolean Series, even when the frame is empty.
        df = df[df.user_loc != df.fr_loc]
    return df


def sample_with_pandas(df, N):
    """
    Return a weighted random sample of the available connections using the
    pandas ``DataFrame.sample()`` method.

    Rows are drawn without replacement, with probability proportional to the
    ``norm_scaled_sci`` column. If that column is absent it is computed as
    ``scaled_sci / scaled_sci.sum()`` on a copy, so the caller's dataframe is
    never modified.

    Parameters
    ----------
    df : pandas.Dataframe
        input; must contain ``scaled_sci`` (or a precomputed
        ``norm_scaled_sci``)
    N : int
        number of rows to draw

    Returns
    -------
    pd.DataFrame
        Selected edges, with ``norm_scaled_sci`` renamed to ``weight``
    """
    if "norm_scaled_sci" in df.columns:
        weighted = df
    else:
        # assign() returns a new frame: the input keeps its columns and no
        # SettingWithCopyWarning is raised when df is a slice of another frame.
        weighted = df.assign(norm_scaled_sci=df.scaled_sci / df.scaled_sci.sum())
    picked = weighted.sample(n=N, weights="norm_scaled_sci")
    return picked.rename(columns={"norm_scaled_sci": "weight"})


def to_edgelist(sample, encoder, weight=False):
    """
    Convert a sampled FB SCI dataframe to edgelist format.

    Parameters
    ----------
    sample : pandas.Dataframe
        dataframe with ``user_loc``, ``fr_loc``, ``scaled_sci`` columns (and
        ``weight`` when ``weight`` is False, since it is then removed)
    encoder : sklearn.preprocessing.LabelEncoder
        fitted encoder whose ``transform`` maps location labels to codes
    weight : bool
        include (or not) FB SC index in output

    Returns
    -------
    pandas.Dataframe
        copy of ``sample`` with encoded ``fr_loc`` / ``user_loc`` and without
        ``scaled_sci`` (and without ``weight`` unless requested); the input
        is left untouched
    """
    fr_codes = encoder.transform(sample.fr_loc.values)
    user_codes = encoder.transform(sample.user_loc.values)
    encoded = sample.assign(fr_loc=fr_codes, user_loc=user_codes)
    unwanted = ["scaled_sci"] if weight else ["scaled_sci", "weight"]
    return encoded.drop(columns=unwanted)

def load_edgelist(path, weighted=False, is_directed=False, sep=","):
    """
    Load a header-less delimited edge list file into a networkx graph.

    Parameters
    ----------
    path : str
        input filename; column 0 is the source node, column 1 the target
        node and, when ``weighted`` is True, column 2 the edge weight
    weighted : bool
        read the third column and store it as the ``weight`` edge attribute
    is_directed : bool
        build a ``networkx.DiGraph`` instead of a ``networkx.Graph``
    sep : str
        column delimiter passed to ``pandas.read_csv``

    Returns
    -------
    networkx.Graph or networkx.DiGraph
        graph built from the edge list
    """
    columns = ["source", "target"]
    if weighted:
        columns.append("weight")
    # Select the columns by position. Without usecols, a file holding more
    # columns than names (e.g. a weighted file read with weighted=False) makes
    # pandas use the leading column as the index, silently shifting
    # source/target onto the wrong columns.
    df = pd.read_csv(
        path,
        header=None,
        names=columns,
        usecols=list(range(len(columns))),
        sep=sep,
    )
    template = nx.DiGraph() if is_directed else nx.Graph()
    return nx.from_pandas_edgelist(
        df,
        edge_attr="weight" if weighted else None,
        create_using=template,
    )