# utils.py
# Committed by Fize Jacques.
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import networkx as nx
# Committed by Fize Jacques.


def load_country_country_data(filename, self_link=False):
    """
    Load and preprocess country-to-country connection data.

    The file is read as tab-separated values, every missing value is replaced
    by the string "NA", rows whose ``user_loc`` or ``fr_loc`` is one of the
    ignored country codes are dropped and, unless ``self_link`` is set, rows
    linking a country to itself are removed.

    Parameters
    ----------
    filename: str
        input filename (tab-separated, with ``user_loc`` and ``fr_loc`` columns)
    self_link: bool
        if True, keep rows where ``user_loc`` equals ``fr_loc``;
        if False (default), drop them

    Returns
    -------
    pandas.DataFrame
        filtered data
    """
    # NOTE(review): presumably fillna("NA") restores a literal "NA" location
    # code that read_csv parses as a missing value -- confirm. Be aware that
    # it is applied to every column, not only the location ones.
    df = pd.read_csv(filename, sep="\t").fillna("NA")

    ign = ["CW", "XK"]  # No coords for these two countries ... got to investigate!
    involves_ignored = df["user_loc"].isin(ign) | df["fr_loc"].isin(ign)
    df = df[~involves_ignored]

    if not self_link:
        # Vectorized comparison instead of a row-wise apply: same result,
        # no Python-level loop over the rows.
        df = df[df["user_loc"] != df["fr_loc"]]
    return df


def sample_with_pandas(df, N, random_state=None):
    """
    Return a sample of the available connections using the pandas
    DataFrame.sample() method, weighted by the normalized ``scaled_sci``.

    If ``df`` has no ``norm_scaled_sci`` column, it is computed as
    ``scaled_sci / scaled_sci.sum()`` and stored on ``df`` itself (the input
    frame is modified in place), so later calls reuse it.

    Parameters
    ----------
    df : pandas.DataFrame
        input; must contain a ``scaled_sci`` column (or an existing
        ``norm_scaled_sci`` column)
    N : int
        number of rows to draw
    random_state : int, numpy.random.RandomState or None
        forwarded to DataFrame.sample(); pass a fixed value to obtain a
        reproducible sample (default None)

    Returns
    -------
    pd.DataFrame
        Selected edges, with the ``norm_scaled_sci`` column renamed to ``weight``
    """
    if "norm_scaled_sci" not in df.columns:
        df["norm_scaled_sci"] = df.scaled_sci / df.scaled_sci.sum()
    picked = df.sample(n=N, weights="norm_scaled_sci", random_state=random_state)
    return picked.rename(columns={"norm_scaled_sci": "weight"})


def to_edgelist(sample, encoder, weight=False):
    """
    Parse FB SCI dataframe to edgelist format.

    The ``fr_loc`` and ``user_loc`` columns are replaced by the output of
    ``encoder.transform``; the ``scaled_sci`` column is removed, and so is the
    ``weight`` column unless ``weight`` is True. The input frame is left
    untouched (a copy is modified).

    Parameters
    ----------
    sample : pandas.DataFrame
        dataframe with ``user_loc`` and ``fr_loc`` columns
    encoder : sklearn.preprocessing.LabelEncoder
        encoder; only its ``transform`` method is used here
    weight : bool
        include (or not) FB SC index in output

    Returns
    -------
    pandas.DataFrame
        copy of ``sample`` with encoded ``fr_loc`` / ``user_loc`` and without
        the ``scaled_sci`` column (and without ``weight`` when ``weight`` is False)
    """
    edges = sample.copy()
    edges["fr_loc"] = encoder.transform(edges.fr_loc.values)
    edges["user_loc"] = encoder.transform(edges.user_loc.values)

    unwanted = ["scaled_sci"]
    if not weight:
        unwanted.append("weight")
    # errors="ignore": a frame that lacks one of these columns already
    # satisfies the request, so it must not fail with a KeyError.
    return edges.drop(columns=unwanted, errors="ignore")

def load_edgelist(path, weighted=False, is_directed=False, sep=","):
    """
    Read a header-less edgelist file and build a networkx graph from it.

    Parameters
    ----------
    path : str
        edgelist filename; columns are read as ``source``, ``target`` and,
        when ``weighted`` is True, ``weight``
    weighted : bool
        if True, read a third ``weight`` column and store it as an edge attribute
    is_directed : bool
        if True, build a ``networkx.DiGraph``; otherwise a ``networkx.Graph``
    sep : str
        column separator passed to ``pandas.read_csv``

    Returns
    -------
    networkx.Graph or networkx.DiGraph
        graph built from the edgelist
    """
    graph_template = nx.DiGraph() if is_directed else nx.Graph()

    column_names = ["source", "target"]
    if weighted:
        column_names.append("weight")

    edges = pd.read_csv(path, header=None, names=column_names, sep=sep)
    # edge_attr=None is the networkx default, i.e. no edge attribute is loaded.
    edge_attr = "weight" if weighted else None
    return nx.from_pandas_edgelist(edges, edge_attr=edge_attr, create_using=graph_template)