Skip to content
Snippets Groups Projects
utils.py 1.28 KiB
Newer Older
Fize Jacques's avatar
Fize Jacques committed

import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

def load_country_country_data(filename,self_link=False):
    """
    Load a tab-separated country-to-country connection table and filter it.

    Rows with a missing ``user_loc`` or ``fr_loc`` are dropped, as are rows
    in which either endpoint is one of the ignored country codes.

    Parameters
    ----------
    filename : str or file-like
        Passed directly to ``pd.read_csv`` (tab separator). The table must
        contain the columns ``user_loc`` and ``fr_loc``.
    self_link : bool, optional
        If False (default), rows where ``user_loc`` equals ``fr_loc`` are
        removed as well.

    Returns
    -------
    pd.DataFrame
        The remaining rows; the index labels assigned by ``pd.read_csv``
        are kept (the index is not reset).
    """
    # NOTE(review): with read_csv's default NA handling the literal string
    # "NA" is parsed as missing, so rows using "NA" as a country code would
    # be dropped by the filter below -- confirm whether that is intended.
    df = pd.read_csv(filename, sep="\t")
    df = df[df.user_loc.notna() & df.fr_loc.notna()]

    ign = ["CW", "XK"]  # No coords for these two countries ... got to investigate!
    df = df[~df.user_loc.isin(ign) & ~df.fr_loc.isin(ign)]

    if not self_link:
        # Vectorised comparison: avoids a Python-level call per row and
        # still yields a proper boolean mask when the frame is empty.
        df = df[df.user_loc != df.fr_loc]
    return df


def sample_with_pandas(df,N):
    """
    Return a sample of the available connections using the pandas
    ``DataFrame.sample()`` method, weighted by ``scaled_sci``.

    If ``df`` already has a ``norm_scaled_sci`` column it is used as the
    sampling weight as-is; otherwise the weight is computed as
    ``scaled_sci / scaled_sci.sum()``. The input frame is never modified.

    Parameters
    ----------
    df : pd.DataFrame
        Input connections; must contain ``scaled_sci`` unless
        ``norm_scaled_sci`` is already present.
    N : int
        Number of rows to draw (forwarded as ``n`` to ``DataFrame.sample``).

    Returns
    -------
    pd.DataFrame
        Selected edges, with the weight column renamed to ``weight``.
    """
    if "norm_scaled_sci" in df.columns:
        weighted = df
    else:
        # assign() works on a copy, so the caller's frame does not silently
        # gain an extra column (and no SettingWithCopy warning is raised
        # when ``df`` is itself a filtered view).
        weighted = df.assign(norm_scaled_sci=df.scaled_sci / df.scaled_sci.sum())
    sampled = weighted.sample(n=N, weights="norm_scaled_sci")
    return sampled.rename(columns={"norm_scaled_sci": "weight"})


def to_edgelist(sample,encoder,weight=False):
    """
    Convert sampled connections into an edge list of encoded node ids.

    Parameters
    ----------
    sample : pd.DataFrame
        Sampled connections with ``user_loc``, ``fr_loc``, ``scaled_sci``
        and (when ``weight`` is False) ``weight`` columns.
    encoder : object
        Anything exposing ``transform(values)``; it is applied to the
        ``fr_loc`` values first, then to the ``user_loc`` values.
    weight : bool, optional
        If False (default), the ``weight`` column is removed from the result.

    Returns
    -------
    pd.DataFrame
        A new frame (``sample`` is left untouched) with both location
        columns replaced by their encoded values and ``scaled_sci`` dropped.
    """
    encoded = sample.assign(
        fr_loc=encoder.transform(sample.fr_loc.values),
        user_loc=encoder.transform(sample.user_loc.values),
    )
    unwanted = ["scaled_sci"] if weight else ["scaled_sci", "weight"]
    return encoded.drop(columns=unwanted)