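# Utilities for the FB SCI country-to-country data: loading and filtering the
# table, weighted sampling of connections, and edge-list conversion/loading.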
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
def load_country_country_data(filename, self_link=False):
    """
    Load and preprocess the country-to-country connection table.

    The table is read from ``filename``; rows involving a country listed in
    ``ign`` are removed and, unless ``self_link`` is set, rows linking a
    country to itself are removed as well.

    Parameters
    ----------
    filename: str
        Path of the delimited text file to read. It must provide the
        ``user_loc`` and ``fr_loc`` columns. The field separator is
        auto-detected by pandas.
    self_link: bool
        If True, keep rows where ``user_loc`` equals ``fr_loc``;
        if False (default), drop them.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, keeping the index assigned when the file was read.
    """
    # NOTE(review): the statement that builds `df` was missing from the
    # reviewed source (`df` was used before any assignment). The read below
    # is a reconstruction -- confirm separator/NA handling against the data.
    df = pd.read_csv(
        filename,
        sep=None,  # let the python engine sniff "," / "\t" / ...
        engine="python",
        # Keep the literal code "NA" as text instead of turning it into NaN;
        # only genuinely empty fields are treated as missing.
        keep_default_na=False,
        na_values=[""],
    )

    ign = ["CW", "XK"]  # No coords for these two countries ... got to investigate!
    df = df[(~df.user_loc.isin(ign)) & (~df.fr_loc.isin(ign))]

    if not self_link:
        # Vectorised comparison: same mask as the former row-wise apply, but
        # it stays a boolean Series even when `df` is empty.
        df = df[df.user_loc != df.fr_loc]
    return df
def sample_with_pandas(df, N, random_state=None):
    """
    Return a sample of the available connections using the pandas
    ``DataFrame.sample()`` method, weighted by the normalised ``scaled_sci``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input connections. Must contain a ``scaled_sci`` column unless a
        ``norm_scaled_sci`` column is already present, in which case that
        column is used as the sampling weight as-is.
    N : int
        Number of rows to draw (without replacement).
    random_state : int, numpy.random.RandomState or None
        Forwarded to ``DataFrame.sample`` to make the draw reproducible.
        ``None`` (default) keeps the previous non-deterministic behaviour.

    Returns
    -------
    pd.DataFrame
        Selected edges; the weight column is exposed under the name
        ``weight``.
    """
    weighted = df
    if "norm_scaled_sci" not in df.columns:
        # `assign` works on a copy, so the caller's frame is left untouched
        # (the former in-place column insertion mutated the input).
        weighted = df.assign(norm_scaled_sci=df.scaled_sci / df.scaled_sci.sum())
    picked = weighted.sample(
        n=N, weights="norm_scaled_sci", random_state=random_state
    )
    return picked.rename(columns={"norm_scaled_sci": "weight"})
def to_edgelist(sample, encoder, weight=False):
    """
    Convert a sampled FB SCI dataframe into edge-list form.

    Parameters
    ----------
    sample : pandas.DataFrame
        Sampled connections with ``fr_loc``, ``user_loc``, ``scaled_sci``
        and ``weight`` columns.
    encoder : sklearn.preprocessing.LabelEncoder
        Encoder whose ``transform`` maps the location labels to their codes.
    weight : bool
        Keep (True) or drop (False) the ``weight`` column in the output.

    Returns
    -------
    pandas.DataFrame
        Copy of ``sample`` with encoded ``fr_loc`` / ``user_loc``, without
        ``scaled_sci`` and, when ``weight`` is False, without ``weight``.
    """
    edges = sample.copy()  # never touch the caller's frame
    for column in ("fr_loc", "user_loc"):
        edges[column] = encoder.transform(edges[column].values)

    unwanted = ["scaled_sci"] if weight else ["scaled_sci", "weight"]
    return edges.drop(columns=unwanted)
def load_edgelist(path, weighted=False, is_directed=False, sep=","):
    """
    Read a header-less edge-list file into a networkx graph.

    Parameters
    ----------
    path : str
        File to read; columns are interpreted as ``source``, ``target`` and,
        when ``weighted`` is True, ``weight``.
    weighted : bool
        If True, a third column is read and stored as the ``weight`` edge
        attribute.
    is_directed : bool
        Build a ``nx.DiGraph`` when True, otherwise a ``nx.Graph``.
    sep : str
        Field separator passed to ``pandas.read_csv``.

    Returns
    -------
    networkx.Graph or networkx.DiGraph
        Graph built from the edges in the file.
    """
    # NOTE(review): relies on a module-level `nx` (networkx); no such import
    # is visible in the reviewed part of the file -- confirm it exists.
    graph_type = nx.DiGraph() if is_directed else nx.Graph()

    column_names = ["source", "target"]
    extra_kwargs = {}
    if weighted:
        column_names.append("weight")
        extra_kwargs["edge_attr"] = "weight"

    edges = pd.read_csv(path, header=None, names=column_names, sep=sep)
    return nx.from_pandas_edgelist(edges, create_using=graph_type, **extra_kwargs)