Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
def load_country_country_data(filename, self_link=False):
    """
    Load a tab-separated country-to-country connection table and filter it.

    Rows are discarded when ``user_loc`` or ``fr_loc`` is missing, or when
    either of them is one of the ignored country codes.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pandas.read_csv``. The data must provide
        ``user_loc`` and ``fr_loc`` columns.
    self_link : bool, optional
        If False (default), rows whose ``user_loc`` equals ``fr_loc`` are
        removed as well.

    Returns
    -------
    pd.DataFrame
        The remaining rows, keeping their original index labels and columns.
    """
    df = pd.read_csv(filename, sep="\t")

    # NOTE(review): with pandas' default NA handling the literal string "NA"
    # is parsed as a missing value, so a location coded "NA" would be dropped
    # by the missing-value filter below -- confirm this is intended.
    missing = df["user_loc"].isna() | df["fr_loc"].isna()

    # TODO: no coordinates are available for these two codes yet; investigate.
    ignored = ["CW", "XK"]
    ignored_rows = df["user_loc"].isin(ignored) | df["fr_loc"].isin(ignored)

    keep = ~(missing | ignored_rows)
    if not self_link:
        # Vectorized comparison instead of a row-wise apply: same result on
        # non-empty data, much faster, and well-defined on an empty frame.
        keep &= df["user_loc"] != df["fr_loc"]
    return df[keep]
def sample_with_pandas(df, N):
    """
    Return a sample of the available connections using ``DataFrame.sample``.

    Rows are drawn using the ``norm_scaled_sci`` column as sampling weights.
    When that column is absent it is computed as
    ``scaled_sci / scaled_sci.sum()``. The input frame is never modified.

    Parameters
    ----------
    df : pd.DataFrame
        Available connections. Must contain a ``scaled_sci`` column, or an
        already present ``norm_scaled_sci`` column.
    N : int
        Number of rows to draw.

    Returns
    -------
    pd.DataFrame
        Selected edges, with the sampling weight exposed as a ``weight``
        column (renamed from ``norm_scaled_sci``).
    """
    if "norm_scaled_sci" not in df.columns:
        # assign() builds a new frame, so the caller's DataFrame is not
        # mutated (and no SettingWithCopyWarning is raised on filtered slices).
        df = df.assign(norm_scaled_sci=df["scaled_sci"] / df["scaled_sci"].sum())
    drawn = df.sample(n=N, weights="norm_scaled_sci")
    return drawn.rename(columns={"norm_scaled_sci": "weight"})
def to_edgelist(sample, encoder, weight=False):
    """
    Turn a sampled connection table into an encoded edge list.

    Parameters
    ----------
    sample : pd.DataFrame
        Sampled connections with ``fr_loc``, ``user_loc``, ``scaled_sci``
        and (unless ``weight`` is True) ``weight`` columns.
    encoder : object
        Object whose ``transform`` method maps location values to encoded
        values (the file imports sklearn's ``LabelEncoder`` -- presumably
        that is what callers pass).
    weight : bool, optional
        If False (default), the ``weight`` column is removed from the result.

    Returns
    -------
    pd.DataFrame
        A copy of ``sample`` with encoded ``fr_loc`` / ``user_loc`` columns
        and without ``scaled_sci``; the input frame is left untouched.
    """
    edges = sample.copy()

    encoded_friend = encoder.transform(edges.fr_loc.values)
    edges["fr_loc"] = encoded_friend
    encoded_user = encoder.transform(edges.user_loc.values)
    edges["user_loc"] = encoded_user

    # The raw score is always removed; the weight only when not requested.
    unwanted = ["scaled_sci"] if weight else ["scaled_sci", "weight"]
    return edges.drop(columns=unwanted)