Update

d4e8ff00 · Fize Jacques · f8b664ba · f8b664ba · f8b664ba · f8b664ba
Commit d4e8ff00 authored 4 years ago by Fize Jacques
--- a/FBScocialCorrectedness.ipynb
+++ b/FBScocialCorrectedness.ipynb
--- a/FB_Analysis2.ipynb
+++ b/FB_Analysis2.ipynb
--- a/draw_graph.ipynb
+++ b/draw_graph.ipynb
--- a/draw_graph_script.py
+++ b/draw_graph_script.py
@@ -5,37 +5,38 @@ import networkx as nx
 import pandas as pd
 import joblib
 import json
+import geopandas as gpd

 from lib.draw import draw

 parser = argparse.ArgumentParser()
 parser.add_argument("input_file",help="edgelist format (sep = \",\" )")
 parser.add_argument("output_file")
-parser.add_argument("--encoder-file",help="LabelEncoder instance that allows to obtain a label for each node")
 parser.add_argument("--country",help="if country node",action="store_true")
 parser.add_argument("-w",action="store_true")

 args = parser.parse_args()

-if args.w:
-    df = pd.read_csv(args.input_file,header=None,names="source target weight".split())
-    G = nx.from_pandas_edgelist(df,edge_attr="weight",create_using=nx.DiGraph())
-else:
-    df = pd.read_csv(args.input_file, header=None, names="source target".split())
-    G = nx.from_pandas_edgelist(df,create_using=nx.DiGraph())
+G = nx.read_gexf(args.input_file)

 encoder = None
 labels_dict = {}
+positions = {}

-if args.encoder_file:
-    encoder = joblib.load(args.encoder_file)
+if args.country:
+    iso2_name = json.load(open("data/ISO3166-1.alpha2.json.txt"))
+    world = gpd.read_file("data/TM_WORLD_BORDERS-0/TM_WORLD_BORDERS-0.3.shp")
+    world["centroid_c"] = world.centroid
+    iso2_togeom = dict(world["ISO2 centroid_c".split()].values)
+    positions = {k: [v.x, v.y] for k, v in iso2_togeom.items() if k in G}
+
+for node in list(G.nodes()):
    if args.country:
-        iso2_name = json.load(open("data/ISO3166-1.alpha2.json.txt"))
-    for node in list(G.nodes()):
-        if args.country:
-            labels_dict[node] = iso2_name[encoder.inverse_transform([node])[0]]
-        else:
-            labels_dict[node] = encoder.inverse_transform([node])[0]
-
-fig, ax = draw(G,labels_dict)
+        labels_dict[node] = iso2_name[node]
+    else:
+        labels_dict[node] = node
+
+fig, ax = draw(G,labels_dict,positions)
+if args.country:
+    world.boundary.plot(ax=ax)
 fig.savefig(args.output_file)
\ No newline at end of file
--- a/evalNE_script.py
+++ b/evalNE_script.py
@@ -2,6 +2,7 @@ from evalne.evaluation.evaluator import LPEvaluator
 from evalne.evaluation.split import EvalSplit as LPEvalSplit
 from evalne.evaluation.score import Scoresheet
 from evalne.utils import preprocess as pp
+import networkx as nx

 from lib.utils import load_edgelist

@@ -14,14 +15,14 @@ parser.add_argument("-v","--verbose",action="store_true")
 args = parser.parse_args()#("data/fb_country_country_sample_6_size1000.txt".split())

 # Load and preprocess the network
-G = load_edgelist(args.edgelist_graph_filename,is_directed=True,weighted=True)
+G = nx.read_gexf(args.edgelist_graph_filename)#load_edgelist(args.edgelist_graph_filename,is_directed=True,weighted=True)
 G, _ = pp.prep_graph(G,maincc=True)

+
 # Create an evaluator and generate train/test edge split
 traintest_split = LPEvalSplit()
 traintest_split.compute_splits(G,split_alg="spanning_tree",train_frac=0.8,fe_ratio=1)
 nee = LPEvaluator(traintest_split)
-
 # Create a Scoresheet to store the results
 scoresheet = Scoresheet()

@@ -31,8 +32,11 @@ methods = ['random_prediction',
    'jaccard_coefficient',
    "adamic_adar_index",
    "preferential_attachment",
-    "resource_allocation_index"
-    ]
+    "resource_allocation_index",
+    "stochastic_block_model",
+    "stochastic_block_model_edge_probs",
+    "stochastic_block_model_degree_corrected"
+           ]

 # Evaluate baselines
 for method in methods:
@@ -42,7 +46,7 @@ for method in methods:
 try:
    # Check if OpenNE is installed
    import openne
-
+    a=0/0
    # Set embedding methods from OpenNE
    methods = "node2vec hope-opne gf sdne deepWalk line grarep".split() #lap-opne
    commands = [
@@ -64,7 +68,7 @@ try:
                                   edge_embedding_methods=edge_emb, input_delim=' ', output_delim=' ',  verbose=args.verbose)
        scoresheet.log_results(results)

-except ImportError:
+except Exception:
    print("The OpenNE library is not installed. Reporting results only for the baselines...")
    pass


--- a/explore_fb_dataset.ipynb
+++ b/explore_fb_dataset.ipynb
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import networkx as nx\n",
-    "import pandas as pd\n",
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = pd.read_csv(\"data/gadm1_nuts2_gadm1_nuts2_aug2020.tsv\",sep=\"\\t\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>user_loc</th>\n",
-       "      <th>fr_loc</th>\n",
-       "      <th>scaled_sci</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>ABW</td>\n",
-       "      <td>ABW</td>\n",
-       "      <td>13297827</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>ABW</td>\n",
-       "      <td>AGO1</td>\n",
-       "      <td>29</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>ABW</td>\n",
-       "      <td>AGO10</td>\n",
-       "      <td>54</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>ABW</td>\n",
-       "      <td>AGO11</td>\n",
-       "      <td>41</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>ABW</td>\n",
-       "      <td>AGO12</td>\n",
-       "      <td>42</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5978020</th>\n",
-       "      <td>ZWE9</td>\n",
-       "      <td>ZWE5</td>\n",
-       "      <td>491990</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5978021</th>\n",
-       "      <td>ZWE9</td>\n",
-       "      <td>ZWE6</td>\n",
-       "      <td>524119</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5978022</th>\n",
-       "      <td>ZWE9</td>\n",
-       "      <td>ZWE7</td>\n",
-       "      <td>929477</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5978023</th>\n",
-       "      <td>ZWE9</td>\n",
-       "      <td>ZWE8</td>\n",
-       "      <td>966771</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5978024</th>\n",
-       "      <td>ZWE9</td>\n",
-       "      <td>ZWE9</td>\n",
-       "      <td>16951824</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>5978025 rows × 3 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "        user_loc fr_loc  scaled_sci\n",
-       "0            ABW    ABW    13297827\n",
-       "1            ABW   AGO1          29\n",
-       "2            ABW  AGO10          54\n",
-       "3            ABW  AGO11          41\n",
-       "4            ABW  AGO12          42\n",
-       "...          ...    ...         ...\n",
-       "5978020     ZWE9   ZWE5      491990\n",
-       "5978021     ZWE9   ZWE6      524119\n",
-       "5978022     ZWE9   ZWE7      929477\n",
-       "5978023     ZWE9   ZWE8      966771\n",
-       "5978024     ZWE9   ZWE9    16951824\n",
-       "\n",
-       "[5978025 rows x 3 columns]"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>key</th>\n",
-       "      <th>level</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>ASM</td>\n",
-       "      <td>country</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>AND</td>\n",
-       "      <td>country</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>ATG</td>\n",
-       "      <td>country</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>ABW</td>\n",
-       "      <td>country</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>BHS</td>\n",
-       "      <td>country</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2440</th>\n",
-       "      <td>TR42</td>\n",
-       "      <td>nuts2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2441</th>\n",
-       "      <td>TR51</td>\n",
-       "      <td>nuts2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2442</th>\n",
-       "      <td>TR52</td>\n",
-       "      <td>nuts2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2443</th>\n",
-       "      <td>TR10</td>\n",
-       "      <td>nuts2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2444</th>\n",
-       "      <td>TR32</td>\n",
-       "      <td>nuts2</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>2445 rows × 2 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       key    level\n",
-       "0      ASM  country\n",
-       "1      AND  country\n",
-       "2      ATG  country\n",
-       "3      ABW  country\n",
-       "4      BHS  country\n",
-       "...    ...      ...\n",
-       "2440  TR42    nuts2\n",
-       "2441  TR51    nuts2\n",
-       "2442  TR52    nuts2\n",
-       "2443  TR10    nuts2\n",
-       "2444  TR32    nuts2\n",
-       "\n",
-       "[2445 rows x 2 columns]"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "level_df = pd.read_csv(\"data/gadm1_nuts2_levels.csv\")\n",
-    "level_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>key</th>\n",
-       "      <th>level</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>ABW</td>\n",
-       "      <td>country</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   key    level\n",
-       "3  ABW  country"
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "level_df[level_df.key == \"ABW\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>NUTS_ID</th>\n",
-       "      <th>LEVL_CODE</th>\n",
-       "      <th>CNTR_CODE</th>\n",
-       "      <th>NAME_LATN</th>\n",
-       "      <th>NUTS_NAME</th>\n",
-       "      <th>MOUNT_TYPE</th>\n",
-       "      <th>URBN_TYPE</th>\n",
-       "      <th>COAST_TYPE</th>\n",
-       "      <th>FID</th>\n",
-       "      <th>geometry</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>DE50</td>\n",
-       "      <td>DE50</td>\n",
-       "      <td>2</td>\n",
-       "      <td>DE</td>\n",
-       "      <td>Bremen</td>\n",
-       "      <td>Bremen</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>DE50</td>\n",
-       "      <td>MULTIPOLYGON (((4248229.070 3323043.884, 42345...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>DE60</td>\n",
-       "      <td>DE60</td>\n",
-       "      <td>2</td>\n",
-       "      <td>DE</td>\n",
-       "      <td>Hamburg</td>\n",
-       "      <td>Hamburg</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>DE60</td>\n",
-       "      <td>MULTIPOLYGON (((4336708.861 3376535.119, 43414...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>DE71</td>\n",
-       "      <td>DE71</td>\n",
-       "      <td>2</td>\n",
-       "      <td>DE</td>\n",
-       "      <td>Darmstadt</td>\n",
-       "      <td>Darmstadt</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>DE71</td>\n",
-       "      <td>POLYGON ((4253056.068 3043343.224, 4257541.935...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>DE72</td>\n",
-       "      <td>DE72</td>\n",
-       "      <td>2</td>\n",
-       "      <td>DE</td>\n",
-       "      <td>Gießen</td>\n",
-       "      <td>Gießen</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>DE72</td>\n",
-       "      <td>POLYGON ((4248924.963 3092384.236, 4258523.883...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>DE73</td>\n",
-       "      <td>DE73</td>\n",
-       "      <td>2</td>\n",
-       "      <td>DE</td>\n",
-       "      <td>Kassel</td>\n",
-       "      <td>Kassel</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>DE73</td>\n",
-       "      <td>POLYGON ((4299188.570 3163540.672, 4298283.911...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>329</th>\n",
-       "      <td>HR06</td>\n",
-       "      <td>HR06</td>\n",
-       "      <td>2</td>\n",
-       "      <td>HR</td>\n",
-       "      <td>Sjeverna Hrvatska</td>\n",
-       "      <td>Sjeverna Hrvatska</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>HR06</td>\n",
-       "      <td>POLYGON ((4885838.460 2569452.540, 4878828.590...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>330</th>\n",
-       "      <td>NO02</td>\n",
-       "      <td>NO02</td>\n",
-       "      <td>2</td>\n",
-       "      <td>NO</td>\n",
-       "      <td>Innlandet</td>\n",
-       "      <td>Innlandet</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NO02</td>\n",
-       "      <td>POLYGON ((4438332.480 4360687.112, 4440904.728...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>331</th>\n",
-       "      <td>NO06</td>\n",
-       "      <td>NO06</td>\n",
-       "      <td>2</td>\n",
-       "      <td>NO</td>\n",
-       "      <td>Trøndelag</td>\n",
-       "      <td>Trøndelag</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NO06</td>\n",
-       "      <td>MULTIPOLYGON (((4414585.206 4664076.456, 44179...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>332</th>\n",
-       "      <td>NO07</td>\n",
-       "      <td>NO07</td>\n",
-       "      <td>2</td>\n",
-       "      <td>NO</td>\n",
-       "      <td>Nord-Norge</td>\n",
-       "      <td>Nord-Norge</td>\n",
-       "      <td>0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NO07</td>\n",
-       "      <td>MULTIPOLYGON (((5073773.420 5207018.495, 50676...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>333</th>\n",
-       "      <td>NO08</td>\n",
-       "      <td>NO08</td>\n",
-       "      <td>2</td>\n",
-       "      <td>NO</td>\n",
-       "      <td>Oslo og Viken</td>\n",
-       "      <td>Oslo og Viken</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NO08</td>\n",
-       "      <td>POLYGON ((4424393.606 4083582.648, 4429455.154...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>334 rows × 11 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       id NUTS_ID  LEVL_CODE CNTR_CODE          NAME_LATN          NUTS_NAME  \\\n",
-       "0    DE50    DE50          2        DE             Bremen             Bremen   \n",
-       "1    DE60    DE60          2        DE            Hamburg            Hamburg   \n",
-       "2    DE71    DE71          2        DE          Darmstadt          Darmstadt   \n",
-       "3    DE72    DE72          2        DE             Gießen             Gießen   \n",
-       "4    DE73    DE73          2        DE             Kassel             Kassel   \n",
-       "..    ...     ...        ...       ...                ...                ...   \n",
-       "329  HR06    HR06          2        HR  Sjeverna Hrvatska  Sjeverna Hrvatska   \n",
-       "330  NO02    NO02          2        NO          Innlandet          Innlandet   \n",
-       "331  NO06    NO06          2        NO          Trøndelag          Trøndelag   \n",
-       "332  NO07    NO07          2        NO         Nord-Norge         Nord-Norge   \n",
-       "333  NO08    NO08          2        NO      Oslo og Viken      Oslo og Viken   \n",
-       "\n",
-       "     MOUNT_TYPE  URBN_TYPE  COAST_TYPE   FID  \\\n",
-       "0             0        NaN         NaN  DE50   \n",
-       "1             0        NaN         NaN  DE60   \n",
-       "2             0        NaN         NaN  DE71   \n",
-       "3             0        NaN         NaN  DE72   \n",
-       "4             0        NaN         NaN  DE73   \n",
-       "..          ...        ...         ...   ...   \n",
-       "329           0        0.0         0.0  HR06   \n",
-       "330           0        NaN         NaN  NO02   \n",
-       "331           0        NaN         NaN  NO06   \n",
-       "332           0        NaN         NaN  NO07   \n",
-       "333           0        0.0         0.0  NO08   \n",
-       "\n",
-       "                                              geometry  \n",
-       "0    MULTIPOLYGON (((4248229.070 3323043.884, 42345...  \n",
-       "1    MULTIPOLYGON (((4336708.861 3376535.119, 43414...  \n",
-       "2    POLYGON ((4253056.068 3043343.224, 4257541.935...  \n",
-       "3    POLYGON ((4248924.963 3092384.236, 4258523.883...  \n",
-       "4    POLYGON ((4299188.570 3163540.672, 4298283.911...  \n",
-       "..                                                 ...  \n",
-       "329  POLYGON ((4885838.460 2569452.540, 4878828.590...  \n",
-       "330  POLYGON ((4438332.480 4360687.112, 4440904.728...  \n",
-       "331  MULTIPOLYGON (((4414585.206 4664076.456, 44179...  \n",
-       "332  MULTIPOLYGON (((5073773.420 5207018.495, 50676...  \n",
-       "333  POLYGON ((4424393.606 4083582.648, 4429455.154...  \n",
-       "\n",
-       "[334 rows x 11 columns]"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import geopandas as gpd\n",
-    "gdf = gpd.read_file(\"data/ref-nuts-2021-10m/NUTS_RG_10M_2021_3035_LEVL_2.geojson\")\n",
-    "gdf"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "<class 'geopandas.geodataframe.GeoDataFrame'>\n",
-      "RangeIndex: 334 entries, 0 to 333\n",
-      "Data columns (total 11 columns):\n",
-      " #   Column      Non-Null Count  Dtype   \n",
-      "---  ------      --------------  -----   \n",
-      " 0   id          334 non-null    object  \n",
-      " 1   NUTS_ID     334 non-null    object  \n",
-      " 2   LEVL_CODE   334 non-null    int64   \n",
-      " 3   CNTR_CODE   334 non-null    object  \n",
-      " 4   NAME_LATN   334 non-null    object  \n",
-      " 5   NUTS_NAME   334 non-null    object  \n",
-      " 6   MOUNT_TYPE  334 non-null    int64   \n",
-      " 7   URBN_TYPE   7 non-null      float64 \n",
-      " 8   COAST_TYPE  7 non-null      float64 \n",
-      " 9   FID         334 non-null    object  \n",
-      " 10  geometry    334 non-null    geometry\n",
-      "dtypes: float64(2), geometry(1), int64(2), object(6)\n",
-      "memory usage: 28.8+ KB\n"
-     ]
-    }
-   ],
-   "source": [
-    "gdf.info()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python (my_env)",
-   "language": "python",
-   "name": "my_env"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
-%% Cell type:code id: tags:
-
-``` python
-import networkx as nx
-import pandas as pd
-import numpy as np
-```
-
-%% Cell type:code id: tags:
-
-``` python
-df = pd.read_csv("data/gadm1_nuts2_gadm1_nuts2_aug2020.tsv",sep="\t")
-```
-
-%% Cell type:code id: tags:
-
-``` python
-df
-```
-
-%% Output
-
-            user_loc fr_loc  scaled_sci
-    0            ABW    ABW    13297827
-    1            ABW   AGO1          29
-    2            ABW  AGO10          54
-    3            ABW  AGO11          41
-    4            ABW  AGO12          42
-    ...          ...    ...         ...
-    5978020     ZWE9   ZWE5      491990
-    5978021     ZWE9   ZWE6      524119
-    5978022     ZWE9   ZWE7      929477
-    5978023     ZWE9   ZWE8      966771
-    5978024     ZWE9   ZWE9    16951824
-    
-    [5978025 rows x 3 columns]
-
-%% Cell type:code id: tags:
-
-``` python
-level_df = pd.read_csv("data/gadm1_nuts2_levels.csv")
-level_df
-```
-
-%% Output
-
-           key    level
-    0      ASM  country
-    1      AND  country
-    2      ATG  country
-    3      ABW  country
-    4      BHS  country
-    ...    ...      ...
-    2440  TR42    nuts2
-    2441  TR51    nuts2
-    2442  TR52    nuts2
-    2443  TR10    nuts2
-    2444  TR32    nuts2
-    
-    [2445 rows x 2 columns]
-
-%% Cell type:code id: tags:
-
-``` python
-level_df[level_df.key == "ABW"]
-```
-
-%% Output
-
-       key    level
-    3  ABW  country
-
-%% Cell type:code id: tags:
-
-``` python
-import geopandas as gpd
-gdf = gpd.read_file("data/ref-nuts-2021-10m/NUTS_RG_10M_2021_3035_LEVL_2.geojson")
-gdf
-```
-
-%% Output
-
-           id NUTS_ID  LEVL_CODE CNTR_CODE          NAME_LATN          NUTS_NAME  \
-    0    DE50    DE50          2        DE             Bremen             Bremen
-    1    DE60    DE60          2        DE            Hamburg            Hamburg
-    2    DE71    DE71          2        DE          Darmstadt          Darmstadt
-    3    DE72    DE72          2        DE             Gießen             Gießen
-    4    DE73    DE73          2        DE             Kassel             Kassel
-    ..    ...     ...        ...       ...                ...                ...
-    329  HR06    HR06          2        HR  Sjeverna Hrvatska  Sjeverna Hrvatska
-    330  NO02    NO02          2        NO          Innlandet          Innlandet
-    331  NO06    NO06          2        NO          Trøndelag          Trøndelag
-    332  NO07    NO07          2        NO         Nord-Norge         Nord-Norge
-    333  NO08    NO08          2        NO      Oslo og Viken      Oslo og Viken
-    
-         MOUNT_TYPE  URBN_TYPE  COAST_TYPE   FID  \
-    0             0        NaN         NaN  DE50
-    1             0        NaN         NaN  DE60
-    2             0        NaN         NaN  DE71
-    3             0        NaN         NaN  DE72
-    4             0        NaN         NaN  DE73
-    ..          ...        ...         ...   ...
-    329           0        0.0         0.0  HR06
-    330           0        NaN         NaN  NO02
-    331           0        NaN         NaN  NO06
-    332           0        NaN         NaN  NO07
-    333           0        0.0         0.0  NO08
-    
-                                                  geometry
-    0    MULTIPOLYGON (((4248229.070 3323043.884, 42345...
-    1    MULTIPOLYGON (((4336708.861 3376535.119, 43414...
-    2    POLYGON ((4253056.068 3043343.224, 4257541.935...
-    3    POLYGON ((4248924.963 3092384.236, 4258523.883...
-    4    POLYGON ((4299188.570 3163540.672, 4298283.911...
-    ..                                                 ...
-    329  POLYGON ((4885838.460 2569452.540, 4878828.590...
-    330  POLYGON ((4438332.480 4360687.112, 4440904.728...
-    331  MULTIPOLYGON (((4414585.206 4664076.456, 44179...
-    332  MULTIPOLYGON (((5073773.420 5207018.495, 50676...
-    333  POLYGON ((4424393.606 4083582.648, 4429455.154...
-    
-    [334 rows x 11 columns]
-
-%% Cell type:code id: tags:
-
-``` python
-gdf.info()
-```
-
-%% Output
-
-    <class 'geopandas.geodataframe.GeoDataFrame'>
-    RangeIndex: 334 entries, 0 to 333
-    Data columns (total 11 columns):
-     #   Column      Non-Null Count  Dtype
-    ---  ------      --------------  -----
-     0   id          334 non-null    object
-     1   NUTS_ID     334 non-null    object
-     2   LEVL_CODE   334 non-null    int64
-     3   CNTR_CODE   334 non-null    object
-     4   NAME_LATN   334 non-null    object
-     5   NUTS_NAME   334 non-null    object
-     6   MOUNT_TYPE  334 non-null    int64
-     7   URBN_TYPE   7 non-null      float64
-     8   COAST_TYPE  7 non-null      float64
-     9   FID         334 non-null    object
-     10  geometry    334 non-null    geometry
-    dtypes: float64(2), geometry(1), int64(2), object(6)
-    memory usage: 28.8+ KB
-
-%% Cell type:code id: tags:
-
-``` python
-```
--- a/generate_graph.ipynb
+++ b/generate_graph.ipynb
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%load_ext autoreload\n",
-    "%autoreload 2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from utils import load_country_country_data,sample_with_pandas,to_edgelist\n",
-    "from joblib import dump\n",
-    "from sklearn.preprocessing import LabelEncoder\n",
-    "\n",
-    "df = load_country_country_data(\"data/country_country_aug2020.tsv\")\n",
-    "df[\"norm_scaled_sci\"] = df.scaled_sci/df.scaled_sci.sum()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!mkdir data/graph_second_s"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "encoder = LabelEncoder()\n",
-    "encoder.fit(np.concatenate((df.user_loc.values,df.fr_loc.values)))\n",
-    "for i in range(10):\n",
-    "    for size in [50,100,200,500,1000]:\n",
-    "        test = sample_with_pandas(df,size)\n",
-    "        to_edgelist(test,encoder,weight=True).to_csv(\"data/graph_second_s/fb_country_country_sample_{0}_size{1}.txt\".format(i,size),index=False,header= False,sep=\",\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['data/graph_second_s/encoder.joblib']"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dump(encoder,\"data/graph_second_s/encoder.joblib\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python (my_env)",
-   "language": "python",
-   "name": "my_env"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
-%% Cell type:code id: tags:
-
-``` python
-%load_ext autoreload
-%autoreload 2
-```
-
-%% Cell type:code id: tags:
-
-``` python
-import numpy as np
-```
-
-%% Cell type:code id: tags:
-
-``` python
-from utils import load_country_country_data,sample_with_pandas,to_edgelist
-from joblib import dump
-from sklearn.preprocessing import LabelEncoder
-
-df = load_country_country_data("data/country_country_aug2020.tsv")
-df["norm_scaled_sci"] = df.scaled_sci/df.scaled_sci.sum()
-```
-
-%% Cell type:code id: tags:
-
-``` python
-!mkdir data/graph_second_s
-```
-
-%% Cell type:code id: tags:
-
-``` python
-encoder = LabelEncoder()
-encoder.fit(np.concatenate((df.user_loc.values,df.fr_loc.values)))
-for i in range(10):
-    for size in [50,100,200,500,1000]:
-        test = sample_with_pandas(df,size)
-        to_edgelist(test,encoder,weight=True).to_csv("data/graph_second_s/fb_country_country_sample_{0}_size{1}.txt".format(i,size),index=False,header= False,sep=",")
-```
-
-%% Cell type:code id: tags:
-
-``` python
-dump(encoder,"data/graph_second_s/encoder.joblib")
-```
-
-%% Output
-
-    ['data/graph_second_s/encoder.joblib']
-
-%% Cell type:code id: tags:
-
-``` python
-```
--- a/generate_random_graph.py
+++ b/generate_random_graph.py
 import numpy as np

 from joblib import dump
+import networkx as nx
 from sklearn.preprocessing import LabelEncoder

 from lib.utils import load_country_country_data, sample_with_pandas, to_edgelist
@@ -24,7 +25,8 @@ if not os.path.exists(args.output_dir):

 # Load the data
 df = load_country_country_data(args.input_tsv,self_link=args.self_link)
-
+df["hash"] = df.apply(lambda row:"_".join(sorted([row.user_loc,row.fr_loc])),axis=1)
+df = df.drop_duplicates(subset=['hash'])
 # Normalise the sci index
 df["norm_scaled_sci"] = df.scaled_sci/df.scaled_sci.sum()

@@ -33,9 +35,11 @@ encoder.fit(np.concatenate((df.user_loc.values,df.fr_loc.values)))

 for i in range(args.n): # For a number of graph
    for size in args.dimensions: # Per size
-        test = sample_with_pandas(df,size) # sample edges using the normalised FB social interconnectedness index
-        output_df = to_edgelist(test,encoder,weight=True) # Parse to edgelist format
-        output_df.to_csv(args.output_dir + "/fb_country_country_sample_{0}_size{1}.txt".format(i,size),index=False,header= False,sep=",") # Save the output
+        test = sample_with_pandas(df,size) # sample edges using the normalised FB social interconnectedness index
+        G = nx.from_pandas_edgelist(test, source="user_loc",target="fr_loc", edge_attr="weight", create_using=nx.Graph())
+        nx.write_gexf(G,args.output_dir + "/fb_country_country_sample_{0}_size{1}.gexf".format(i, size))
+        #output_df = to_edgelist(test,encoder,weight=True) # Parse to edgelist format
+        #output_df.to_csv(args.output_dir + "/fb_country_country_sample_{0}_size{1}.txt".format(i,size),index=False,header= False,sep=",") # Save the output

 # Save encoder to reverse the label transformation
-dump(encoder,args.output_dir + "/encoder.joblib")
\ No newline at end of file
+#dump(encoder,args.output_dir + "/encoder.joblib")
\ No newline at end of file
--- a/lib/draw.py
+++ b/lib/draw.py
@@ -49,7 +49,7 @@ def get_force_atlas(weight_influence=0, scaling_ratio=3.0, gravity=5):
    return forceatlas2


-def draw(G, labels_dict={}, iteration_force_atlase=2000, figsize=(40, 20), font_size=12, stroke_width=3,
+def draw(G, labels_dict={}, positions = {}, iteration_force_atlase=2000, figsize=(40, 20), font_size=12, stroke_width=3,
         stroke_color="black", font_color="white", edge_cmap=plt.cm.viridis, weight=True):
    """
    Return a figure of a NetworkX graph
@@ -82,10 +82,11 @@ def draw(G, labels_dict={}, iteration_force_atlase=2000, figsize=(40, 20), font_
    plt.gcf()  # Clean previous figure associated with the 'plt' instance

    # Compute node position using the Force Atlas algorithm
-    force_atlas = get_force_atlas()
-    positions = force_atlas.forceatlas2_networkx_layout(G,
-                                                        pos=None,
-                                                        iterations=iteration_force_atlase)
+    if not positions:
+        force_atlas = get_force_atlas()
+        positions = force_atlas.forceatlas2_networkx_layout(G,
+                                                            pos=None,
+                                                            iterations=iteration_force_atlase)
    # Initialise the figure canvas
    fig, ax = plt.subplots(1, figsize=figsize)


--- a/lib/utils.py
+++ b/lib/utils.py
@@ -19,8 +19,7 @@ def load_country_country_data(filename, self_link=False):
    pandas.Dataframe
        data
    """
-    df = pd.read_csv(filename, sep="\t")
-    df = df[(~df.user_loc.isna()) & (~df.fr_loc.isna())]
+    df = pd.read_csv(filename, sep="\t").fillna("NA")
    ign = ["CW", "XK"]  #  No coords for these two countries ... got to investigate!
    df = df[(~df.user_loc.isin(ign)) & (~df.fr_loc.isin(ign))]
    if not self_link:

--- a/run_eval.py
+++ b/run_eval.py
@@ -7,6 +7,7 @@ from lib.utils import load_edgelist
 import os
 import pandas as pd
 from tqdm import tqdm
+import networkx as nx

 import argparse

@@ -15,7 +16,7 @@ parser.add_argument("dataset_dir")
 parser.add_argument("output_filename")

 args = parser.parse_args()
-fns = glob.glob(args.dataset_dir + "/*.txt")
+fns = glob.glob(args.dataset_dir + "/*.gexf")

 all_res = []
 for fn in tqdm(fns):
@@ -27,8 +28,13 @@ for fn in tqdm(fns):
        continue
    df_results = parse_evalne_output(open(fn + "_results_lp").read())
    name = os.path.basename(fn)
-    edge_len = len(pd.read_csv(fn, sep="\t", header=None))
-    df_results["nb_edge"] = edge_len
+    G  = nx.read_gexf(fn)
+    top10node = pd.DataFrame(list(G.degree()), columns="node degree".split()).sort_values("degree",ascending=False).head(10).node.values
+    df_results["nb_edge"] = len(list(G.edges()))
+    df_results["transitivity"] = nx.transitivity(G)
+    df_results["density"] = nx.density(G)
+    df_results["top10_node"] = "|".join(top10node)
+    df_results["size"] = len(G)
    df_results["filename"] = name
    all_res.append(df_results)