Commit 0ca7ed7f authored by yacinetouahria

final push

parent 989d2e17
%% Cell type:code id:8620fcf3 tags:
``` python
from Ghypeddings import *
import pandas as pd
import numpy as np
import pickle
```
%% Cell type:code id:f9f909f0 tags:
``` python
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
```
%% Cell type:code id:2ffdf671 tags:
``` python
data = np.genfromtxt('hgcae_unsw_nb_5_embeddings_euc.csv', delimiter=',')
X = data[:,:-1]
y = data[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
```
%% Cell type:code id:0024ea51 tags:
``` python
def best_score_params(estimator, params):
    grid_search = GridSearchCV(estimator=estimator, param_grid=params, cv=5)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print(accuracy, f1, recall, precision, roc_auc)
    cv_results = grid_search.cv_results_
    df = pd.DataFrame(cv_results)
    return df
```
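%% Cell type:markdown id:1a2b3c4d tags:
A minimal usage sketch (not in the original notebook): grid-searching a random-forest classifier with the helper above. The estimator and parameter grid here are illustrative assumptions, not part of the original experiments.
%% Cell type:code id:5e6f7a8b tags:
``` python
# Hypothetical example: any sklearn estimator and matching grid would work here.
from sklearn.ensemble import RandomForestClassifier

param_grid = {'n_estimators': [50, 100], 'max_depth': [5, 10]}
cv_df = best_score_params(RandomForestClassifier(random_state=0), param_grid)
cv_df.head()
```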
%% Cell type:code id:d1333b28 tags:
``` python
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
def calculate_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_pred)
    return acc, f1, rec, pre, roc
```
%% Cell type:code id:efe4163c tags:
``` python
from sklearn.cluster import KMeans
def kmeans(X, y, n_clusters, outlier_percentage=.1):
    model = KMeans(n_clusters=n_clusters)
    model.fit(X)
    y_pred = model.predict(X)
    # distance of each point to its nearest centroid
    distances = model.transform(X).min(axis=1)
    # flag the farthest `outlier_percentage` of points as outliers
    threshold = np.percentile(distances, 100 * (1 - outlier_percentage))
    outliers = distances > threshold
    return calculate_metrics(y, outliers)

print('2:')
print(kmeans(X, y, 2))
print('3:')
print(kmeans(X, y, 3))
print('5:')
print(kmeans(X, y, 5))
print('10:')
print(kmeans(X, y, 10))
print('20:')
print(kmeans(X, y, 20))
print('40:')
print(kmeans(X, y, 40))
print('50:')
print(kmeans(X, y, 50))
```
%% Output
2:
(0.9492, 0.746, 0.746, 0.746, 0.8588888888888888)
3:
(0.9288, 0.644, 0.644, 0.644, 0.8022222222222223)
5:
(0.9548, 0.774, 0.774, 0.774, 0.8744444444444445)
10:
(0.932, 0.66, 0.66, 0.66, 0.8111111111111111)
20:
(0.9428, 0.714, 0.714, 0.714, 0.841111111111111)
40:
(0.9448, 0.724, 0.724, 0.724, 0.8466666666666667)
50:
(0.9412, 0.706, 0.706, 0.706, 0.8366666666666667)
%% Cell type:markdown id:170e9b21 tags:
# The rest
%% Cell type:code id:da99d4c2 tags:
``` python
from sklearn.cluster import DBSCAN
def dbscan(X, y, min_samples=5):
    model = DBSCAN(eps=0.1, min_samples=min_samples)
    labels = model.fit_predict(X)
    # DBSCAN labels noise points as -1; treat them as outliers
    outliers = labels == -1
    return calculate_metrics(y, outliers)

print('5:')
print(dbscan(X, y, min_samples=5))
print('10:')
print(dbscan(X, y, min_samples=10))
print('20:')
print(dbscan(X, y, min_samples=20))
print('50:')
print(dbscan(X, y, min_samples=50))
print('100:')
print(dbscan(X, y, min_samples=100))
```
%% Output
5:
(0.8908, 0.024999999999999998, 0.014, 0.11666666666666667, 0.5011111111111112)
10:
(0.8926, 0.1408, 0.088, 0.352, 0.535)
20:
(0.8868, 0.23097826086956522, 0.17, 0.3601694915254237, 0.5682222222222222)
50:
(0.8664, 0.2707423580786026, 0.248, 0.2980769230769231, 0.5915555555555555)
100:
(0.789, 0.23606082548877624, 0.326, 0.18501702610669693, 0.5832222222222222)
%% Cell type:code id:54cd340d tags:
``` python
from sklearn.ensemble import IsolationForest
def isolation_forest(X, y, anomalies_percentage=0.1):
    model = IsolationForest(contamination=anomalies_percentage)
    model.fit(X)
    y_pred = model.predict(X)
    # map IsolationForest's output (1 = inlier, -1 = outlier) to 0/1 labels
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    return calculate_metrics(y, y_pred)

isolation_forest(X, y)
```
%% Output
(0.8586, 0.2922922922922923, 0.292, 0.2925851703406814, 0.6067777777777777)
%% Cell type:code id:f63108dd tags:
``` python
from sklearn.cluster import KMeans
def kmeans(X, y, n_clusters, outlier_percentage=.1):
    model = KMeans(n_clusters=n_clusters)
    model.fit(X)
    y_pred = model.predict(X)
    # distance of each point to its nearest centroid
    distances = model.transform(X).min(axis=1)
    # flag the farthest `outlier_percentage` of points as outliers
    threshold = np.percentile(distances, 100 * (1 - outlier_percentage))
    outliers = distances > threshold
    return calculate_metrics(y, outliers)

print('2:')
print(kmeans(X, y, 2))
print('3:')
print(kmeans(X, y, 3))
print('5:')
print(kmeans(X, y, 5))
print('10:')
print(kmeans(X, y, 10))
print('20:')
print(kmeans(X, y, 20))
print('40:')
print(kmeans(X, y, 40))
print('50:')
print(kmeans(X, y, 50))
```
%% Output
2:
(0.8486, 0.24072216649949846, 0.24, 0.2414486921529175, 0.5781111111111111)
3:
(0.8476, 0.23647294589178355, 0.236, 0.23694779116465864, 0.5757777777777777)
5:
(0.8516, 0.258, 0.258, 0.258, 0.5877777777777777)
10:
(0.8248, 0.124, 0.124, 0.124, 0.5133333333333334)
20:
(0.8352, 0.176, 0.176, 0.176, 0.5422222222222222)
40:
(0.84, 0.20000000000000004, 0.2, 0.2, 0.5555555555555555)
50:
(0.842, 0.20999999999999996, 0.21, 0.21, 0.5611111111111111)
%% Cell type:code id:7abc7f6e tags:
``` python
from sklearn.neighbors import LocalOutlierFactor
def local_outlier_factor(X, y, n_neighbors=20, outlier_percentage=.1):
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=outlier_percentage)
    y_pred = lof.fit_predict(X)
    # map LOF's output (1 = inlier, -1 = outlier) to 0/1 labels
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    return calculate_metrics(y, y_pred)

print('5:')
print(local_outlier_factor(X, y, n_neighbors=5))
print('10:')
print(local_outlier_factor(X, y, n_neighbors=10))
print('15:')
print(local_outlier_factor(X, y, n_neighbors=15))
print('20:')
print(local_outlier_factor(X, y, n_neighbors=20))
print('30:')
print(local_outlier_factor(X, y, n_neighbors=30))
print('50:')
print(local_outlier_factor(X, y, n_neighbors=50))
```
%% Output
5:
(0.8272, 0.136, 0.136, 0.136, 0.52)
10:
(0.8272, 0.136, 0.136, 0.136, 0.52)
15:
(0.8272, 0.136, 0.136, 0.136, 0.52)
20
(0.83, 0.15, 0.15, 0.15, 0.5277777777777778)
30
(0.83, 0.15, 0.15, 0.15, 0.5277777777777778)
50
(0.8572, 0.286, 0.286, 0.286, 0.6033333333333333)
%% Cell type:code id:40d8f286 tags:
``` python
from sklearn.svm import OneClassSVM
def one_class_svm(X, y, kernel='rbf', nu=0.1):
    model = OneClassSVM(kernel=kernel, nu=nu)
    model.fit(X)
    y_pred = model.predict(X)
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    return calculate_metrics(y, y_pred)

print('rbf:')
print(one_class_svm(X, y, kernel='rbf'))
print('linear:')
print(one_class_svm(X, y, kernel='linear'))
print('poly:')
print(one_class_svm(X, y, kernel='poly'))
print('sigmoid:')
print(one_class_svm(X, y, kernel='sigmoid'))
```
%% Output
rbf:
(0.7028, 0.23638232271325796, 0.46, 0.1590594744121715, 0.594888888888889)
linear:
(0.1028, 0.17868912486268765, 0.976, 0.09834744054816606, 0.4908888888888889)
poly:
(0.1, 0.18181818181818182, 1.0, 0.1, 0.5)
sigmoid:
(0.424, 0.17383820998278832, 0.606, 0.10147354320160751, 0.5048888888888889)
%% Cell type:code id:bf96a516 tags:
``` python
import numpy as np
import os
import pandas as pd
```
%% Cell type:code id:3399745c tags:
``` python
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
def calculate_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_pred)
    return [round(acc, 4), round(f1, 4), round(rec, 4), round(pre, 4), round(roc, 4)]
```
%% Cell type:code id:f2790afd tags:
``` python
def group_clusters(y_true, y_pred):
    # count how often each (true label, cluster id) pair occurs
    occurrences = {}
    for k, v in zip(y_true, y_pred):
        pair = (int(k), v)
        occurrences[pair] = occurrences.get(pair, 0) + 1
    # visit pairs from most to least frequent and assign each cluster
    # to the true class it co-occurs with most
    a = sorted(occurrences.items(), key=lambda item: item[1], reverse=True)
    normal, attack = [], []
    for item in a:
        if item[0][1] not in normal and item[0][1] not in attack:
            if item[0][0] == 0:
                normal.append(item[0][1])
            else:
                attack.append(item[0][1])
    # build the binary labels from masks computed up front, so values
    # already mapped to 0/1 cannot be accidentally remapped by a later pass
    out = np.zeros_like(y_pred)
    for j in attack:
        out[y_pred == j] = 1
    return out
```
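%% Cell type:markdown id:9c0d1e2f tags:
A toy check of `group_clusters` (not in the original notebook): clusters 0 and 2 co-occur with attacks and cluster 1 with normal traffic, so the relabeling should map those cluster ids to 1, 0, and 1 respectively.
%% Cell type:code id:3f4e5d6c tags:
``` python
# Illustrative toy arrays, not drawn from the real datasets.
y_true_toy = np.array([1, 1, 0, 0, 1, 1])
y_pred_toy = np.array([0, 0, 1, 1, 2, 2])  # raw cluster ids
print(group_clusters(y_true_toy, y_pred_toy))  # expected: [1 1 0 0 1 1]
```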
%% Cell type:code id:49759d76 tags:
``` python
from sklearn.cluster import AgglomerativeClustering
def agglomerative_clustering(X, y, n_clusters=100):
    model = AgglomerativeClustering(n_clusters=n_clusters)
    labels = model.fit_predict(X)
    labels = group_clusters(y, labels)
    return calculate_metrics(y, labels)
```
%% Cell type:code id:b4bbd3f7 tags:
``` python
from sklearn.cluster import DBSCAN
def dbscan(X, y, eps=1e-2, min_samples=20):
    model = DBSCAN(eps=eps, min_samples=min_samples)
    y_pred = model.fit_predict(X)
    # drop noise points (-1) before matching clusters to classes
    mask = y_pred != -1
    y_true_filtered = y[mask]
    y_pred_filtered = y_pred[mask]
    y_pred_filtered = group_clusters(y_true_filtered, y_pred_filtered)
    return calculate_metrics(y_true_filtered, y_pred_filtered)
```
%% Cell type:code id:05d6a66a tags:
``` python
import skfuzzy as fuzz
def fuzzy_c_mean(X, y, n_clusters=10, power=2, error=0.01, maxiter=1000, init=None):
    # skfuzzy expects the data as (features, samples)
    X_transposed = np.transpose(X)
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(X_transposed, n_clusters, power, error=error, maxiter=maxiter, init=init)
    # hard-assign each sample to its highest-membership cluster
    y_pred = np.argmax(u, axis=0)
    y_pred = group_clusters(y, y_pred)
    return calculate_metrics(y, y_pred)
```
%% Cell type:code id:0c26e329 tags:
``` python
from sklearn.mixture import GaussianMixture
def gaussian_mixture(X, y, n_components=20):
    model = GaussianMixture(n_components=n_components)
    y_pred = model.fit_predict(X)
    y_pred = group_clusters(y, y_pred)
    return calculate_metrics(y, y_pred)
```
%% Cell type:code id:f4226733 tags:
``` python
from sklearn.cluster import KMeans
def kmeans(X, y, n_clusters=10, n_init=10):
    model = KMeans(n_clusters=n_clusters, n_init=n_init)
    model.fit(X)
    y_pred = model.labels_
    y_pred = group_clusters(y, y_pred)
    return calculate_metrics(y, y_pred)
```
%% Cell type:code id:6a07e67b tags:
``` python
from sklearn.cluster import MeanShift
def mean_shift(X, y):
    y_pred = MeanShift(n_jobs=-1, max_iter=10).fit_predict(X)
    y_pred = group_clusters(y, y_pred)
    return calculate_metrics(y, y_pred)
```
%% Cell type:code id:23bac324 tags:
``` python
datasets = ['awid3','bot_iot','ddos2019','darknet','ids2018','ton_iot','unsw_nb15']
models = ['hgcae','pvae']
dim = 20
```
%% Cell type:code id:15cd3d01 tags:
``` python
for model in models:
    for dataset in datasets:
        print(dataset, "--------------------------")
        file = os.path.join(os.getcwd(), 'embeddings', f'{model}_{dataset}_{dim}_embeddings_euc.csv')
        data = np.genfromtxt(file, delimiter=",", usemask=True)
        X = data[:, :-1]
        y = data[:, -1]
        results = []
        results.append(agglomerative_clustering(X, y))
        print('agglomerative', results[-1])
        results.append(dbscan(X, y))
        print('dbscan', results[-1])
        results.append(fuzzy_c_mean(X, y))
        print('fuzzy c mean', results[-1])
        results.append(gaussian_mixture(X, y))
        print('gaussian', results[-1])
        results.append(kmeans(X, y))
        print('kmeans', results[-1])
        results.append(mean_shift(X, y))
        print('mean shift', results[-1])
        df = pd.DataFrame(np.array(results))
        df.to_csv(f'{model}_{dataset}.csv', index=False)
```
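%% Cell type:markdown id:7b8a9c0d tags:
A small follow-up sketch (not in the original notebook): gather the per-run CSV files written above into one labeled summary table. The method names follow the append order in the loop; the metric names follow `calculate_metrics`.
%% Cell type:code id:2d3c4b5a tags:
``` python
# Assumes the result files produced by the loop above are on disk.
methods = ['agglomerative', 'dbscan', 'fuzzy_c_mean', 'gaussian_mixture', 'kmeans', 'mean_shift']
metrics = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc']
frames = []
for model in models:
    for dataset in datasets:
        res = pd.read_csv(f'{model}_{dataset}.csv')
        res.columns = metrics
        res.index = methods
        res['model'], res['dataset'] = model, dataset
        frames.append(res)
summary = pd.concat(frames)
summary
```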
%% Cell type:code id:ab2d406e tags:
``` python
dataset = 'ton_iot'
model = 'hgcae'
dim = 20
file = os.path.join(os.getcwd(), 'embeddings', f'{model}_{dataset}_{dim}_embeddings_euc.csv')
data = np.genfromtxt(file, delimiter=",", usemask=True)
X = data[:, :-1]
y = data[:, -1]
results = []
# c = [60, 70, 80]
# for i in c:
#     results.append(agglomerative_clustering(X, y, n_clusters=i))
#     print('agglomerative', results[-1])
# results.append(dbscan(X, y))
# print('dbscan', results[-1])
# c = [2, 5, 10, 20, 50, 70, 100, 150, 200]
# for i in c:
#     results.append(fuzzy_c_mean(X, y, n_clusters=i))
#     print('fuzzy c mean', results[-1])
# c = [2, 5, 10, 20, 50, 100, 200]
# for i in c:
#     results.append(gaussian_mixture(X, y, n_components=i))
#     print('gaussian', results[-1])
# results.append(kmeans(X, y))
# print('kmeans', results[-1])
# results.append(mean_shift(X, y))
# print('mean shift', results[-1])
%% Output
dbscan [0.4744, 0.6435, 1.0, 0.4744, 0.5]
# NOTE: this file computes the hyperbolicity of all the datasets
import numpy as np
from Ghypeddings.datasets.utils import hyperbolicity
from Ghypeddings.datasets.datasets import CIC_DDoS2019, AWID3, NF_CIC_IDS2018_v2, Darknet, NF_TON_IoT_v2, NF_BOT_IoT_v2, NF_UNSW_NB15_v2

datasets = [['ddos2019', CIC_DDoS2019],
            ['awid3', AWID3],
            ['ids2018', NF_CIC_IDS2018_v2],
            ['darknet', Darknet],
            ['ton_iot', NF_TON_IoT_v2],
            ['bot_iot', NF_BOT_IoT_v2],
            ['unsw_nb', NF_UNSW_NB15_v2]]

# average the hyperbolicity estimate over five sampled snapshots per dataset
for dataset in datasets:
    h_mean = []
    for i in range(5):
        adj, _, _ = dataset[1]().load_samples(repetition=0)
        h = hyperbolicity(adj, num_samples=10)
        h_mean.append(h)
    print(dataset[0], np.mean(h_mean))
# NOTE: place this .py file inside the folder containing all the files of a single dataset and run it
# to concatenate those files into one. This helps with sampling representative data.
# This is the first of the OTHER scripts to run after downloading the datasets.
import os
import pandas as pd

def run():
    directory = os.getcwd()
    files = [f for f in os.listdir(directory) if os.path.isfile(f) and '.py' not in f]
    frames = []
    for file in files:
        df = pd.read_csv(file, low_memory=False)
        frames.append(df)
    data = pd.concat(frames)
    # change the name of the dataset here
    path = os.path.join(directory, 'ton_iot.csv')
    data.to_csv(path, index=False)

if __name__ == "__main__":
    run()
In order to generate representative snapshots we had to reorganize the dataset files.
Execute the scripts in the following order (a driver sketch follows this list):
1- group attack: groups all the attack classes into a single file
2- group normal attack: merges the attack file with the normal file
3- the snapshot script corresponding to your dataset (one of the three provided)
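A minimal driver sketch of that order; the script filenames below are assumptions based on the descriptions above, so substitute the repository's actual names:

``` python
# Hypothetical driver: runs the three stages in sequence.
# The filenames are assumptions, not the repository's confirmed names.
import subprocess

for script in ['group_attack.py', 'group_normal_attack.py', 'snapshots_ton_iot.py']:
    subprocess.run(['python', script], check=True)
```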
# NOTE: This file generates the snapshots from the darknet dataset. It does everything from cleaning through data splitting.
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pickle

def _to_binary_classification(x):
    if 'Non' in x:
        return 0
    else:
        return 1

def _filling_adjacency_numpy(data):
    # connect two flows whenever they share an endpoint IP (source or destination)
    N = data.shape[0]
    try:
        adjacency = np.zeros((N, N), dtype=bool)
    except Exception as e:
        print(f"An error occurred: {e}")
    source_ips = data['Src IP'].to_numpy()
    destination_ips = data['Dst IP'].to_numpy()
    mask = ((source_ips[:, np.newaxis] == source_ips) |
            (source_ips[:, np.newaxis] == destination_ips) |
            (destination_ips[:, np.newaxis] == source_ips) |
            (destination_ips[:, np.newaxis] == destination_ips))
    adjacency[mask] = True
    return adjacency

def save_samples(adj, features, labels, adj_path, features_path, labels_path):
    with open(adj_path, 'wb') as f:
        pickle.dump(adj, f)
    with open(features_path, 'wb') as f:
        pickle.dump(features, f)
    with open(labels_path, 'wb') as f:
        pickle.dump(labels, f)

nnodes = 1000
overlap = 0.25
directory = os.path.join(os.getcwd(), f'darknet_snapshots_{int(overlap*100)}_{nnodes}')
os.makedirs(directory)

df = pd.read_csv('other/darknet/all.csv')
df.dropna(axis=0, inplace=True)
df = df.reset_index(drop=True)
df['Label'] = df['Label'].apply(_to_binary_classification)
columns_to_exclude = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp', 'Label', 'Label.1', 'Protocol', 'Flow Duration']
columns_to_normalize = [x for x in df.columns if x not in columns_to_exclude]
scaler = MinMaxScaler()
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

i = 0
j = 0
while i < df.shape[0]:
    print('snapshot:', j)
    if df.shape[0] > nnodes:
        data = df.iloc[:nnodes, :].copy()
        adj = _filling_adjacency_numpy(data)
        labels = data['Label'].to_numpy()
        data.drop(columns_to_exclude, axis=1, inplace=True)
        features = data.to_numpy()
        save_samples(adj, features, labels, os.path.join(directory, f'adjacency_{j}.pkl'), os.path.join(directory, f'features_{j}.pkl'), os.path.join(directory, f'labels_{j}.pkl'))
        j += 1
        i += nnodes
        # drop all but the trailing `overlap` fraction so consecutive snapshots overlap
        df.drop(range(nnodes - int(nnodes*overlap)), axis=0, inplace=True)
        df = df.reset_index(drop=True)
        print(np.sum(labels), len(labels) - np.sum(labels))
    else:
        # fewer than nnodes rows remain: build one last snapshot from what is left
        data = df.copy()
        adj = _filling_adjacency_numpy(data)
        labels = data['Label'].to_numpy()
        data.drop(columns_to_exclude, axis=1, inplace=True)
        features = data.to_numpy()
        save_samples(adj, features, labels, os.path.join(directory, f'adjacency_{j}.pkl'), os.path.join(directory, f'features_{j}.pkl'), os.path.join(directory, f'labels_{j}.pkl'))
        j += 1
        i += df.shape[0]
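
# Hypothetical sanity check (an addition, not in the original script): reload the
# first saved snapshot and confirm that adjacency, features, and labels agree in size.
with open(os.path.join(directory, 'adjacency_0.pkl'), 'rb') as f:
    adj0 = pickle.load(f)
with open(os.path.join(directory, 'features_0.pkl'), 'rb') as f:
    features0 = pickle.load(f)
with open(os.path.join(directory, 'labels_0.pkl'), 'rb') as f:
    labels0 = pickle.load(f)
assert adj0.shape == (features0.shape[0], features0.shape[0])
assert labels0.shape[0] == features0.shape[0]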
# NOTE: This file generates the snapshots from the ddos2019 dataset. It does everything from cleaning through data splitting.
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pickle

def _filling_adjacency_numpy(data):
    # connect two flows whenever they share an endpoint IP (source or destination)
    N = data.shape[0]
    try:
        adjacency = np.zeros((N, N), dtype=bool)
    except Exception as e:
        print(f"An error occurred: {e}")
    source_ips = data[' Source IP'].to_numpy()
    destination_ips = data[' Destination IP'].to_numpy()
    mask = ((source_ips[:, np.newaxis] == source_ips) |
            (source_ips[:, np.newaxis] == destination_ips) |
            (destination_ips[:, np.newaxis] == source_ips) |
            (destination_ips[:, np.newaxis] == destination_ips))
    adjacency[mask] = True
    return adjacency

def save_samples(adj, features, labels, adj_path, features_path, labels_path):
    with open(adj_path, 'wb') as f:
        pickle.dump(adj, f)
    with open(features_path, 'wb') as f:
        pickle.dump(features, f)
    with open(labels_path, 'wb') as f:
        pickle.dump(labels, f)

nnodes = 1000
overlap = 0.25
directory = os.path.join(os.getcwd(), f'ddos2019_snapshots_{int(overlap*100)}_{nnodes}')
os.makedirs(directory)

df = pd.read_csv('other/ddos/all.csv')
df.dropna(axis=0, inplace=True)
df = df.reset_index(drop=True)
# clamp +/- infinities to the finite extremes of each column
for column in df.columns:
    max_value = df.loc[df[column] != np.inf, column].max()
    min_value = df.loc[df[column] != -np.inf, column].min()
    df.loc[df[column] == np.inf, column] = max_value
    df.loc[df[column] == -np.inf, column] = min_value
columns_to_exclude = ['Flow ID', ' Source IP', ' Source Port', ' Destination Port', ' Flow Duration', ' Protocol', ' Destination IP', ' Timestamp', 'SimillarHTTP', ' Inbound', ' Label']
columns_to_normalize = [x for x in df.columns if x not in columns_to_exclude]
scaler = MinMaxScaler()
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

i = 0
j = 0
while i < df.shape[0]:
    print('snapshot:', j)
    if df.shape[0] > nnodes:
        data = df.iloc[:nnodes, :].copy()
        adj = _filling_adjacency_numpy(data)
        labels = data[' Label'].to_numpy()
        data.drop(columns_to_exclude, axis=1, inplace=True)
        features = data.to_numpy()
        save_samples(adj, features, labels, os.path.join(directory, f'adjacency_{j}.pkl'), os.path.join(directory, f'features_{j}.pkl'), os.path.join(directory, f'labels_{j}.pkl'))
        j += 1
        i += nnodes
        # drop all but the trailing `overlap` fraction so consecutive snapshots overlap
        df.drop(range(nnodes - int(nnodes*overlap)), axis=0, inplace=True)
        df = df.reset_index(drop=True)
        print(np.sum(labels), len(labels) - np.sum(labels))
    else:
        # fewer than nnodes rows remain: build one last snapshot from what is left
        data = df.copy()
        adj = _filling_adjacency_numpy(data)
        labels = data[' Label'].to_numpy()
        data.drop(columns_to_exclude, axis=1, inplace=True)
        features = data.to_numpy()
        save_samples(adj, features, labels, os.path.join(directory, f'adjacency_{j}.pkl'), os.path.join(directory, f'features_{j}.pkl'), os.path.join(directory, f'labels_{j}.pkl'))
        j += 1
        i += df.shape[0]
# NOTE: here we only have the script for the TON_IoT dataset, but the other datasets follow the same logic.
# It starts by extracting the attack classes and the normal behaviour, each sorted by timestamp within its class,
# and then repeatedly takes randomly sized chunks of rows from each category in turn.
import pandas as pd
import os
import random

def _to_binary_classification(x):
    if 'Non' in x:
        return 0
    else:
        return 1

backdoor = pd.read_csv(os.path.join('ton_iot', 'ton_iot_backdoor.csv'))
ddos = pd.read_csv(os.path.join('ton_iot', 'ton_iot_ddos.csv'))
dos = pd.read_csv(os.path.join('ton_iot', 'ton_iot_dos.csv'))
mitm = pd.read_csv(os.path.join('ton_iot', 'ton_iot_mitm.csv'))
password = pd.read_csv(os.path.join('ton_iot', 'ton_iot_password.csv'))
ransomware = pd.read_csv(os.path.join('ton_iot', 'ton_iot_ransomware.csv'))
scanning = pd.read_csv(os.path.join('ton_iot', 'ton_iot_scanning.csv'))
xss = pd.read_csv(os.path.join('ton_iot', 'ton_iot_xss.csv'))
injection = pd.read_csv(os.path.join('ton_iot', 'ton_iot_injection.csv'))
print('backdoor', backdoor.shape[0])
print('ddos', ddos.shape[0])
print('dos', dos.shape[0])
print('mitm', mitm.shape[0])
print('password', password.shape[0])
print('ransomware', ransomware.shape[0])
print('scanning', scanning.shape[0])
print('xss', xss.shape[0])
print('injection', injection.shape[0])

# first pass: interleave random chunks (1 to 10 rows) of every attack category into one attacks file
m = (backdoor.shape[0] + ddos.shape[0] + dos.shape[0] + mitm.shape[0] + password.shape[0]
     + ransomware.shape[0] + scanning.shape[0] + xss.shape[0] + injection.shape[0])
categories = {'backdoor': backdoor, 'ddos': ddos, 'dos': dos, 'mitm': mitm, 'password': password,
              'ransomware': ransomware, 'scanning': scanning, 'xss': xss, 'injection': injection}
attacks = pd.DataFrame()
i = 0
while i < m:
    print(i, "/", m)
    for name in categories:
        frame = categories[name]
        if frame.shape[0] > 0:
            k = random.randint(1, 10)
            if frame.shape[0] > k:
                attacks = pd.concat([attacks, frame.iloc[:k, :]], axis=0)
                categories[name] = frame.drop(range(k), axis=0).reset_index(drop=True)
                i += k
            else:
                attacks = pd.concat([attacks, frame], axis=0)
                i += frame.shape[0]
                categories[name] = pd.DataFrame()
attacks.to_csv('ton_iot/attacks.csv', index=False)

# second pass: interleave random chunks (10 to 20 rows) of normal traffic with the attacks file built above
normal = pd.read_csv('ton_iot/normal.csv')
attack = pd.read_csv('ton_iot/attacks.csv')
m = normal.shape[0] + attack.shape[0]
combined = pd.DataFrame()
i = 0
while i < m:
    print(i, "-", m)
    if normal.shape[0] > 0:
        k = random.randint(10, 20)
        if normal.shape[0] > k:
            combined = pd.concat([combined, normal.iloc[:k, :]], axis=0)
            normal.drop(range(k), axis=0, inplace=True)
            normal = normal.reset_index(drop=True)
            i += k
        else:
            combined = pd.concat([combined, normal], axis=0)
            i += normal.shape[0]
            normal = pd.DataFrame()
    if attack.shape[0] > 0:
        k = random.randint(10, 20)
        if attack.shape[0] > k:
            combined = pd.concat([combined, attack.iloc[:k, :]], axis=0)
            attack.drop(range(k), axis=0, inplace=True)
            attack = attack.reset_index(drop=True)
            i += k
        else:
            combined = pd.concat([combined, attack], axis=0)
            i += attack.shape[0]
            attack = pd.DataFrame()
combined.to_csv('ton_iot/all.csv', index=False)
# NOTE: same logic as the group_attack.py file: it randomly selects rows from normal or attack while keeping the order inside each class
import pandas as pd
import os
import numpy as np

# file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Ghypeddings', 'datasets', 'examples', 'CICDDoS2019', 'original', 'all.csv')
file_normal = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Ghypeddings', 'datasets', 'examples', 'CICDDoS2019', 'original', 'normal.csv')
file_attack = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Ghypeddings', 'datasets', 'examples', 'CICDDoS2019', 'original', 'attack.csv')
for_snapshotting = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Ghypeddings', 'datasets', 'examples', 'CICDDoS2019', 'original', 'for_snapshotting.csv')

# The commented block below was used once to split all.csv into balanced,
# timestamp-sorted normal.csv and attack.csv files.
# df = pd.read_csv(file, low_memory=False)
# print('>> shape:', df.shape)
# df.dropna(axis=0, inplace=True)
# normal = df[df[' Label'] == 'BENIGN']
# print('>> normal shape:', normal.shape)
# normal.to_csv(file_normal, index=False)
# attack = df[df[' Label'] != 'BENIGN']
# print('>> attack shape:', attack.shape)
# smallest = min(attack.shape[0], normal.shape[0])
# if normal.shape[0] <= attack.shape[0]:
#     normal = normal.sort_values(by=' Timestamp')
#     normal.to_csv(file_normal)
#     attack = attack.sample(n=normal.shape[0]).reset_index(drop=True)
#     attack = attack.sort_values(by=' Timestamp')
#     attack.to_csv(file_attack)
# else:
#     attack = attack.sort_values(by=' Timestamp')
#     attack.to_csv(file_attack)
#     normal = normal.sample(n=attack.shape[0]).reset_index(drop=True)
#     normal = normal.sort_values(by=' Timestamp')
#     normal.to_csv(file_normal)

df = pd.DataFrame()
normal = pd.read_csv(file_normal, low_memory=False)
attack = pd.read_csv(file_attack, low_memory=False)
attack[' Label'] = 1
normal[' Label'] = 0
nnodes = normal.shape[0]  # normal and attack are balanced by the splitting step above
i, j = 0, 0
stop = False
while not stop:
    if i < nnodes:
        k = np.random.randint(1, 20)
        if i + k > nnodes:
            k = nnodes - i
        print('Normal: [{},{}]'.format(i, i + k))
        df = pd.concat([df, normal.iloc[i:i + k, :]], ignore_index=True)
        i += k
    if j < nnodes:
        k = np.random.randint(1, 20)
        if j + k > nnodes:
            k = nnodes - j
        print('Attack: [{},{}]'.format(j, j + k))
        df = pd.concat([df, attack.iloc[j:j + k, :]], ignore_index=True)
        j += k
    if i == j == nnodes:
        stop = True
df.to_csv(for_snapshotting, index=False)
# NOTE: This file generates the snapshots from the ton_iot dataset. It does everything from cleaning through data splitting.
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pickle

def _filling_adjacency_numpy(data):
    # connect two flows whenever they share an endpoint IP (source or destination)
    N = data.shape[0]
    try:
        adjacency = np.zeros((N, N), dtype=bool)
    except Exception as e:
        print(f"An error occurred: {e}")
    source_ips = data['Src IP'].to_numpy()
    destination_ips = data['Dst IP'].to_numpy()
    mask = ((source_ips[:, np.newaxis] == source_ips) |
            (source_ips[:, np.newaxis] == destination_ips) |
            (destination_ips[:, np.newaxis] == source_ips) |
            (destination_ips[:, np.newaxis] == destination_ips))
    adjacency[mask] = True
    return adjacency

def save_samples(adj, features, labels, adj_path, features_path, labels_path):
    with open(adj_path, 'wb') as f:
        pickle.dump(adj, f)
    with open(features_path, 'wb') as f:
        pickle.dump(features, f)
    with open(labels_path, 'wb') as f:
        pickle.dump(labels, f)

nnodes = 1000
overlap = 0.25
directory = os.path.join(os.getcwd(), f'ton_iot_snapshots_{int(overlap*100)}_{nnodes}')
os.makedirs(directory)

df = pd.read_csv('other/ton_iot/all.csv')
df.dropna(axis=0, inplace=True)
df = df.reset_index(drop=True)
columns_to_exclude = ['Src IP', 'Dst IP', 'Label']
columns_to_normalize = [x for x in df.columns if x not in columns_to_exclude]
scaler = MinMaxScaler()
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

i = 0
j = 0
while i < df.shape[0]:
    print('snapshot:', j)
    if df.shape[0] > nnodes:
        data = df.iloc[:nnodes, :].copy()
        adj = _filling_adjacency_numpy(data)
        labels = data['Label'].to_numpy()
        data.drop(columns_to_exclude, axis=1, inplace=True)
        features = data.to_numpy()
        save_samples(adj, features, labels, os.path.join(directory, f'adjacency_{j}.pkl'), os.path.join(directory, f'features_{j}.pkl'), os.path.join(directory, f'labels_{j}.pkl'))
        j += 1
        i += nnodes
        # drop all but the trailing `overlap` fraction so consecutive snapshots overlap
        df.drop(range(nnodes - int(nnodes*overlap)), axis=0, inplace=True)
        df = df.reset_index(drop=True)
        print(np.sum(labels), len(labels) - np.sum(labels))
    else:
        # fewer than nnodes rows remain: build one last snapshot from what is left
        data = df.copy()
        adj = _filling_adjacency_numpy(data)
        labels = data['Label'].to_numpy()
        data.drop(columns_to_exclude, axis=1, inplace=True)
        features = data.to_numpy()
        save_samples(adj, features, labels, os.path.join(directory, f'adjacency_{j}.pkl'), os.path.join(directory, f'features_{j}.pkl'), os.path.join(directory, f'labels_{j}.pkl'))
        j += 1
        i += df.shape[0]