diff --git a/OTHER/anomaly_detector_selection/selecting_classifier_for_pvae.ipynb b/OTHER/anomaly_detector_selection/selecting_classifier_for_pvae.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..30ffe8f23f22abcddc9e55db7c5eefe51310b123 --- /dev/null +++ b/OTHER/anomaly_detector_selection/selecting_classifier_for_pvae.ipynb @@ -0,0 +1,1272 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "8620fcf3", + "metadata": {}, + "outputs": [], + "source": [ + "from Ghypeddings import *\n", + "import pandas as pd\n", + "import numpy as np\n", + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f9f909f0", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score , f1_score , recall_score , precision_score , roc_auc_score" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2ffdf671", + "metadata": {}, + "outputs": [], + "source": [ + "data = np.genfromtxt('pvae_embeddings_euc.csv', delimiter=',')\n", + "X = data[:,:-1]\n", + "y = data[:,-1]\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0024ea51", + "metadata": {}, + "outputs": [], + "source": [ + "def best_score_params(estimator,params):\n", + " grid_search = GridSearchCV(estimator=estimator, param_grid=params, cv=5)\n", + " grid_search.fit(X_train, y_train)\n", + " best_params = grid_search.best_params_\n", + " best_model = grid_search.best_estimator_\n", + " y_pred = best_model.predict(X_test)\n", + " accuracy = accuracy_score(y_test, y_pred)\n", + " f1 = f1_score(y_test, y_pred)\n", + " recall = recall_score(y_test, y_pred)\n", + " precision = precision_score(y_test, y_pred)\n", + " roc_auc = roc_auc_score(y_test, y_pred)\n", + " print(accuracy,f1,recall,precision,roc_auc)\n", + " cv_results = grid_search.cv_results_\n", + " df = pd.DataFrame(cv_results)\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "da99d4c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9646666666666667 0.9640677966101695 0.9543624161073826 0.9739726027397261 0.9645984265967377\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>mean_fit_time</th>\n", + " <th>std_fit_time</th>\n", + " <th>mean_score_time</th>\n", + " <th>std_score_time</th>\n", + " <th>param_n_estimators</th>\n", + " <th>params</th>\n", + " <th>split0_test_score</th>\n", + " <th>split1_test_score</th>\n", + " <th>split2_test_score</th>\n", + " <th>split3_test_score</th>\n", + " <th>split4_test_score</th>\n", + " <th>mean_test_score</th>\n", + " <th>std_test_score</th>\n", + " <th>rank_test_score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0.009738</td>\n", + " <td>0.006191</td>\n", + " <td>0.003436</td>\n", + " <td>0.005884</td>\n", + " <td>2</td>\n", + " <td>{'n_estimators': 
2}</td>\n", + " <td>0.825714</td>\n", + " <td>0.848571</td>\n", + " <td>0.842857</td>\n", + " <td>0.814286</td>\n", + " <td>0.828571</td>\n", + " <td>0.832000</td>\n", + " <td>0.012309</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.029227</td>\n", + " <td>0.009372</td>\n", + " <td>0.001003</td>\n", + " <td>0.000896</td>\n", + " <td>5</td>\n", + " <td>{'n_estimators': 5}</td>\n", + " <td>0.902857</td>\n", + " <td>0.932857</td>\n", + " <td>0.917143</td>\n", + " <td>0.902857</td>\n", + " <td>0.924286</td>\n", + " <td>0.916000</td>\n", + " <td>0.011829</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.057288</td>\n", + " <td>0.005125</td>\n", + " <td>0.001593</td>\n", + " <td>0.000797</td>\n", + " <td>10</td>\n", + " <td>{'n_estimators': 10}</td>\n", + " <td>0.940000</td>\n", + " <td>0.964286</td>\n", + " <td>0.958571</td>\n", + " <td>0.925714</td>\n", + " <td>0.918571</td>\n", + " <td>0.941429</td>\n", + " <td>0.017820</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>0.111675</td>\n", + " <td>0.001813</td>\n", + " <td>0.005273</td>\n", + " <td>0.005573</td>\n", + " <td>20</td>\n", + " <td>{'n_estimators': 20}</td>\n", + " <td>0.967143</td>\n", + " <td>0.975714</td>\n", + " <td>0.972857</td>\n", + " <td>0.967143</td>\n", + " <td>0.968571</td>\n", + " <td>0.970286</td>\n", + " <td>0.003429</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " mean_fit_time std_fit_time mean_score_time std_score_time \\\n", + "0 0.009738 0.006191 0.003436 0.005884 \n", + "1 0.029227 0.009372 0.001003 0.000896 \n", + "2 0.057288 0.005125 0.001593 0.000797 \n", + "3 0.111675 0.001813 0.005273 0.005573 \n", + "\n", + " param_n_estimators params split0_test_score \\\n", + "0 2 {'n_estimators': 2} 0.825714 \n", + "1 5 {'n_estimators': 5} 0.902857 \n", + "2 10 {'n_estimators': 10} 0.940000 \n", + "3 20 {'n_estimators': 20} 0.967143 \n", + "\n", + " split1_test_score split2_test_score split3_test_score split4_test_score \\\n", + "0 0.848571 0.842857 0.814286 0.828571 \n", + "1 0.932857 0.917143 0.902857 0.924286 \n", + "2 0.964286 0.958571 0.925714 0.918571 \n", + "3 0.975714 0.972857 0.967143 0.968571 \n", + "\n", + " mean_test_score std_test_score rank_test_score \n", + "0 0.832000 0.012309 4 \n", + "1 0.916000 0.011829 3 \n", + "2 0.941429 0.017820 2 \n", + "3 0.970286 0.003429 1 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.ensemble import AdaBoostClassifier\n", + "params = {\n", + " 'n_estimators' : [2,5,10,20]\n", + "}\n", + "estimator = AdaBoostClassifier()\n", + "best_score_params(estimator,params)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "54cd340d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.972 0.9718875502008033 0.974496644295302 0.9692923898531375 0.9720165340681809\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>mean_fit_time</th>\n", + " <th>std_fit_time</th>\n", + " 
<th>mean_score_time</th>\n", + " <th>std_score_time</th>\n", + " <th>param_max_depth</th>\n", + " <th>params</th>\n", + " <th>split0_test_score</th>\n", + " <th>split1_test_score</th>\n", + " <th>split2_test_score</th>\n", + " <th>split3_test_score</th>\n", + " <th>split4_test_score</th>\n", + " <th>mean_test_score</th>\n", + " <th>std_test_score</th>\n", + " <th>rank_test_score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0.008183</td>\n", + " <td>0.005421</td>\n", + " <td>0.000511</td>\n", + " <td>0.000448</td>\n", + " <td>2</td>\n", + " <td>{'max_depth': 2}</td>\n", + " <td>0.890000</td>\n", + " <td>0.914286</td>\n", + " <td>0.904286</td>\n", + " <td>0.862857</td>\n", + " <td>0.881429</td>\n", + " <td>0.890571</td>\n", + " <td>0.017902</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.011581</td>\n", + " <td>0.006063</td>\n", + " <td>0.000200</td>\n", + " <td>0.000400</td>\n", + " <td>3</td>\n", + " <td>{'max_depth': 3}</td>\n", + " <td>0.902857</td>\n", + " <td>0.924286</td>\n", + " <td>0.931429</td>\n", + " <td>0.921429</td>\n", + " <td>0.911429</td>\n", + " <td>0.918286</td>\n", + " <td>0.010037</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.009374</td>\n", + " <td>0.007654</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>4</td>\n", + " <td>{'max_depth': 4}</td>\n", + " <td>0.935714</td>\n", + " <td>0.942857</td>\n", + " <td>0.950000</td>\n", + " <td>0.941429</td>\n", + " <td>0.935714</td>\n", + " <td>0.941143</td>\n", + " <td>0.005299</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>0.014813</td>\n", + " <td>0.005525</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>5</td>\n", + " <td>{'max_depth': 5}</td>\n", + " <td>0.948571</td>\n", + " <td>0.972857</td>\n", + " <td>0.967143</td>\n", + " <td>0.960000</td>\n", + " <td>0.968571</td>\n", + " <td>0.963429</td>\n", + " <td>0.008505</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>0.019709</td>\n", + " <td>0.006017</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>10</td>\n", + " <td>{'max_depth': 10}</td>\n", + " <td>0.961429</td>\n", + " <td>0.971429</td>\n", + " <td>0.982857</td>\n", + " <td>0.972857</td>\n", + " <td>0.971429</td>\n", + " <td>0.972000</td>\n", + " <td>0.006797</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " mean_fit_time std_fit_time mean_score_time std_score_time \\\n", + "0 0.008183 0.005421 0.000511 0.000448 \n", + "1 0.011581 0.006063 0.000200 0.000400 \n", + "2 0.009374 0.007654 0.000000 0.000000 \n", + "3 0.014813 0.005525 0.000000 0.000000 \n", + "4 0.019709 0.006017 0.000000 0.000000 \n", + "\n", + " param_max_depth params split0_test_score split1_test_score \\\n", + "0 2 {'max_depth': 2} 0.890000 0.914286 \n", + "1 3 {'max_depth': 3} 0.902857 0.924286 \n", + "2 4 {'max_depth': 4} 0.935714 0.942857 \n", + "3 5 {'max_depth': 5} 0.948571 0.972857 \n", + "4 10 {'max_depth': 10} 0.961429 0.971429 \n", + "\n", + " split2_test_score split3_test_score split4_test_score mean_test_score \\\n", + "0 0.904286 0.862857 0.881429 0.890571 \n", + "1 0.931429 0.921429 0.911429 0.918286 \n", + "2 0.950000 0.941429 0.935714 0.941143 \n", + "3 0.967143 0.960000 0.968571 0.963429 \n", + "4 0.982857 0.972857 0.971429 0.972000 \n", + "\n", + " std_test_score rank_test_score \n", + "0 0.017902 5 \n", + "1 0.010037 4 \n", + "2 
0.005299 3 \n", + "3 0.008505 2 \n", + "4 0.006797 1 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "params = {\n", + " 'max_depth' : [2,3,4,5,10]\n", + "}\n", + "estimator = DecisionTreeClassifier()\n", + "best_score_params(estimator,params)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f63108dd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9706666666666667 0.9707446808510638 0.9798657718120806 0.9617918313570487 0.9707275878927952\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>mean_fit_time</th>\n", + " <th>std_fit_time</th>\n", + " <th>mean_score_time</th>\n", + " <th>std_score_time</th>\n", + " <th>param_n_neighbors</th>\n", + " <th>params</th>\n", + " <th>split0_test_score</th>\n", + " <th>split1_test_score</th>\n", + " <th>split2_test_score</th>\n", + " <th>split3_test_score</th>\n", + " <th>split4_test_score</th>\n", + " <th>mean_test_score</th>\n", + " <th>std_test_score</th>\n", + " <th>rank_test_score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0.0004</td>\n", + " <td>0.00049</td>\n", + " <td>0.056837</td>\n", + " <td>0.049092</td>\n", + " <td>2</td>\n", + " <td>{'n_neighbors': 2}</td>\n", + " <td>0.967143</td>\n", + " <td>0.984286</td>\n", + " <td>0.977143</td>\n", + " <td>0.964286</td>\n", + " <td>0.967143</td>\n", + " <td>0.972000</td>\n", + " <td>0.007538</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.0002</td>\n", + " <td>0.00040</td>\n", + " <td>0.036898</td>\n", + " <td>0.004190</td>\n", + " <td>5</td>\n", + " <td>{'n_neighbors': 5}</td>\n", + " <td>0.970000</td>\n", + " <td>0.974286</td>\n", + " <td>0.982857</td>\n", + " <td>0.972857</td>\n", + " <td>0.971429</td>\n", + " <td>0.974286</td>\n", + " <td>0.004518</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.0004</td>\n", + " <td>0.00049</td>\n", + " <td>0.033390</td>\n", + " <td>0.008457</td>\n", + " <td>10</td>\n", + " <td>{'n_neighbors': 10}</td>\n", + " <td>0.964286</td>\n", + " <td>0.975714</td>\n", + " <td>0.975714</td>\n", + " <td>0.965714</td>\n", + " <td>0.971429</td>\n", + " <td>0.970571</td>\n", + " <td>0.004832</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>0.0000</td>\n", + " <td>0.00000</td>\n", + " <td>0.035005</td>\n", + " <td>0.006923</td>\n", + " <td>15</td>\n", + " <td>{'n_neighbors': 15}</td>\n", + " <td>0.965714</td>\n", + " <td>0.972857</td>\n", + " <td>0.978571</td>\n", + " <td>0.964286</td>\n", + " <td>0.967143</td>\n", + " <td>0.969714</td>\n", + " <td>0.005299</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>0.0000</td>\n", + " <td>0.00000</td>\n", + " <td>0.040141</td>\n", + " <td>0.007525</td>\n", + " <td>20</td>\n", + " <td>{'n_neighbors': 20}</td>\n", + " <td>0.967143</td>\n", + " <td>0.974286</td>\n", + " <td>0.975714</td>\n", + " <td>0.955714</td>\n", + " <td>0.960000</td>\n", + " 
<td>0.966571</td>\n", + " <td>0.007804</td>\n", + " <td>5</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " mean_fit_time std_fit_time mean_score_time std_score_time \\\n", + "0 0.0004 0.00049 0.056837 0.049092 \n", + "1 0.0002 0.00040 0.036898 0.004190 \n", + "2 0.0004 0.00049 0.033390 0.008457 \n", + "3 0.0000 0.00000 0.035005 0.006923 \n", + "4 0.0000 0.00000 0.040141 0.007525 \n", + "\n", + " param_n_neighbors params split0_test_score \\\n", + "0 2 {'n_neighbors': 2} 0.967143 \n", + "1 5 {'n_neighbors': 5} 0.970000 \n", + "2 10 {'n_neighbors': 10} 0.964286 \n", + "3 15 {'n_neighbors': 15} 0.965714 \n", + "4 20 {'n_neighbors': 20} 0.967143 \n", + "\n", + " split1_test_score split2_test_score split3_test_score split4_test_score \\\n", + "0 0.984286 0.977143 0.964286 0.967143 \n", + "1 0.974286 0.982857 0.972857 0.971429 \n", + "2 0.975714 0.975714 0.965714 0.971429 \n", + "3 0.972857 0.978571 0.964286 0.967143 \n", + "4 0.974286 0.975714 0.955714 0.960000 \n", + "\n", + " mean_test_score std_test_score rank_test_score \n", + "0 0.972000 0.007538 2 \n", + "1 0.974286 0.004518 1 \n", + "2 0.970571 0.004832 3 \n", + "3 0.969714 0.005299 4 \n", + "4 0.966571 0.007804 5 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier\n", + "params = {\n", + " 'n_neighbors' : [2,5,10,15,20]\n", + "}\n", + "estimator = KNeighborsClassifier()\n", + "best_score_params(estimator,params)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7abc7f6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9453333333333334 0.944743935309973 0.9409395973154362 0.9485791610284168 0.9453042357438108\n" + ] + } + ], + "source": [ + "from sklearn.neural_network import MLPClassifier\n", + "params = {\n", + " 'solver':['adam','sgd','lbfgs'],\n", + " 'activation':['relu','identity','logistic','tanh'],\n", + " 'batch_size':[64],\n", + " 'learning_rate':['adaptive'],\n", + " 'max_iter':[50],\n", + " 'hidden_layer_sizes':[(1,10),(1,20),(1,5),(1,10),(2,10),(2,20),(2,15),(2,5)]\n", + "}\n", + "estimator = MLPClassifier()\n", + "df = best_score_params(estimator,params)\n", + "data = df[df['rank_test_score']==1]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "40d8f286", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>mean_fit_time</th>\n", + " <th>std_fit_time</th>\n", + " <th>mean_score_time</th>\n", + " <th>std_score_time</th>\n", + " <th>param_activation</th>\n", + " <th>param_batch_size</th>\n", + " <th>param_hidden_layer_sizes</th>\n", + " <th>param_learning_rate</th>\n", + " <th>param_max_iter</th>\n", + " <th>param_solver</th>\n", + " <th>params</th>\n", + " <th>split0_test_score</th>\n", + " <th>split1_test_score</th>\n", + " <th>split2_test_score</th>\n", + " <th>split3_test_score</th>\n", + " <th>split4_test_score</th>\n", + " <th>mean_test_score</th>\n", + " <th>std_test_score</th>\n", + " <th>rank_test_score</th>\n", + " </tr>\n", + " 
</thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>89</th>\n", + " <td>0.078499</td>\n", + " <td>0.009308</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>tanh</td>\n", + " <td>64</td>\n", + " <td>(2, 20)</td>\n", + " <td>adaptive</td>\n", + " <td>50</td>\n", + " <td>lbfgs</td>\n", + " <td>{'activation': 'tanh', 'batch_size': 64, 'hidd...</td>\n", + " <td>0.95</td>\n", + " <td>0.971429</td>\n", + " <td>0.965714</td>\n", + " <td>0.951429</td>\n", + " <td>0.948571</td>\n", + " <td>0.957429</td>\n", + " <td>0.00932</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " mean_fit_time std_fit_time mean_score_time std_score_time \\\n", + "89 0.078499 0.009308 0.0 0.0 \n", + "\n", + " param_activation param_batch_size param_hidden_layer_sizes \\\n", + "89 tanh 64 (2, 20) \n", + "\n", + " param_learning_rate param_max_iter param_solver \\\n", + "89 adaptive 50 lbfgs \n", + "\n", + " params split0_test_score \\\n", + "89 {'activation': 'tanh', 'batch_size': 64, 'hidd... 0.95 \n", + "\n", + " split1_test_score split2_test_score split3_test_score \\\n", + "89 0.971429 0.965714 0.951429 \n", + "\n", + " split4_test_score mean_test_score std_test_score rank_test_score \n", + "89 0.948571 0.957429 0.00932 1 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2f136115", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7693333333333333 0.7503607503607503 0.697986577181208 0.8112324492979719 0.7688608382594782\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>mean_fit_time</th>\n", + " <th>std_fit_time</th>\n", + " <th>mean_score_time</th>\n", + " <th>std_score_time</th>\n", + " <th>params</th>\n", + " <th>split0_test_score</th>\n", + " <th>split1_test_score</th>\n", + " <th>split2_test_score</th>\n", + " <th>split3_test_score</th>\n", + " <th>split4_test_score</th>\n", + " <th>mean_test_score</th>\n", + " <th>std_test_score</th>\n", + " <th>rank_test_score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0.001931</td>\n", + " <td>0.003862</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>{}</td>\n", + " <td>0.762857</td>\n", + " <td>0.758571</td>\n", + " <td>0.751429</td>\n", + " <td>0.76</td>\n", + " <td>0.748571</td>\n", + " <td>0.756286</td>\n", + " <td>0.005391</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " mean_fit_time std_fit_time mean_score_time std_score_time params \\\n", + "0 0.001931 0.003862 0.0 0.0 {} \n", + "\n", + " split0_test_score split1_test_score split2_test_score split3_test_score \\\n", + "0 0.762857 0.758571 0.751429 0.76 \n", + "\n", + " split4_test_score mean_test_score std_test_score rank_test_score \n", + "0 0.748571 0.756286 0.005391 1 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.naive_bayes import GaussianNB\n", 
+ "params = {\n", + "}\n", + "estimator = GaussianNB()\n", + "best_score_params(estimator,params)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "abb5b48b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9773333333333334 0.9772117962466489 0.978523489932886 0.9759036144578314 0.9773412151651184\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>mean_fit_time</th>\n", + " <th>std_fit_time</th>\n", + " <th>mean_score_time</th>\n", + " <th>std_score_time</th>\n", + " <th>param_max_depth</th>\n", + " <th>param_max_features</th>\n", + " <th>param_n_estimators</th>\n", + " <th>params</th>\n", + " <th>split0_test_score</th>\n", + " <th>split1_test_score</th>\n", + " <th>split2_test_score</th>\n", + " <th>split3_test_score</th>\n", + " <th>split4_test_score</th>\n", + " <th>mean_test_score</th>\n", + " <th>std_test_score</th>\n", + " <th>rank_test_score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0.009363</td>\n", + " <td>0.008160</td>\n", + " <td>0.000200</td>\n", + " <td>0.000399</td>\n", + " <td>2</td>\n", + " <td>None</td>\n", + " <td>1</td>\n", + " <td>{'max_depth': 2, 'max_features': None, 'n_esti...</td>\n", + " <td>0.890000</td>\n", + " <td>0.914286</td>\n", + " <td>0.918571</td>\n", + " <td>0.878571</td>\n", + " <td>0.875714</td>\n", + " <td>0.895429</td>\n", + " <td>0.017852</td>\n", + " <td>78</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.008879</td>\n", + " <td>0.007268</td>\n", + " <td>0.003329</td>\n", + " <td>0.006170</td>\n", + " <td>2</td>\n", + " <td>None</td>\n", + " <td>2</td>\n", + " <td>{'max_depth': 2, 'max_features': None, 'n_esti...</td>\n", + " <td>0.911429</td>\n", + " <td>0.911429</td>\n", + " <td>0.904286</td>\n", + " <td>0.877143</td>\n", + " <td>0.881429</td>\n", + " <td>0.897143</td>\n", + " <td>0.014874</td>\n", + " <td>76</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.023364</td>\n", + " <td>0.005673</td>\n", + " <td>0.000503</td>\n", + " <td>0.000447</td>\n", + " <td>2</td>\n", + " <td>None</td>\n", + " <td>3</td>\n", + " <td>{'max_depth': 2, 'max_features': None, 'n_esti...</td>\n", + " <td>0.900000</td>\n", + " <td>0.928571</td>\n", + " <td>0.914286</td>\n", + " <td>0.862857</td>\n", + " <td>0.902857</td>\n", + " <td>0.901714</td>\n", + " <td>0.021879</td>\n", + " <td>73</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>0.024603</td>\n", + " <td>0.009061</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>2</td>\n", + " <td>None</td>\n", + " <td>4</td>\n", + " <td>{'max_depth': 2, 'max_features': None, 'n_esti...</td>\n", + " <td>0.908571</td>\n", + " <td>0.912857</td>\n", + " <td>0.904286</td>\n", + " <td>0.862857</td>\n", + " <td>0.908571</td>\n", + " <td>0.899429</td>\n", + " <td>0.018486</td>\n", + " <td>74</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>0.036898</td>\n", + " <td>0.006984</td>\n", + " <td>0.000201</td>\n", + " <td>0.000401</td>\n", + " <td>2</td>\n", + " <td>None</td>\n", + " <td>5</td>\n", + " <td>{'max_depth': 2, 
'max_features': None, 'n_esti...</td>\n", + " <td>0.890000</td>\n", + " <td>0.921429</td>\n", + " <td>0.904286</td>\n", + " <td>0.862857</td>\n", + " <td>0.895714</td>\n", + " <td>0.894857</td>\n", + " <td>0.019200</td>\n", + " <td>79</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>85</th>\n", + " <td>0.006250</td>\n", + " <td>0.007655</td>\n", + " <td>0.003125</td>\n", + " <td>0.006250</td>\n", + " <td>10</td>\n", + " <td>log2</td>\n", + " <td>2</td>\n", + " <td>{'max_depth': 10, 'max_features': 'log2', 'n_e...</td>\n", + " <td>0.964286</td>\n", + " <td>0.978571</td>\n", + " <td>0.977143</td>\n", + " <td>0.967143</td>\n", + " <td>0.968571</td>\n", + " <td>0.971143</td>\n", + " <td>0.005671</td>\n", + " <td>20</td>\n", + " </tr>\n", + " <tr>\n", + " <th>86</th>\n", + " <td>0.009375</td>\n", + " <td>0.007655</td>\n", + " <td>0.003125</td>\n", + " <td>0.006250</td>\n", + " <td>10</td>\n", + " <td>log2</td>\n", + " <td>3</td>\n", + " <td>{'max_depth': 10, 'max_features': 'log2', 'n_e...</td>\n", + " <td>0.975714</td>\n", + " <td>0.974286</td>\n", + " <td>0.982857</td>\n", + " <td>0.977143</td>\n", + " <td>0.974286</td>\n", + " <td>0.976857</td>\n", + " <td>0.003182</td>\n", + " <td>9</td>\n", + " </tr>\n", + " <tr>\n", + " <th>87</th>\n", + " <td>0.018233</td>\n", + " <td>0.006401</td>\n", + " <td>0.000401</td>\n", + " <td>0.000491</td>\n", + " <td>10</td>\n", + " <td>log2</td>\n", + " <td>4</td>\n", + " <td>{'max_depth': 10, 'max_features': 'log2', 'n_e...</td>\n", + " <td>0.972857</td>\n", + " <td>0.977143</td>\n", + " <td>0.975714</td>\n", + " <td>0.971429</td>\n", + " <td>0.975714</td>\n", + " <td>0.974571</td>\n", + " <td>0.002100</td>\n", + " <td>13</td>\n", + " </tr>\n", + " <tr>\n", + " <th>88</th>\n", + " <td>0.025054</td>\n", + " <td>0.007694</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>10</td>\n", + " <td>log2</td>\n", + " <td>5</td>\n", + " <td>{'max_depth': 10, 'max_features': 'log2', 'n_e...</td>\n", + " <td>0.975714</td>\n", + " <td>0.978571</td>\n", + " <td>0.984286</td>\n", + " <td>0.977143</td>\n", + " <td>0.980000</td>\n", + " <td>0.979143</td>\n", + " <td>0.002942</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>89</th>\n", + " <td>0.042915</td>\n", + " <td>0.008787</td>\n", + " <td>0.000626</td>\n", + " <td>0.000767</td>\n", + " <td>10</td>\n", + " <td>log2</td>\n", + " <td>10</td>\n", + " <td>{'max_depth': 10, 'max_features': 'log2', 'n_e...</td>\n", + " <td>0.975714</td>\n", + " <td>0.980000</td>\n", + " <td>0.982857</td>\n", + " <td>0.980000</td>\n", + " <td>0.981429</td>\n", + " <td>0.980000</td>\n", + " <td>0.002390</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>90 rows × 16 columns</p>\n", + "</div>" + ], + "text/plain": [ + " mean_fit_time std_fit_time mean_score_time std_score_time \\\n", + "0 0.009363 0.008160 0.000200 0.000399 \n", + "1 0.008879 0.007268 0.003329 0.006170 \n", + "2 0.023364 0.005673 0.000503 0.000447 \n", + "3 0.024603 0.009061 0.000000 0.000000 \n", + "4 0.036898 0.006984 0.000201 0.000401 \n", + ".. ... ... ... ... 
\n", + "85 0.006250 0.007655 0.003125 0.006250 \n", + "86 0.009375 0.007655 0.003125 0.006250 \n", + "87 0.018233 0.006401 0.000401 0.000491 \n", + "88 0.025054 0.007694 0.000000 0.000000 \n", + "89 0.042915 0.008787 0.000626 0.000767 \n", + "\n", + " param_max_depth param_max_features param_n_estimators \\\n", + "0 2 None 1 \n", + "1 2 None 2 \n", + "2 2 None 3 \n", + "3 2 None 4 \n", + "4 2 None 5 \n", + ".. ... ... ... \n", + "85 10 log2 2 \n", + "86 10 log2 3 \n", + "87 10 log2 4 \n", + "88 10 log2 5 \n", + "89 10 log2 10 \n", + "\n", + " params split0_test_score \\\n", + "0 {'max_depth': 2, 'max_features': None, 'n_esti... 0.890000 \n", + "1 {'max_depth': 2, 'max_features': None, 'n_esti... 0.911429 \n", + "2 {'max_depth': 2, 'max_features': None, 'n_esti... 0.900000 \n", + "3 {'max_depth': 2, 'max_features': None, 'n_esti... 0.908571 \n", + "4 {'max_depth': 2, 'max_features': None, 'n_esti... 0.890000 \n", + ".. ... ... \n", + "85 {'max_depth': 10, 'max_features': 'log2', 'n_e... 0.964286 \n", + "86 {'max_depth': 10, 'max_features': 'log2', 'n_e... 0.975714 \n", + "87 {'max_depth': 10, 'max_features': 'log2', 'n_e... 0.972857 \n", + "88 {'max_depth': 10, 'max_features': 'log2', 'n_e... 0.975714 \n", + "89 {'max_depth': 10, 'max_features': 'log2', 'n_e... 0.975714 \n", + "\n", + " split1_test_score split2_test_score split3_test_score \\\n", + "0 0.914286 0.918571 0.878571 \n", + "1 0.911429 0.904286 0.877143 \n", + "2 0.928571 0.914286 0.862857 \n", + "3 0.912857 0.904286 0.862857 \n", + "4 0.921429 0.904286 0.862857 \n", + ".. ... ... ... \n", + "85 0.978571 0.977143 0.967143 \n", + "86 0.974286 0.982857 0.977143 \n", + "87 0.977143 0.975714 0.971429 \n", + "88 0.978571 0.984286 0.977143 \n", + "89 0.980000 0.982857 0.980000 \n", + "\n", + " split4_test_score mean_test_score std_test_score rank_test_score \n", + "0 0.875714 0.895429 0.017852 78 \n", + "1 0.881429 0.897143 0.014874 76 \n", + "2 0.902857 0.901714 0.021879 73 \n", + "3 0.908571 0.899429 0.018486 74 \n", + "4 0.895714 0.894857 0.019200 79 \n", + ".. ... ... ... ... 
\n", + "85 0.968571 0.971143 0.005671 20 \n", + "86 0.974286 0.976857 0.003182 9 \n", + "87 0.975714 0.974571 0.002100 13 \n", + "88 0.980000 0.979143 0.002942 2 \n", + "89 0.981429 0.980000 0.002390 1 \n", + "\n", + "[90 rows x 16 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "params = {\n", + " 'max_features' : [None,'sqrt','log2'],\n", + " 'n_estimators' : [1,2,3,4,5,10],\n", + " 'max_depth': [2,3,4,5,10]\n", + "}\n", + "estimator = RandomForestClassifier()\n", + "best_score_params(estimator,params)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "78c516c1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.956 0.9550408719346049 0.9409395973154362 0.9695712309820194 0.955900262233877\n" + ] + } + ], + "source": [ + "from sklearn import svm\n", + "params = {\n", + " 'kernel' : ['rbf'],\n", + " 'gamma':['scale'],\n", + " 'C':[1,.5],\n", + " 'degree':[3,4]\n", + "}\n", + "estimator = svm.SVC()\n", + "df = best_score_params(estimator,params)\n", + "data = df[df['rank_test_score']==1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4480a184", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c82bc57", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63936e9f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2285fedf", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf623aeb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9655ecd0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/OTHER/anomaly_detector_selection/selecting_outlier_detection_for_pvae.ipynb b/OTHER/anomaly_detector_selection/selecting_outlier_detection_for_pvae.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..defcb6478982175fb713b05bc0525073f3aae90a --- /dev/null +++ b/OTHER/anomaly_detector_selection/selecting_outlier_detection_for_pvae.ipynb @@ -0,0 +1,434 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "8620fcf3", + "metadata": {}, + "outputs": [], + "source": [ + "from Ghypeddings import *\n", + "import pandas as pd\n", + "import numpy as np\n", + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f9f909f0", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score , f1_score , recall_score , precision_score , roc_auc_score" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "2ffdf671", + "metadata": {}, + "outputs": [], + "source": [ + "data = 
np.genfromtxt('hgcae_unsw_nb_5_embeddings_euc.csv', delimiter=',')\n", + "X = data[:,:-1]\n", + "y = data[:,-1]\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "0024ea51", + "metadata": {}, + "outputs": [], + "source": [ + "def best_score_params(estimator,params):\n", + " grid_search = GridSearchCV(estimator=estimator, param_grid=params, cv=5)\n", + " grid_search.fit(X_train, y_train)\n", + " best_params = grid_search.best_params_\n", + " best_model = grid_search.best_estimator_\n", + " y_pred = best_model.predict(X_test)\n", + " accuracy = accuracy_score(y_test, y_pred)\n", + " f1 = f1_score(y_test, y_pred)\n", + " recall = recall_score(y_test, y_pred)\n", + " precision = precision_score(y_test, y_pred)\n", + " roc_auc = roc_auc_score(y_test, y_pred)\n", + " print(accuracy,f1,recall,precision,roc_auc)\n", + " cv_results = grid_search.cv_results_\n", + " df = pd.DataFrame(cv_results)\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "d1333b28", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score\n", + "\n", + "def calculate_metrics(y_true,y_pred):\n", + " acc = accuracy_score(y_true,y_pred)\n", + " f1 = f1_score(y_true,y_pred)\n", + " rec = recall_score(y_true,y_pred)\n", + " pre = precision_score(y_true,y_pred)\n", + " roc = roc_auc_score(y_true,y_pred)\n", + " return acc,f1,rec,pre,roc" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "efe4163c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2:\n", + "(0.9492, 0.746, 0.746, 0.746, 0.8588888888888888)\n", + "3:\n", + "(0.9288, 0.644, 0.644, 0.644, 0.8022222222222223)\n", + "5:\n", + "(0.9548, 0.774, 0.774, 0.774, 0.8744444444444445)\n", + "10:\n", + "(0.932, 0.66, 0.66, 0.66, 0.8111111111111111)\n", + "20:\n", + "(0.9428, 0.714, 0.714, 0.714, 0.841111111111111)\n", + "40:\n", + "(0.9448, 0.724, 0.724, 0.724, 0.8466666666666667)\n", + "50:\n", + "(0.9412, 0.706, 0.706, 0.706, 0.8366666666666667)\n" + ] + } + ], + "source": [ + "from sklearn.cluster import KMeans\n", + "\n", + "def kmeans(X,y,n_clusters,outlier_percentage=.1):\n", + " model = KMeans(n_clusters=n_clusters)\n", + " model.fit(X)\n", + " y_pred = model.predict(X)\n", + " distances = model.transform(X).min(axis=1)\n", + " threshold = np.percentile(distances, 100 * (1 - outlier_percentage))\n", + " outliers = distances > threshold\n", + " return calculate_metrics(y,outliers)\n", + "print('2:')\n", + "print(kmeans(X,y,2))\n", + "print('3:')\n", + "print(kmeans(X,y,3))\n", + "print('5:')\n", + "print(kmeans(X,y,5))\n", + "print('10:')\n", + "print(kmeans(X,y,10))\n", + "print('20:')\n", + "print(kmeans(X,y,20))\n", + "print('40:')\n", + "print(kmeans(X,y,50))\n", + "print('50:')\n", + "print(kmeans(X,y,50))" + ] + }, + { + "cell_type": "markdown", + "id": "170e9b21", + "metadata": {}, + "source": [ + "# LE reste" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "id": "da99d4c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5:\n", + "(0.8908, 0.024999999999999998, 0.014, 0.11666666666666667, 0.5011111111111112)\n", + "10:\n", + "(0.8926, 0.1408, 0.088, 0.352, 0.535)\n", + "20:\n", + "(0.8868, 0.23097826086956522, 0.17, 0.3601694915254237, 0.5682222222222222)\n", + "50:\n", + "(0.8664, 
0.2707423580786026, 0.248, 0.2980769230769231, 0.5915555555555555)\n", + "100:\n", + "(0.789, 0.23606082548877624, 0.326, 0.18501702610669693, 0.5832222222222222)\n" + ] + } + ], + "source": [ + "from sklearn.cluster import DBSCAN\n", + "\n", + "def dbscan(X,y,min_samples=5):\n", + " dbscan = DBSCAN(eps=0.1, min_samples=min_samples)\n", + " labels = dbscan.fit_predict(X)\n", + " outliers = labels == -1\n", + " return calculate_metrics(y,outliers)\n", + "print('5:')\n", + "print(dbscan(X,y,min_samples = 5))\n", + "print('10:')\n", + "print(dbscan(X,y,min_samples = 10))\n", + "print('20:')\n", + "print(dbscan(X,y,min_samples = 20))\n", + "print('50:')\n", + "print(dbscan(X,y,min_samples = 50))\n", + "print('100:')\n", + "print(dbscan(X,y,min_samples = 100))" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "id": "54cd340d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.8586, 0.2922922922922923, 0.292, 0.2925851703406814, 0.6067777777777777)" + ] + }, + "execution_count": 200, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.ensemble import IsolationForest\n", + "\n", + "def isolation_forest(X,y,anomalies_percentage = 0.1):\n", + " model = IsolationForest(contamination=anomalies_percentage)\n", + " model.fit(X)\n", + " y_pred = model.predict(X)\n", + " y_pred[y_pred == 1] = 0\n", + " y_pred[y_pred == -1]= 1\n", + " return calculate_metrics(y,y_pred)\n", + "isolation_forest(X,y)" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "id": "f63108dd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2:\n", + "(0.8486, 0.24072216649949846, 0.24, 0.2414486921529175, 0.5781111111111111)\n", + "3:\n", + "(0.8476, 0.23647294589178355, 0.236, 0.23694779116465864, 0.5757777777777777)\n", + "5:\n", + "(0.8516, 0.258, 0.258, 0.258, 0.5877777777777777)\n", + "10:\n", + "(0.8248, 0.124, 0.124, 0.124, 0.5133333333333334)\n", + "20:\n", + "(0.8352, 0.176, 0.176, 0.176, 0.5422222222222222)\n", + "40:\n", + "(0.84, 0.20000000000000004, 0.2, 0.2, 0.5555555555555555)\n", + "50:\n", + "(0.842, 0.20999999999999996, 0.21, 0.21, 0.5611111111111111)\n" + ] + } + ], + "source": [ + "from sklearn.cluster import KMeans\n", + "\n", + "def kmeans(X,y,n_clusters,outlier_percentage=.1):\n", + " model = KMeans(n_clusters=n_clusters)\n", + " model.fit(X)\n", + " y_pred = model.predict(X)\n", + " distances = model.transform(X).min(axis=1)\n", + " threshold = np.percentile(distances, 100 * (1 - outlier_percentage))\n", + " outliers = distances > threshold\n", + " return calculate_metrics(y,outliers)\n", + "print('2:')\n", + "print(kmeans(X,y,2))\n", + "print('3:')\n", + "print(kmeans(X,y,3))\n", + "print('5:')\n", + "print(kmeans(X,y,5))\n", + "print('10:')\n", + "print(kmeans(X,y,10))\n", + "print('20:')\n", + "print(kmeans(X,y,20))\n", + "print('40:')\n", + "print(kmeans(X,y,50))\n", + "print('50:')\n", + "print(kmeans(X,y,50))" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "id": "7abc7f6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5:\n", + "(0.8272, 0.136, 0.136, 0.136, 0.52)\n", + "10:\n", + "(0.8272, 0.136, 0.136, 0.136, 0.52)\n", + "15:\n", + "(0.8272, 0.136, 0.136, 0.136, 0.52)\n", + "20\n", + "(0.83, 0.15, 0.15, 0.15, 0.5277777777777778)\n", + "30\n", + "(0.83, 0.15, 0.15, 0.15, 0.5277777777777778)\n", + "50\n", + "(0.8572, 0.286, 0.286, 0.286, 0.6033333333333333)\n" + ] + } + ], + "source": [ + "from 
sklearn.neighbors import LocalOutlierFactor\n", + "\n", + "\n", + "def local_outlier_factor(X,y,n_neighbors=20,outlier_percentage=.1):\n", + " lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=outlier_percentage)\n", + " y_pred = lof.fit_predict(X)\n", + " y_pred[y_pred == 1] = 0\n", + " y_pred[y_pred == -1] = 1\n", + " return calculate_metrics(y,y_pred)\n", + "\n", + "print('5:')\n", + "print(local_outlier_factor(X,y,n_neighbors=5))\n", + "print('10:')\n", + "print(local_outlier_factor(X,y,n_neighbors=5))\n", + "print('15:')\n", + "print(local_outlier_factor(X,y,n_neighbors=5))\n", + "print('20')\n", + "print(local_outlier_factor(X,y,n_neighbors=20))\n", + "print('30')\n", + "print(local_outlier_factor(X,y,n_neighbors=20))\n", + "print('50')\n", + "print(local_outlier_factor(X,y,n_neighbors=50))" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "id": "40d8f286", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rbf:\n", + "(0.7028, 0.23638232271325796, 0.46, 0.1590594744121715, 0.594888888888889)\n", + "linear:\n", + "(0.1028, 0.17868912486268765, 0.976, 0.09834744054816606, 0.4908888888888889)\n", + "poly:\n", + "(0.1, 0.18181818181818182, 1.0, 0.1, 0.5)\n", + "sigmoid:\n", + "(0.424, 0.17383820998278832, 0.606, 0.10147354320160751, 0.5048888888888889)\n" + ] + } + ], + "source": [ + "from sklearn.svm import OneClassSVM\n", + "\n", + "def one_class_svm(X,y, kernel='rbf',nu=0.1):\n", + " model = OneClassSVM(kernel=kernel, nu=nu)\n", + " model.fit(X)\n", + " y_pred = model.predict(X)\n", + " y_pred[y_pred == 1]=0\n", + " y_pred[y_pred == -1] = 1\n", + " return calculate_metrics(y,y_pred)\n", + "\n", + "print('rbf:')\n", + "print(one_class_svm(X,y,kernel='rbf'))\n", + "print('linear:')\n", + "print(one_class_svm(X,y,kernel='linear'))\n", + "print('poly:')\n", + "print(one_class_svm(X,y,kernel='poly'))\n", + "print('sigmoid:')\n", + "print(one_class_svm(X,y,kernel='sigmoid'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f136115", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abb5b48b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78c516c1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4480a184", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6220f3c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6b34d26", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/OTHER/anomaly_detector_selection/selection_clustering_algorithm.ipynb b/OTHER/anomaly_detector_selection/selection_clustering_algorithm.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..192928ef74253afc150e8d53611d33eec2c94d8d --- /dev/null +++ b/OTHER/anomaly_detector_selection/selection_clustering_algorithm.ipynb 
@@ -0,0 +1,293 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "bf96a516", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3399745c", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score\n", + "def calculate_metrics(y_true,y_pred):\n", + " acc = accuracy_score(y_true,y_pred)\n", + " f1 = f1_score(y_true,y_pred)\n", + " rec = recall_score(y_true,y_pred)\n", + " pre = precision_score(y_true,y_pred)\n", + " roc = roc_auc_score(y_true,y_pred)\n", + " return [round(acc,4),round(f1,4),round(rec,4),round(pre,4),round(roc,4)]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f2790afd", + "metadata": {}, + "outputs": [], + "source": [ + "def group_clusters(y_true,y_pred):\n", + " pairs = []\n", + " for k,v in zip(y_true,y_pred):\n", + " tup = (int(k),v)\n", + " pairs.append(tup)\n", + " occurrences = {}\n", + " for item in pairs:\n", + " if item in occurrences:\n", + " occurrences[item] += 1\n", + " else:\n", + " occurrences[item] = 1\n", + " a = sorted(occurrences.items(), key=lambda item: item[1])[::-1]\n", + " normal,attack = [],[]\n", + " for item in a:\n", + " if item[0][1] not in normal and item[0][1] not in attack:\n", + " if item[0][0] == 0:\n", + " normal.append(item[0][1])\n", + " else:\n", + " attack.append(item[0][1])\n", + " for i in normal:\n", + " y_pred[y_pred == i] = 0\n", + " for j in attack:\n", + " y_pred[y_pred == j] = 1\n", + " return y_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "49759d76", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import AgglomerativeClustering\n", + "def agglomerative_clustering(X,y,n_clusters = 100):\n", + " model = AgglomerativeClustering(n_clusters=n_clusters)\n", + " labels = model.fit_predict(X)\n", + " labels = group_clusters(y,labels)\n", + " return calculate_metrics(y,labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b4bbd3f7", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import DBSCAN\n", + "def dbscan(X,y,eps=1e-2,min_samples=20):\n", + " model = DBSCAN(eps=eps, min_samples=min_samples)\n", + " y_pred = model.fit_predict(X)\n", + " mask = y_pred != -1\n", + " y_true_filtered = y[mask]\n", + " y_pred_filtered = y_pred[mask]\n", + " y_pred_filtered = group_clusters(y_true_filtered,y_pred_filtered)\n", + " return calculate_metrics(y_true_filtered,y_pred_filtered)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "05d6a66a", + "metadata": {}, + "outputs": [], + "source": [ + "import skfuzzy as fuzz\n", + "def fuzzy_c_mean(X,y,n_clusters=10,power=2,error=0.01,maxiter=1000,init=None):\n", + " X_transposed = np.transpose(X)\n", + " cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(X_transposed, n_clusters, power, error=error, maxiter=maxiter, init=init)\n", + " y_pred = np.argmax(u, axis=0)\n", + " y_pred = group_clusters(y,y_pred)\n", + " return calculate_metrics(y,y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0c26e329", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.mixture import GaussianMixture\n", + "def gaussian_mixture(X,y,n_components=20):\n", + " model = GaussianMixture(n_components=n_components)\n", + " y_pred = model.fit_predict(X)\n", + " y_pred = group_clusters(y,y_pred)\n", + " 
return calculate_metrics(y,y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f4226733", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans\n", + "def kmeans(X,y,n_clusters=10,n_init=10):\n", + " model = KMeans(n_clusters=n_clusters,n_init=n_init)\n", + " model.fit(X)\n", + " y_pred = model.labels_\n", + " y_pred = group_clusters(y,y_pred)\n", + " return calculate_metrics(y,y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6a07e67b", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import MeanShift\n", + "def mean_shift(X,y):\n", + " y_pred = MeanShift(n_jobs=-1,max_iter=10).fit_predict(X)\n", + " y_pred = group_clusters(y,y_pred)\n", + " return calculate_metrics(y,y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "23bac324", + "metadata": {}, + "outputs": [], + "source": [ + "datasets = ['awid3','bot_iot','ddos2019','darknet','ids2018','ton_iot','unsw_nb15']\n", + "models = ['hgcae','pvae']\n", + "dim = 20" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15cd3d01", + "metadata": {}, + "outputs": [], + "source": [ + "for model in models:\n", + " for dataset in datasets:\n", + " print(dataset,\"--------------------------\")\n", + " file = os.path.join(os.getcwd(),'embeddings',f'{model}_{dataset}_{dim}_embeddings_euc.csv')\n", + " data = np.genfromtxt(file, delimiter=\",\", usemask=True)\n", + " X = data[:,:-1]\n", + " y = data[:,-1]\n", + " results =[]\n", + " results.append(agglomerative_clustering(X,y))\n", + " print('agglomerative',results[-1])\n", + " results.append(dbscan(X,y))\n", + " print('dbscan',results[-1])\n", + " results.append(fuzzy_c_mean(X,y))\n", + " print('fuzzy c mean',results[-1])\n", + " results.append(gaussian_mixture(X,y))\n", + " print('gaussian',results[-1])\n", + " results.append(kmeans(X,y))\n", + " print('kmeans',results[-1])\n", + " results.append(mean_shift(X,y))\n", + " print('mean shift',results[-1])\n", + " df = pd.DataFrame(np.array(results))\n", + " df.to_csv(f'{model}_{dataset}.csv',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ab2d406e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dbscan [0.4744, 0.6435, 1.0, 0.4744, 0.5]\n" + ] + } + ], + "source": [ + "dataset = 'ton_iot'\n", + "model = 'hgcae'\n", + "dim = 20\n", + "file = os.path.join(os.getcwd(),'embeddings',f'{model}_{dataset}_{dim}_embeddings_euc.csv')\n", + "data = np.genfromtxt(file, delimiter=\",\", usemask=True)\n", + "X = data[:,:-1]\n", + "y = data[:,-1]\n", + "results =[]\n", + "# c = [60,70,80]\n", + "# for i in c:\n", + "# results.append(agglomerative_clustering(X,y,n_clusters = i))\n", + "# print('agglomerative',results[-1])\n", + "\n", + "#results.append(dbscan(X,y))\n", + "#print('dbscan',results[-1])\n", + "# c = [2,5,10,20,50,70,100,150,200]\n", + "# for i in c:\n", + "# results.append(fuzzy_c_mean(X,y,n_clusters=i))\n", + "# print('fuzzy c mean',results[-1])\n", + "# c = [2,5,10,20,50,100,200]\n", + "# for i in c:\n", + "# results.append(gaussian_mixture(X,y,n_components=i))\n", + "# print('gaussian',results[-1])\n", + "# c = []\n", + "# results.append(kmeans(X,y))\n", + "# print('kmeans',results[-1])\n", + "# results.append(mean_shift(X,y))\n", + "# print('mean shift',results[-1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd862da6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "c7760d5f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da32a2d7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/OTHER/calculate_hyperbolicity.py b/OTHER/calculate_hyperbolicity.py new file mode 100644 index 0000000000000000000000000000000000000000..0eef963000acf0470642d5e487ed51e422f78a54 --- /dev/null +++ b/OTHER/calculate_hyperbolicity.py @@ -0,0 +1,21 @@ +# NOTE: this file calcules the hyperbolicity of all the datasets + + +import numpy as np +from Ghypeddings.datasets.utils import hyperbolicity +from Ghypeddings.datasets.datasets import CIC_DDoS2019,AWID3,NF_CIC_IDS2018_v2,Darknet,NF_TON_IoT_v2,NF_BOT_IoT_v2,NF_UNSW_NB15_v2 + +datasets = [['ddos2019',CIC_DDoS2019], + ['awid3',AWID3], + ['ids2018',NF_CIC_IDS2018_v2], + ['darknet',Darknet], + ['ton_iot',NF_TON_IoT_v2], + ['bot_iot',NF_BOT_IoT_v2], + ['unsw_nb',NF_UNSW_NB15_v2]] +for dataset in datasets: + h_mean = [] + for i in range(5): + adj,_,_ = dataset[1]().load_samples(repetition=0) + h = hyperbolicity(adj,num_samples=10) + h_mean.append(h) + print(dataset[0],np.mean(h_mean)) \ No newline at end of file diff --git a/OTHER/group_all_dataset_files.py b/OTHER/group_all_dataset_files.py new file mode 100644 index 0000000000000000000000000000000000000000..29033f6fd7a01e9658bb2ed81af13898a4c8108b --- /dev/null +++ b/OTHER/group_all_dataset_files.py @@ -0,0 +1,30 @@ +# NOTE: add this .py file inside the folder containing all the files of single dataset and run it +# to concatenate all those files into a single one +# this helps sampling a representative data + +# this is the first OTHER script used after downloading the datasets + + +import os +import pandas as pd + + + + +def run(): + directory = os.getcwd() + files = [f for f in os.listdir(directory) if os.path.isfile(f) and '.py' not in f] + all = [] + for file in files: + df = pd.read_csv(file,low_memory=False) + all.append(df) + + data = pd.concat(all) + # change the name of the dataset here + path = os.path.join(directory,'ton_iot.csv') + data.to_csv(path,index=False) + + + +if __name__ == "__main__": + run() \ No newline at end of file diff --git a/OTHER/snapshot_generation/NOTES.txt b/OTHER/snapshot_generation/NOTES.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a2f2ceb2ce1ce264f8b5b5e75f10d9942bca665 --- /dev/null +++ b/OTHER/snapshot_generation/NOTES.txt @@ -0,0 +1,6 @@ +In order to generate representative snapshots we had to reorganize the files of the dataset. 
+You have to execute those scripts in the following order:
+
+1- group attack: to group all the attacks into a single file
+2- group normal attack: group the attack file with the normal file
+3- execute one of the three scripts, each of which corresponds to a specific dataset
\ No newline at end of file
diff --git a/OTHER/snapshot_generation/darknet_snapshots_generation.py b/OTHER/snapshot_generation/darknet_snapshots_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..c35b1138657405f39e53c4cca2fba15f21fb3474
--- /dev/null
+++ b/OTHER/snapshot_generation/darknet_snapshots_generation.py
@@ -0,0 +1,76 @@
+# NOTE: This file generates the snapshots from the darknet dataset. It does everything, starting with the cleaning and moving on to the data splitting.
+
+
+import os
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
+import pickle
+
+
+def _to_binary_classification(x):
+    if 'Non' in x:
+        return 0
+    else:
+        return 1
+
+def _filling_adjacency_numpy(data):
+    N = data.shape[0]
+    try:
+        adjacency = np.zeros((N,N), dtype=bool)
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+    source_ips = data['Src IP'].to_numpy()
+    destination_ips = data['Dst IP'].to_numpy()
+    mask = ((source_ips[:, np.newaxis] == source_ips) | (source_ips[:, np.newaxis] == destination_ips) | (destination_ips[:, np.newaxis] == source_ips)| (destination_ips[:, np.newaxis] == destination_ips) )
+    adjacency[mask] = True
+    return adjacency
+
+def save_samples(adj,features,labels,adj_path,features_path,labels_path):
+    with open(adj_path,'wb') as f:
+        pickle.dump(adj,f)
+    with open(features_path,'wb') as f:
+        pickle.dump(features,f)
+    with open(labels_path,'wb') as f:
+        pickle.dump(labels,f)
+
+nnodes = 1000
+overlap = 0.25
+
+directory = os.path.join(os.getcwd(),f'darknet_snapshots_{int(overlap*100)}_{nnodes}')
+os.makedirs(directory)
+
+df = pd.read_csv('other/darknet/all.csv')
+df.dropna(axis=0,inplace=True)
+df = df.reset_index(drop=True)
+df['Label'] = df['Label'].apply(_to_binary_classification)
+columns_to_exclude = ['Flow ID', 'Src IP','Src Port', 'Dst IP','Dst Port', 'Timestamp','Label','Label.1','Protocol','Flow Duration']
+columns_to_normalize = [x for x in df.columns if x not in columns_to_exclude]
+scaler = MinMaxScaler()
+df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])
+
+i=0
+j=0
+while i< df.shape[0]:
+    print('snapshot:',j)
+    if df.shape[0] > nnodes:
+        data = df.iloc[:nnodes,:].copy()
+        adj = _filling_adjacency_numpy(data)
+        labels = data['Label'].to_numpy()
+        data.drop(columns_to_exclude, axis=1, inplace=True)
+        features = data.to_numpy()
+        save_samples(adj,features,labels,os.path.join(directory,f'adjacency_{j}.pkl'),os.path.join(directory,f'features_{j}.pkl'),os.path.join(directory,f'labels_{j}.pkl'))
+        j+=1
+        i+=nnodes
+        df.drop(range(nnodes - int(nnodes*overlap)),axis=0,inplace=True)
+        df = df.reset_index(drop=True)
+        print(np.sum(labels),len(labels)-np.sum(labels))
+    else:
+        # use the remaining rows in df directly (previously referenced the stale `data` frame)
+        adj = _filling_adjacency_numpy(df)
+        labels = df['Label'].to_numpy()
+        df.drop(columns_to_exclude, axis=1, inplace=True)
+        features = df.to_numpy()
+        save_samples(adj,features,labels,os.path.join(directory,f'adjacency_{j}.pkl'),os.path.join(directory,f'features_{j}.pkl'),os.path.join(directory,f'labels_{j}.pkl'))
+        j+=1
+        i+=df.shape[0]
diff --git a/OTHER/snapshot_generation/ddos2019_snapshots_generation.py b/OTHER/snapshot_generation/ddos2019_snapshots_generation.py
new file mode 100644
index 
0000000000000000000000000000000000000000..4c88a46d0f5c9a0b0a217821ada43c792d576ea8 --- /dev/null +++ b/OTHER/snapshot_generation/ddos2019_snapshots_generation.py @@ -0,0 +1,76 @@ +# NOTE: This file generates the snapshots from the ddos2019 dataset. It does everything, from cleaning the data to splitting it into snapshots. + + + + +import os +import pandas as pd +import numpy as np +from sklearn.preprocessing import MinMaxScaler +import pickle + + +def _filling_adjacency_numpy(data): +    N = data.shape[0] +    try: +        adjacency = np.zeros((N,N), dtype=bool) +    except Exception as e: +        print(f"An error occurred: {e}") + +    source_ips = data[' Source IP'].to_numpy() +    destination_ips = data[' Destination IP'].to_numpy() +    mask = ((source_ips[:, np.newaxis] == source_ips) | (source_ips[:, np.newaxis] == destination_ips) | (destination_ips[:, np.newaxis] == source_ips)| (destination_ips[:, np.newaxis] == destination_ips) ) +    adjacency[mask] = True +    return adjacency + +def save_samples(adj,features,labels,adj_path,features_path,labels_path): +    with open(adj_path,'wb') as f: +        pickle.dump(adj,f) +    with open(features_path,'wb') as f: +        pickle.dump(features,f) +    with open(labels_path,'wb') as f: +        pickle.dump(labels,f) + +nnodes = 1000 +overlap = 0.25 + +directory = os.path.join(os.getcwd(),f'ddos2019_snapshots_{int(overlap*100)}_{nnodes}') +os.makedirs(directory) + +df = pd.read_csv('other/ddos/all.csv') +df.dropna(axis=0,inplace=True) +df = df.reset_index(drop=True) +for column in df.columns: +    max_value = df.loc[df[column] != np.inf, column].max() +    min_value = df.loc[df[column] != -np.inf, column].min() +    df.loc[df[column] == np.inf, column] = max_value +    df.loc[df[column] == -np.inf, column] = min_value +columns_to_exclude = ['Flow ID', ' Source IP',' Source Port',' Destination Port',' Flow Duration',' Protocol', ' Destination IP', ' Timestamp', 'SimillarHTTP',' Inbound',' Label'] +columns_to_normalize = [x for x in df.columns if x not in columns_to_exclude] +scaler = MinMaxScaler() +df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize]) + +i=0 +j=0 +while i< df.shape[0]: +    print('snapshot:',j) +    if df.shape[0] > nnodes: +        data = df.iloc[:nnodes,:].copy() +        adj = _filling_adjacency_numpy(data) +        labels = data[' Label'].to_numpy() +        data.drop(columns_to_exclude, axis=1, inplace=True) +        features = data.to_numpy() +        save_samples(adj,features,labels,os.path.join(directory,f'adjacency_{j}.pkl'),os.path.join(directory,f'features_{j}.pkl'),os.path.join(directory,f'labels_{j}.pkl')) +        j+=1 +        i+=nnodes +        df.drop(range(nnodes - int(nnodes*overlap)),axis=0,inplace=True) +        df = df.reset_index(drop=True) +        print(np.sum(labels),len(labels)-np.sum(labels)) +    else: +        adj = _filling_adjacency_numpy(df) +        labels = df[' Label'].to_numpy() +        df.drop(columns_to_exclude, axis=1, inplace=True) +        features = df.to_numpy() +        save_samples(adj,features,labels,os.path.join(directory,f'adjacency_{j}.pkl'),os.path.join(directory,f'features_{j}.pkl'),os.path.join(directory,f'labels_{j}.pkl')) +        j+=1 +        i+=df.shape[0] diff --git a/OTHER/snapshot_generation/group_attack.py b/OTHER/snapshot_generation/group_attack.py new file mode 100644 index 0000000000000000000000000000000000000000..13357e3b8e3a9f04ba8f6d95ab128bed99aac89e --- /dev/null +++ b/OTHER/snapshot_generation/group_attack.py @@ -0,0 +1,183 @@ +# NOTE: here we only have the script for the TON_IOT dataset, but the rest of the datasets follow the same logic +# It starts by extracting the classes of attacks and the normal behaviour and then sort them by class
and then randomly select some raws from a randomly selected category +# the rows are sorted by the timestamp + +import pandas as pd +import os +import random + +def _to_binary_classification(x): + if 'Non' in x: + return 0 + else: + return 1 + + +backdoor = pd.read_csv(os.path.join('ton_iot','ton_iot_backdoor.csv')) +ddos = pd.read_csv(os.path.join('ton_iot','ton_iot_ddos.csv')) +dos = pd.read_csv(os.path.join('ton_iot','ton_iot_dos.csv')) +mitm = pd.read_csv(os.path.join('ton_iot','ton_iot_mitm.csv')) +password = pd.read_csv(os.path.join('ton_iot','ton_iot_password.csv')) +ransomware = pd.read_csv(os.path.join('ton_iot','ton_iot_ransomware.csv')) +scanning = pd.read_csv(os.path.join('ton_iot','ton_iot_scanning.csv')) +xss = pd.read_csv(os.path.join('ton_iot','ton_iot_xss.csv')) +injection = pd.read_csv(os.path.join('ton_iot','ton_iot_injection.csv')) +print('backdoor',backdoor.shape[0]) +print('ddos',ddos.shape[0]) +print('dos',dos.shape[0]) +print('mitm',mitm.shape[0]) +print('password',password.shape[0]) +print('ransomware',ransomware.shape[0]) +print('scanning',scanning.shape[0]) +print('xss',xss.shape[0]) +print('injection',injection.shape[0]) +m = backdoor.shape[0] + ddos.shape[0] + dos.shape[0] + mitm.shape[0] + password.shape[0] + ransomware.shape[0] + scanning.shape[0] + xss.shape[0] + injection.shape[0] +files = os.listdir(directory) + +normal = pd.read_csv('ton_iot/normal.csv') +attack = pd.read_csv('ton_iot/attacks.csv') + +m = normal.shape[0] + attack.shape[0] +all = pd.DataFrame() +i=0 +while i< m: + print(i,"-",m) + if normal.shape[0]>0: + k = random.randint(10,20) + if normal.shape[0] >k: + all = pd.concat([all,normal.iloc[:k,:]],axis=0) + normal.drop(range(k),axis=0,inplace=True) + normal = normal.reset_index(drop=True) + i+=k + else: + all = pd.concat([all,normal],axis=0) + i+=normal.shape[0] + normal = pd.DataFrame() + + if attack.shape[0]>0: + k = random.randint(10,20) + if attack.shape[0] >k: + all = pd.concat([all,attack.iloc[:k,:]],axis=0) + attack.drop(range(k),axis=0,inplace=True) + attack = attack.reset_index(drop=True) + i+=k + else: + all = pd.concat([all,attack],axis=0) + i+=attack.shape[0] + attack = pd.DataFrame() + +all.to_csv('ton_iot/all.csv',index=False) + +all = pd.DataFrame() +i = 0 +while i < m: + print(i,"/",m) + if backdoor.shape[0] >0: + k = random.randint(1,10) + if(backdoor.shape[0] >k): + all = pd.concat([all,backdoor.iloc[:k,:]],axis=0) + backdoor.drop(range(k),axis=0,inplace=True) + backdoor = backdoor.reset_index(drop=True) + i+=k + else: + all = pd.concat([all,backdoor],axis=0) + i+=backdoor.shape[0] + backdoor = pd.DataFrame() + + if ddos.shape[0] > 0: + k = random.randint(1,10) + if(ddos.shape[0] >k): + all = pd.concat([all,ddos.iloc[:k,:]],axis=0) + ddos.drop(list(range(k)),axis=0,inplace=True) + ddos = ddos.reset_index(drop=True) + i+=k + else: + all = pd.concat([all,ddos],axis=0) + i+=ddos.shape[0] + ddos = pd.DataFrame() + + if dos.shape[0] > 0: + k = random.randint(1,10) + if(dos.shape[0] >k): + all = pd.concat([all,dos.iloc[:k,:]],axis=0) + dos.drop(list(range(k)),axis=0,inplace=True) + dos = dos.reset_index(drop=True) + i+=k + else: + all = pd.concat([all,dos],axis=0) + i+=dos.shape[0] + dos = pd.DataFrame() + + if mitm.shape[0] > 0: + k = random.randint(1,10) + if(mitm.shape[0] >k): + all = pd.concat([all,mitm.iloc[:k,:]],axis=0) + mitm.drop(list(range(k)),axis=0,inplace=True) + mitm = mitm.reset_index(drop=True) + i+=k + else: + all = pd.concat([all,mitm],axis=0) + i+=mitm.shape[0] + mitm = pd.DataFrame() + + if password.shape[0] > 
0: + k = random.randint(1,10) + if(password.shape[0] >k): + all = pd.concat([all,password.iloc[:k,:]],axis=0) + password.drop(list(range(k)),axis=0,inplace=True) + password = password.reset_index(drop=True) + i+=k + else: + all = pd.concat([all,password],axis=0) + i+=password.shape[0] + password = pd.DataFrame() + + if ransomware.shape[0] > 0: + k = random.randint(1,10) + if(ransomware.shape[0] >k): + all = pd.concat([all,ransomware.iloc[:k,:]],axis=0) + ransomware.drop(list(range(k)),axis=0,inplace=True) + ransomware = ransomware.reset_index(drop=True) + i+=k + else: + all = pd.concat([all,ransomware],axis=0) + i+=ransomware.shape[0] + ransomware = pd.DataFrame() + + if scanning.shape[0] > 0: + k = random.randint(1,10) + if(scanning.shape[0] >k): + all = pd.concat([all,scanning.iloc[:k,:]],axis=0) + scanning.drop(list(range(k)),axis=0,inplace=True) + scanning = scanning.reset_index(drop=True) + i+=k + else: + all = pd.concat([all,scanning],axis=0) + i+=scanning.shape[0] + scanning = pd.DataFrame() + + if xss.shape[0] > 0: + k = random.randint(1,10) + if(xss.shape[0] >k): + all = pd.concat([all,xss.iloc[:k,:]],axis=0) + xss.drop(list(range(k)),axis=0,inplace=True) + xss = xss.reset_index(drop=True) + i+=k + else: + all = pd.concat([all,xss],axis=0) + i+=xss.shape[0] + xss = pd.DataFrame() + + if injection.shape[0] > 0: + k = random.randint(1,10) + if(injection.shape[0] >k): + all = pd.concat([all,injection.iloc[:k,:]],axis=0) + injection.drop(list(range(k)),axis=0,inplace=True) + injection = injection.reset_index(drop=True) + i+=k + else: + all = pd.concat([all,injection],axis=0) + i+=injection.shape[0] + injection = pd.DataFrame() + +all.to_csv('ton_iot/attacks.csv',index=False) \ No newline at end of file diff --git a/OTHER/snapshot_generation/group_normal_attack.py b/OTHER/snapshot_generation/group_normal_attack.py new file mode 100644 index 0000000000000000000000000000000000000000..5296279912c0ebc9726140688083069054e82a86 --- /dev/null +++ b/OTHER/snapshot_generation/group_normal_attack.py @@ -0,0 +1,64 @@ +# NOTE: same logic with group_attack.py file it randomly select raws from normal or attacks while keeping the order inside each class + + +import pandas as pd +import os +import numpy as np + + + +# file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Ghypeddings','datasets','examples','CICDDoS2019','original','all.csv') +file_normal = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Ghypeddings','datasets','examples','CICDDoS2019','original','normal.csv') +file_attack = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Ghypeddings','datasets','examples','CICDDoS2019','original','attack.csv') +for_snapshotting = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Ghypeddings','datasets','examples','CICDDoS2019','original','for_snapshotting.csv') +# df = pd.read_csv(file,low_memory=False) +# print('>> shape:',df.shape) +# df.dropna(axis=0, inplace=True) +# normal = df[df[' Label'] == 'BENIGN'] +# print('>> normal shape:',normal.shape) +# normal.to_csv(file_normal,index=False) +# attack = df[df[' Label'] != 'BENIGN'] +# print('>> attack shape:',attack.shape) + +# smallest = min(attack.shape[0],normal.shape[0]) + +# if normal.shape[0] <= attack.shape[0]: +# normal = normal.sort_values(by=' Timestamp') +# normal.to_csv(file_normal) +# attack = attack.sample(n=normal.shape[0]).reset_index(drop=True) +# attack = attack.sort_values(by=' Timestamp') +# attack.to_csv(file_attack) +# else: +# attack = attack.sort_values(by=' Timestamp') +# 
attack.to_csv(file_attack) +# normal = normal.sample(n=attack.shape[0]).reset_index(drop=True) +# normal = normal.sort_values(by=' Timestamp') +# normal.to_csv(file_normal) + +df = pd.DataFrame() +normal = pd.read_csv(file_normal,low_memory=False) +attack = pd.read_csv(file_attack,low_memory=False) +attack[' Label'] = 1 +normal[' Label'] = 0 +nnodes = normal.shape[0] +i,j=0,0 +stop =False +while not stop: +    if i < nnodes: +        k = np.random.randint(1,20) +        if i+k > nnodes: +            k = nnodes - i +        print('Normal: [{},{}]'.format(i,i+k)) +        df = pd.concat([df,normal.iloc[i:i+k,:]],ignore_index=True) +        i+=k +    if j < nnodes: +        k = np.random.randint(1,20) +        if j+k > nnodes: +            k = nnodes - j +        print('Attack: [{},{}]'.format(j,j+k)) +        df = pd.concat([df,attack.iloc[j:j+k,:]],ignore_index=True) +        j+=k +    if i == j == nnodes: +        stop = True + +df.to_csv(for_snapshotting,index=False) diff --git a/OTHER/snapshot_generation/ton_iot_snapshots_generation.py b/OTHER/snapshot_generation/ton_iot_snapshots_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..2e696e0da2adf40ab7a89f9a0cff057e9bc52665 --- /dev/null +++ b/OTHER/snapshot_generation/ton_iot_snapshots_generation.py @@ -0,0 +1,70 @@ +# NOTE: This file generates the snapshots from the ton_iot dataset. It does everything, from cleaning the data to splitting it into snapshots. + + + +import os +import pandas as pd +import numpy as np +from sklearn.preprocessing import MinMaxScaler +import pickle + + +def _filling_adjacency_numpy(data): +    N = data.shape[0] +    try: +        adjacency = np.zeros((N,N), dtype=bool) +    except Exception as e: +        print(f"An error occurred: {e}") + +    source_ips = data['Src IP'].to_numpy() +    destination_ips = data['Dst IP'].to_numpy() +    mask = ((source_ips[:, np.newaxis] == source_ips) | (source_ips[:, np.newaxis] == destination_ips) | (destination_ips[:, np.newaxis] == source_ips)| (destination_ips[:, np.newaxis] == destination_ips) ) +    adjacency[mask] = True +    return adjacency + +def save_samples(adj,features,labels,adj_path,features_path,labels_path): +    with open(adj_path,'wb') as f: +        pickle.dump(adj,f) +    with open(features_path,'wb') as f: +        pickle.dump(features,f) +    with open(labels_path,'wb') as f: +        pickle.dump(labels,f) + +nnodes = 1000 +overlap = 0.25 + +directory = os.path.join(os.getcwd(),f'ton_iot_snapshots_{int(overlap*100)}_{nnodes}') +os.makedirs(directory) + +df = pd.read_csv('other/ton_iot/all.csv') +df.dropna(axis=0,inplace=True) +df = df.reset_index(drop=True) +columns_to_exclude = ['Src IP','Dst IP','Label'] +columns_to_normalize = [x for x in df.columns if x not in columns_to_exclude] +scaler = MinMaxScaler() +df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize]) + +i=0 +j=0 +while i< df.shape[0]: +    print('snapshot:',j) +    if df.shape[0] > nnodes: +        data = df.iloc[:nnodes,:].copy() +        adj = _filling_adjacency_numpy(data) +        labels = data['Label'].to_numpy() +        data.drop(columns_to_exclude, axis=1, inplace=True) +        features = data.to_numpy() +        save_samples(adj,features,labels,os.path.join(directory,f'adjacency_{j}.pkl'),os.path.join(directory,f'features_{j}.pkl'),os.path.join(directory,f'labels_{j}.pkl')) +        j+=1 +        i+=nnodes +        df.drop(range(nnodes - int(nnodes*overlap)),axis=0,inplace=True) +        df = df.reset_index(drop=True) +        print(np.sum(labels),len(labels)-np.sum(labels)) +    else: +        adj = _filling_adjacency_numpy(df) +        labels = df['Label'].to_numpy() +        df.drop(columns_to_exclude, axis=1, inplace=True) +        features = df.to_numpy() +
save_samples(adj,features,labels,os.path.join(directory,f'adjacency_{j}.pkl'),os.path.join(directory,f'features_{j}.pkl'),os.path.join(directory,f'labels_{j}.pkl')) + j+=1 + i+=df.shape[0]
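For clarity, the adjacency rule implemented by the _filling_adjacency_numpy helpers above connects two flow records whenever they share an IP address on either end (source or destination), which is also why the resulting matrix has self-loops on the diagonal. The snippet below is only an illustration on a hypothetical toy frame, not part of the diff above:

import numpy as np
import pandas as pd

# Toy flows: rows 0 and 1 share 10.0.0.1, rows 1 and 2 share 10.0.0.3, row 3 shares nothing.
toy = pd.DataFrame({'Src IP': ['10.0.0.1', '10.0.0.3', '10.0.0.3', '10.0.0.9'],
                    'Dst IP': ['10.0.0.2', '10.0.0.1', '10.0.0.4', '10.0.0.8']})
src = toy['Src IP'].to_numpy()
dst = toy['Dst IP'].to_numpy()
# Same pairwise comparisons as in the snapshot scripts: any shared endpoint creates an edge.
adjacency = ((src[:, np.newaxis] == src) | (src[:, np.newaxis] == dst) |
             (dst[:, np.newaxis] == src) | (dst[:, np.newaxis] == dst))
print(adjacency.astype(int))
# [[1 1 0 0]
#  [1 1 1 0]
#  [0 1 1 0]
#  [0 0 0 1]]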
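The three generation scripts only show the writing side, so here is a minimal sketch of reading one snapshot back; the directory name and snapshot index are illustrative and depend on what was generated. Note that with nnodes = 1000 and overlap = 0.25 the scripts drop 750 rows between iterations, so consecutive snapshots share 250 rows.

import os
import pickle

def load_snapshot(directory, j):
    # Mirrors save_samples: each snapshot is three pickle files.
    with open(os.path.join(directory, f'adjacency_{j}.pkl'), 'rb') as f:
        adj = pickle.load(f)
    with open(os.path.join(directory, f'features_{j}.pkl'), 'rb') as f:
        features = pickle.load(f)
    with open(os.path.join(directory, f'labels_{j}.pkl'), 'rb') as f:
        labels = pickle.load(f)
    return adj, features, labels

# Example call for the first ton_iot snapshot (directory name follows the scripts' naming scheme).
adj, features, labels = load_snapshot('ton_iot_snapshots_25_1000', 0)
print(adj.shape, features.shape, labels.shape)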