diff --git a/H2HGCN/manifolds/StiefelManifold.py b/H2HGCN/manifolds/StiefelManifold.py
index 42f141a9b76c4d8539b8fd6a6a0a14606f119184..f42b62f67b913d0bace7abe4c6352c311d4d85a3 100644
--- a/H2HGCN/manifolds/StiefelManifold.py
+++ b/H2HGCN/manifolds/StiefelManifold.py
@@ -2,7 +2,7 @@ import torch as th
 import torch.nn as nn
 import numpy as np
 from torch.autograd import Function, Variable
-from utils import *
+from Ghypeddings.clusterers.utils import *
 
 _eps = 1e-10
 
diff --git a/HGCAE/hgcae.py b/HGCAE/hgcae.py
index f7af9cb6b1cd04ba4584f37d605e9434689b4a96..e113fd08266c92747dc5ad9dfad6dbab6536a513 100644
--- a/HGCAE/hgcae.py
+++ b/HGCAE/hgcae.py
@@ -6,7 +6,7 @@ import os
 import time
 from Ghypeddings.HGCAE.utils.train_utils import get_dir_name, format_metrics
 from Ghypeddings.HGCAE.utils.data_utils import process_data
-from Ghypeddings.HGCAE.utils.train_utils import create_args , get_classifier
+from Ghypeddings.HGCAE.utils.train_utils import create_args , get_classifier, get_clustering_algorithm, get_anomaly_detection_algorithm
 import Ghypeddings.HGCAE.optimizers as optimizers
 from Ghypeddings.HGCAE.utils.data_utils import sparse_mx_to_torch_sparse_tensor
 
@@ -50,10 +50,11 @@ class HGCAE(object):
             classifier=None,
             clusterer = None,
             normalize_adj=False,
-            normalize_feats=True
+            normalize_feats=True,
+            anomaly_detector=None
             ):
 
-        self.args = create_args(dim,hidden_dim,c,num_layers,bias,act,grad_clip,optimizer,weight_decay,lr,gamma,lr_reduce_freq,cuda,epochs,min_epochs,patience,seed,log_freq,eval_freq,val_prop,test_prop,double_precision,dropout,lambda_rec,lambda_lp,num_dec_layers,use_att,att_type,att_logit,beta,classifier,clusterer,normalize_adj,normalize_feats)
+        self.args = create_args(dim,hidden_dim,c,num_layers,bias,act,grad_clip,optimizer,weight_decay,lr,gamma,lr_reduce_freq,cuda,epochs,min_epochs,patience,seed,log_freq,eval_freq,val_prop,test_prop,double_precision,dropout,lambda_rec,lambda_lp,num_dec_layers,use_att,att_type,att_logit,beta,classifier,clusterer,normalize_adj,normalize_feats,anomaly_detector)
         self.cls = None
 
         self.args.n_nodes = adj.shape[0]
@@ -183,8 +184,16 @@ class HGCAE(object):
         idx = np.unique(np.concatenate((train_idx,val_idx)))
         X = self.model.manifold.logmap0(self.best_emb[idx],self.model.encoder.curvatures[-1]).cpu().detach().numpy()
         y = self.data['labels'].reshape(-1,1)[idx]
-        self.cls = get_classifier(self.args, X,y)
-        acc,f1,recall,precision,roc_auc = calculate_metrics(self.cls,X,y)
+
+        if(self.args.classifier):
+            self.cls = get_classifier(self.args, X,y)
+            acc,f1,recall,precision,roc_auc = calculate_metrics(self.cls,X,y)
+        elif self.args.clusterer:
+            y = y.reshape(-1,)
+            acc,f1,recall,precision,roc_auc = get_clustering_algorithm(self.args.clusterer,X,y)[6:]
+        elif self.args.anomaly_detector:
+            y = y.reshape(-1,)
+            acc,f1,recall,precision,roc_auc = get_anomaly_detection_algorithm(self.args.anomaly_detector,X,y)[6:]
 
         return {'train':train_losses,'best':best_losses,'val':val_losses},acc,f1,recall,precision,roc_auc , time.time() - t_total
 
@@ -195,7 +204,14 @@ class HGCAE(object):
         val_metrics = self.model.compute_metrics(embeddings, self.data, 'test')
         data = self.model.manifold.logmap0(embeddings[test_idx],self.model.encoder.curvatures[-1]).cpu().detach().numpy()
         labels = self.data['labels'].reshape(-1,1)[test_idx]
-        acc,f1,recall,precision,roc_auc = calculate_metrics(self.cls,data,labels)
+        if self.args.classifier:
+            acc,f1,recall,precision,roc_auc = calculate_metrics(self.cls,data,labels)
+        elif self.args.clusterer:
+            labels = labels.reshape(-1,)
+            acc,f1,recall,precision,roc_auc = get_clustering_algorithm(self.args.clusterer,data,labels)[6:]
+        elif self.args.anomaly_detector:
+            labels = labels.reshape(-1,)
+            acc,f1,recall,precision,roc_auc = get_anomaly_detection_algorithm(self.args.anomaly_detector,data,labels)[6:]
 
         return val_metrics['loss'].item(),acc,f1,recall,precision,roc_auc
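Note: `get_clustering_algorithm` and `get_anomaly_detection_algorithm` return the 11-tuple produced by `calculate_metrics` (defined below in `clusterers/utils.py` and `anomaly_detection/utils.py`), so the `[6:]` slices above keep only the five classification-style metrics. A minimal sketch of that contract; the dummy labels are for illustration only:

```python
# The shared calculate_metrics() contract: an 11-tuple whose tail [6:]
# is (acc, f1, rec, pre, roc) -- exactly what the callers above unpack.
from sklearn.metrics import (adjusted_rand_score, normalized_mutual_info_score,
                             fowlkes_mallows_score, homogeneity_score,
                             completeness_score, v_measure_score,
                             accuracy_score, f1_score, recall_score,
                             precision_score, roc_auc_score)

def calculate_metrics(y_true, y_pred):
    return (adjusted_rand_score(y_true, y_pred),        # index 0
            normalized_mutual_info_score(y_true, y_pred),
            fowlkes_mallows_score(y_true, y_pred),
            homogeneity_score(y_true, y_pred),
            completeness_score(y_true, y_pred),
            v_measure_score(y_true, y_pred),            # index 5
            accuracy_score(y_true, y_pred),             # index 6: acc
            f1_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            roc_auc_score(y_true, y_pred))              # index 10: roc

# [6:] drops the six clustering metrics, leaving the five classification ones.
acc, f1, rec, pre, roc = calculate_metrics([0, 1, 1, 0], [0, 1, 0, 0])[6:]
```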
diff --git a/HGCAE/utils/train_utils.py b/HGCAE/utils/train_utils.py
index 674b8b1661236d1525feae2574b9c2de7ae48419..41c1eaa1c0b36f53a74856ecc12447bb42364bd2 100644
--- a/HGCAE/utils/train_utils.py
+++ b/HGCAE/utils/train_utils.py
@@ -166,6 +166,7 @@ def create_args(*args):
     parser.add_argument('--clusterer', type=str, default=args[31])
     parser.add_argument('--normalize_adj', type=bool, default=args[32])
     parser.add_argument('--normalize_feats', type=bool, default=args[33])
+    parser.add_argument('--anomaly_detector', type=str, default=args[34])
     flags, unknown = parser.parse_known_args()
     return flags
 
@@ -173,27 +174,46 @@ def create_args(*args):
 
 from Ghypeddings.classifiers import *
 def get_classifier(args,X,y):
-    if(args.classifier and args.clusterer):
-        print('You have to chose one of them!')
-        sys.exit(1)
-    elif(args.classifier):
-        if(args.classifier == 'svm'):
-            return SVM(X,y)
-        elif(args.classifier == 'mlp'):
-            return mlp(X,y,1,10,seed=args.seed)
-        elif(args.classifier == 'decision tree'):
-            return decision_tree(X,y)
-        elif(args.classifier == 'random forest'):
-            return random_forest(X,y,args.seed)
-        elif(args.classifier == 'adaboost'):
-            return adaboost(X,y,args.seed)
-        elif(args.classifier == 'knn'):
-            return KNN(X,y)
-        elif(args.classifier == 'naive bayes'):
-            return naive_bayes(X,y)
-        else:
-            raise NotImplementedError
-    elif(args.clusterer):
-        pass
+    if(args.classifier == 'svm'):
+        return SVM(X,y)
+    elif(args.classifier == 'mlp'):
+        return mlp(X,y,1,10,seed=args.seed)
+    elif(args.classifier == 'decision tree'):
+        return decision_tree(X,y)
+    elif(args.classifier == 'random forest'):
+        return random_forest(X,y,args.seed)
+    elif(args.classifier == 'adaboost'):
+        return adaboost(X,y,args.seed)
+    elif(args.classifier == 'knn'):
+        return KNN(X,y)
+    elif(args.classifier == 'naive bayes'):
+        return naive_bayes(X,y)
+    else:
+        raise NotImplementedError
+
+from Ghypeddings.clusterers import *
+def get_clustering_algorithm(clusterer,X,y):
+    if(clusterer == 'agglomerative_clustering'):
+        return agglomerative_clustering(X,y)
+    elif(clusterer == 'dbscan'):
+        return dbscan(X,y)
+    elif(clusterer == 'fuzzy_c_mean'):
+        return fuzzy_c_mean(X,y)
+    elif(clusterer == 'gaussian_mixture'):
+        return gaussian_mixture(X,y)
+    elif(clusterer == 'kmeans'):
+        return kmeans(X,y)
+    elif(clusterer == 'mean_shift'):
+        return mean_shift(X,y)
+    else:
+        raise NotImplementedError
+
+
+from Ghypeddings.anomaly_detection import *
+def get_anomaly_detection_algorithm(algorithm,X,y):
+    if(algorithm == 'isolation_forest'):
+        return isolation_forest(X,y)
+    elif(algorithm == 'one_class_svm'):
+        return one_class_svm(X,y)
     else:
-        return 99,99,99,99,99
\ No newline at end of file
+        raise NotImplementedError
\ No newline at end of file
diff --git a/PVAE/pvae.py b/PVAE/pvae.py
index 29dfa1df459601b8c41a0905a9f46f7c4b59b416..86b89f05e9b0cbab91f746f1a1a7a6cfe946c482 100644
--- a/PVAE/pvae.py
+++ b/PVAE/pvae.py
@@ -10,7 +10,7 @@ import numpy as np
 import logging
 import time
 
-from Ghypeddings.PVAE.utils import probe_infnan , process_data , create_args , get_classifier
+from Ghypeddings.PVAE.utils import probe_infnan , process_data , create_args , get_classifier, get_clustering_algorithm, get_anomaly_detection_algorithm
 import Ghypeddings.PVAE.objectives as objectives
 from Ghypeddings.PVAE.models import Tabular
 
@@ -55,10 +55,11 @@ class PVAE:
             clusterer=None,
             log_freq=0,
             normalize_adj=False,
-            normalize_feats=True
+            normalize_feats=True,
+            anomaly_detector=None
             ):
 
-        self.args = create_args(dim,hidden_dim,num_layers,c,act,lr,cuda,epochs,seed,eval_freq,val_prop,test_prop,dropout,beta1,beta2,K,beta,analytical_kl,posterior,prior,prior_iso,prior_std,learn_prior_std,enc,dec,bias,alpha,classifier,clusterer,log_freq,normalize_adj,normalize_feats)
+        self.args = create_args(dim,hidden_dim,num_layers,c,act,lr,cuda,epochs,seed,eval_freq,val_prop,test_prop,dropout,beta1,beta2,K,beta,analytical_kl,posterior,prior,prior_iso,prior_std,learn_prior_std,enc,dec,bias,alpha,classifier,clusterer,log_freq,normalize_adj,normalize_feats,anomaly_detector)
         self.args.n_classes = len(np.unique(labels))
         self.args.feat_dim = features.shape[1]
         self.data = process_data(self.args,adj,features,labels)
@@ -168,8 +169,16 @@ class PVAE:
         idx = np.unique(np.concatenate((train_idx,val_idx)))
         X = self.model.manifold.logmap0(self.tb_embeddings[idx]).cpu().detach().numpy()
         y = self.data['labels'].cpu().reshape(-1,1)[idx]
-        self.cls = get_classifier(self.args, X,y)
-        acc,f1,recall,precision,roc_auc = calculate_metrics(self.cls,X,y)
+
+        if(self.args.classifier):
+            self.cls = get_classifier(self.args, X,y)
+            acc,f1,recall,precision,roc_auc = calculate_metrics(self.cls,X,y)
+        elif self.args.clusterer:
+            y = y.reshape(-1,)
+            acc,f1,recall,precision,roc_auc = get_clustering_algorithm(self.args.clusterer,X,y)[6:]
+        elif self.args.anomaly_detector:
+            y = y.reshape(-1,)
+            acc,f1,recall,precision,roc_auc = get_anomaly_detection_algorithm(self.args.anomaly_detector,X,y)[6:]
 
         return {'train':train_losses,'best':best_losses,'val':val_losses},acc,f1,recall,precision,roc_auc,time.time() - t_total
 
@@ -181,7 +190,15 @@ class PVAE:
         test_idx = self.data['idx_test']
         data = self.model.manifold.logmap0(embeddings[0][test_idx]).cpu().detach().numpy()
         labels = self.data['labels'].reshape(-1,1).cpu()[test_idx]
-        acc,f1,recall,precision,roc_auc = calculate_metrics(self.cls,data,labels)
+        if self.args.classifier:
+            acc,f1,recall,precision,roc_auc = calculate_metrics(self.cls,data,labels)
+        elif self.args.clusterer:
+            labels = labels.reshape(-1,)
+            acc,f1,recall,precision,roc_auc = get_clustering_algorithm(self.args.clusterer,data,labels)[6:]
+        elif self.args.anomaly_detector:
+            labels = labels.reshape(-1,)
+            acc,f1,recall,precision,roc_auc = get_anomaly_detection_algorithm(self.args.anomaly_detector,data,labels)[6:]
+
         return abs(tt_loss) , acc, f1 , recall,precision,roc_auc
 
diff --git a/PVAE/utils.py b/PVAE/utils.py
index 744f53e721afbd841224fc3db53b041a0dd10b1d..36d85574d9584cf5b56c9aa160acce357be1ecfe 100644
--- a/PVAE/utils.py
+++ b/PVAE/utils.py
@@ -280,6 +280,7 @@ def create_args(*args):
     parser.add_argument('--log_freq', type=int, default=args[29])
     parser.add_argument('--normalize_adj', type=bool, default=args[30])
    parser.add_argument('--normalize_feats', type=bool, default=args[31])
+    parser.add_argument('--anomaly_detector', type=str, default=args[32])
     flags, unknown = parser.parse_known_args()
     return flags
 
@@ -301,10 +302,7 @@ def get_activation(args):
 
 from Ghypeddings.classifiers import *
 def get_classifier(args,X,y):
-    if(args.classifier and args.clusterer):
-        print('You have to chose one of them!')
-        sys.exit(1)
-    elif(args.classifier):
+    if(args.classifier):
         if(args.classifier == 'svm'):
             return SVM(X,y)
         elif(args.classifier == 'mlp'):
@@ -321,7 +319,30 @@ def get_classifier(args,X,y):
             return naive_bayes(X,y)
         else:
             raise NotImplementedError
-    elif(args.clusterer):
-        pass
+
+
+from Ghypeddings.clusterers import *
+def get_clustering_algorithm(clusterer,X,y):
+    if(clusterer == 'agglomerative_clustering'):
+        return agglomerative_clustering(X,y)
+    elif(clusterer == 'dbscan'):
+        return dbscan(X,y)
+    elif(clusterer == 'fuzzy_c_mean'):
+        return fuzzy_c_mean(X,y)
+    elif(clusterer == 'gaussian_mixture'):
+        return gaussian_mixture(X,y)
+    elif(clusterer == 'kmeans'):
+        return kmeans(X,y)
+    elif(clusterer == 'mean_shift'):
+        return mean_shift(X,y)
+    else:
+        raise NotImplementedError
+
+from Ghypeddings.anomaly_detection import *
+def get_anomaly_detection_algorithm(algorithm,X,y):
+    if(algorithm == 'isolation_forest'):
+        return isolation_forest(X,y)
+    elif(algorithm == 'one_class_svm'):
+        return one_class_svm(X,y)
     else:
-        return 99,99,99,99,99
\ No newline at end of file
+        raise NotImplementedError
\ No newline at end of file
diff --git a/anomaly_detection/__init__.py b/anomaly_detection/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fc56d706bb25fdf9eda05d19bfd06260f617f21
--- /dev/null
+++ b/anomaly_detection/__init__.py
@@ -0,0 +1,2 @@
+from Ghypeddings.anomaly_detection.isolation_forest import isolation_forest
+from Ghypeddings.anomaly_detection.one_class_svm import one_class_svm
\ No newline at end of file
diff --git a/anomaly_detection/isolation_forest.py b/anomaly_detection/isolation_forest.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8c81c7809b0cadccb1c86fe6adda2be5cdfa95c
--- /dev/null
+++ b/anomaly_detection/isolation_forest.py
@@ -0,0 +1,11 @@
+from Ghypeddings.anomaly_detection.utils import calculate_metrics
+
+
+from sklearn.ensemble import IsolationForest
+
+def isolation_forest(X,y,anomalies_percentage = 0.5):
+    model = IsolationForest(contamination=anomalies_percentage)
+    model.fit(X)
+    y_pred = model.predict(X)
+    y_pred[y_pred == -1]=0
+    return calculate_metrics(y,y_pred)
\ No newline at end of file
diff --git a/anomaly_detection/one_class_svm.py b/anomaly_detection/one_class_svm.py
new file mode 100644
index 0000000000000000000000000000000000000000..c383e8d0e2fa979a6fd8b1aff173ba75b62de572
--- /dev/null
+++ b/anomaly_detection/one_class_svm.py
@@ -0,0 +1,11 @@
+from Ghypeddings.anomaly_detection.utils import calculate_metrics
+
+
+from sklearn.svm import OneClassSVM
+
+def one_class_svm(X,y, kernel='rbf',nu=0.1):
+    model = OneClassSVM(kernel=kernel, nu=nu)
+    model.fit(X)
+    y_pred = model.predict(X)
+    y_pred[y_pred == -1]=0
+    return calculate_metrics(y,y_pred)
\ No newline at end of file
diff --git a/anomaly_detection/utils.py b/anomaly_detection/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfb39f3c0065e4a7cb253e3d3b3af23da024a88d
--- /dev/null
+++ b/anomaly_detection/utils.py
@@ -0,0 +1,22 @@
+## external evaluation metrics
+from sklearn.metrics import adjusted_rand_score
+from sklearn.metrics import normalized_mutual_info_score
+from sklearn.metrics import fowlkes_mallows_score
+## additional evaluation metrics
+from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
+## classification metrics
+from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
+
+def calculate_metrics(y_true,y_pred):
+    ari = adjusted_rand_score(y_true, y_pred)
+    nmi = normalized_mutual_info_score(y_true, y_pred)
+    fmi = fowlkes_mallows_score(y_true, y_pred)
+    homogeneity = homogeneity_score(y_true, y_pred)
+    completeness = completeness_score(y_true, y_pred)
+    v_measure = v_measure_score(y_true, y_pred)
+    acc = accuracy_score(y_true,y_pred)
+    f1 = f1_score(y_true,y_pred)
+    rec = recall_score(y_true,y_pred)
+    pre = precision_score(y_true,y_pred)
+    roc = roc_auc_score(y_true,y_pred)
+    return ari,nmi,fmi,homogeneity,completeness,v_measure,acc,f1,rec,pre,roc
\ No newline at end of file
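Note: `isolation_forest` fits on all of `X` and remaps scikit-learn's `-1` outlier flag to `0`, so ground-truth labels are expected as `1` = normal, `0` = anomaly. A minimal sketch on synthetic data; the blobs, the seed, and `anomalies_percentage=0.05` are illustrative assumptions, not values from the patch:

```python
# Sketch: exercising the new anomaly_detection helpers on synthetic blobs.
import numpy as np
from Ghypeddings.anomaly_detection import isolation_forest

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (950, 8)),   # normal points, label 1
               rng.normal(6, 1, (50, 8))])   # anomalies, label 0
y = np.concatenate([np.ones(950), np.zeros(50)])

# The helper returns the 11-tuple from calculate_metrics; [6:] keeps
# (acc, f1, rec, pre, roc), mirroring how the trainers above consume it.
acc, f1, rec, pre, roc = isolation_forest(X, y, anomalies_percentage=0.05)[6:]
print(acc, f1, rec, pre, roc)
```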
diff --git a/clusterers/__init__.py b/clusterers/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..5bb80fb0047624b6c0c404d8060daca7352dcaad 100644
--- a/clusterers/__init__.py
+++ b/clusterers/__init__.py
@@ -0,0 +1,6 @@
+from Ghypeddings.clusterers.ahc import agglomerative_clustering
+from Ghypeddings.clusterers.dbscan import dbscan
+from Ghypeddings.clusterers.fuzzy_c_mean import fuzzy_c_mean
+from Ghypeddings.clusterers.gaussian_mixture import gaussian_mixture
+from Ghypeddings.clusterers.kmeans import kmeans
+from Ghypeddings.clusterers.mean_shift import mean_shift
\ No newline at end of file
diff --git a/clusterers/ahc.py b/clusterers/ahc.py
new file mode 100644
index 0000000000000000000000000000000000000000..aee3bfd5933a498cb1ba8a3decbe687dd1aea4df
--- /dev/null
+++ b/clusterers/ahc.py
@@ -0,0 +1,7 @@
+from sklearn.cluster import AgglomerativeClustering
+from Ghypeddings.clusterers.utils import calculate_metrics
+
+def agglomerative_clustering(X,y,n_clusters =2, linkage = 'ward'):
+    model = AgglomerativeClustering(n_clusters=n_clusters,linkage=linkage)
+    labels = model.fit_predict(X)
+    return calculate_metrics(y,labels)
\ No newline at end of file
diff --git a/clusterers/dbscan.py b/clusterers/dbscan.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b17e5594b7b28d447f932ffc395d540405890ce
--- /dev/null
+++ b/clusterers/dbscan.py
@@ -0,0 +1,13 @@
+from Ghypeddings.clusterers.utils import calculate_metrics
+from sklearn.cluster import DBSCAN
+
+def dbscan(X,y,eps=1e-4,min_samples=300):
+    model = DBSCAN(eps=eps, min_samples=min_samples)
+    y_pred = model.fit_predict(X)
+    mask = y_pred != -1
+    y_true_filtered = y[mask]
+    y_pred_filtered = y_pred[mask]
+    y_pred_filtered[y_pred_filtered>0] = -1
+    y_pred_filtered[y_pred_filtered == 0] = 1
+    y_pred_filtered[y_pred_filtered == -1]=0
+    return calculate_metrics(y_true_filtered,y_pred_filtered)
\ No newline at end of file
diff --git a/clusterers/fuzzy_c_mean.py b/clusterers/fuzzy_c_mean.py
new file mode 100644
index 0000000000000000000000000000000000000000..af934eead709269782adb39722cf6924de0bc768
--- /dev/null
+++ b/clusterers/fuzzy_c_mean.py
@@ -0,0 +1,9 @@
+from Ghypeddings.clusterers.utils import calculate_metrics
+import skfuzzy as fuzz
+import numpy as np
+
+def fuzzy_c_mean(X,y,n_clusters=5,power=2,error=0.005,maxiter=1000,init=None):
+    X_transposed = np.transpose(X)
+    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(X_transposed, n_clusters, power, error=error, maxiter=maxiter, init=init)
+    y_pred = np.argmax(u, axis=0)
+    return calculate_metrics(y,y_pred)
\ No newline at end of file
diff --git a/clusterers/gaussian_mixture.py b/clusterers/gaussian_mixture.py
new file mode 100644
index 0000000000000000000000000000000000000000..3405e019bb8b4ab64ca406bb62fb3de7c5e83e40
--- /dev/null
+++ b/clusterers/gaussian_mixture.py
@@ -0,0 +1,7 @@
+from sklearn.mixture import GaussianMixture
+from Ghypeddings.clusterers.utils import calculate_metrics
+
+def gaussian_mixture(X,y,n_components=2):
+    model = GaussianMixture(n_components=n_components)
+    y_pred = model.fit_predict(X)
+    return calculate_metrics(y,y_pred)
\ No newline at end of file
diff --git a/clusterers/kmeans.py b/clusterers/kmeans.py
new file mode 100644
index 0000000000000000000000000000000000000000..59605e7ef034d6ca970e073582ee11f6b5aebbef
--- /dev/null
+++ b/clusterers/kmeans.py
@@ -0,0 +1,11 @@
+from Ghypeddings.clusterers.utils import calculate_metrics
+
+from sklearn.cluster import KMeans
+
+
+def kmeans(X,y,n_clusters=5,n_init=10):
+    model = KMeans(n_clusters=n_clusters,n_init=n_init)
+    model.fit(X)
+    y_pred = model.labels_
+    y_pred[y_pred!=1]=0
+    return calculate_metrics(y,y_pred)
\ No newline at end of file
diff --git a/clusterers/mean_shift.py b/clusterers/mean_shift.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba987548bf18300b323217b0f3e06df97188c92c
--- /dev/null
+++ b/clusterers/mean_shift.py
@@ -0,0 +1,10 @@
+from Ghypeddings.clusterers.utils import calculate_metrics
+
+from sklearn.cluster import MeanShift
+
+def mean_shift(X,y):
+    y_pred = MeanShift().fit_predict(X)
+    y_pred[y_pred>0] = -1
+    y_pred[y_pred == 0] = 1
+    y_pred[y_pred == -1]=0
+    return calculate_metrics(y,y_pred)
\ No newline at end of file
diff --git a/clusterers/utils.py b/clusterers/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfb39f3c0065e4a7cb253e3d3b3af23da024a88d
--- /dev/null
+++ b/clusterers/utils.py
@@ -0,0 +1,22 @@
+## external evaluation metrics
+from sklearn.metrics import adjusted_rand_score
+from sklearn.metrics import normalized_mutual_info_score
+from sklearn.metrics import fowlkes_mallows_score
+## additional evaluation metrics
+from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
+## classification metrics
+from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
+
+def calculate_metrics(y_true,y_pred):
+    ari = adjusted_rand_score(y_true, y_pred)
+    nmi = normalized_mutual_info_score(y_true, y_pred)
+    fmi = fowlkes_mallows_score(y_true, y_pred)
+    homogeneity = homogeneity_score(y_true, y_pred)
+    completeness = completeness_score(y_true, y_pred)
+    v_measure = v_measure_score(y_true, y_pred)
+    acc = accuracy_score(y_true,y_pred)
+    f1 = f1_score(y_true,y_pred)
+    rec = recall_score(y_true,y_pred)
+    pre = precision_score(y_true,y_pred)
+    roc = roc_auc_score(y_true,y_pred)
+    return ari,nmi,fmi,homogeneity,completeness,v_measure,acc,f1,rec,pre,roc
\ No newline at end of file
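Note: `mean_shift` (and the filtered tail of `dbscan`) relabel clusters into a binary scheme where cluster 0 is taken as the positive class. The detour through `-1` is what keeps the second assignment from clobbering the first; a standalone sketch with made-up cluster ids:

```python
# Sketch of the three-step relabeling used in mean_shift()/dbscan() above.
import numpy as np

y_pred = np.array([0, 2, 1, 0, 3, 0])  # raw cluster ids from fit_predict
y_pred[y_pred > 0] = -1    # park every non-zero cluster at a temporary -1
y_pred[y_pred == 0] = 1    # cluster 0 -> positive class (1)
y_pred[y_pred == -1] = 0   # all remaining clusters -> negative class (0)
print(y_pred)              # [1 0 0 1 0 1]
```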
diff --git a/datasets/datasets.py b/datasets/datasets.py
index 97c72e89efdca8e20b5d0f25696146b88096ced5..857e3a9ce7152c9156c92f0e998eed315b2d9051 100644
--- a/datasets/datasets.py
+++ b/datasets/datasets.py
@@ -21,27 +21,20 @@ class Dataset:
     def _get_files(self):
         return [os.path.join(self.directory,file) for file in os.listdir(self.directory) if os.path.isfile(os.path.join(self.directory, file)) and '.gitignore' not in file]
 
-    def save_samples(self,adj,features,labels,dim):
-        features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'features_{dim}.pkl')
-        adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'adjacency_{dim}.pkl')
-        labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'labels_{dim}.pkl')
-
-        with open(adj_path,'wb') as f:
+    def save_samples(self,adj,features,labels):
+        with open(self.adj_path,'wb') as f:
             pickle.dump(adj,f)
-        with open(features_path,'wb') as f:
+        with open(self.features_path,'wb') as f:
             pickle.dump(features,f)
-        with open(labels_path,'wb') as f:
+        with open(self.labels_path,'wb') as f:
             pickle.dump(labels,f)
 
-    def load_samples(self,dim):
-        features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','features_{}.pkl'.format(dim))
-        adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','adjacency_{}.pkl'.format(dim))
-        labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','labels_{}.pkl'.format(dim))
-        with open(adj_path,'rb') as f:
+    def load_samples(self):
+        with open(self.adj_path,'rb') as f:
             adj = pickle.load(f)
-        with open(features_path,'rb') as f:
+        with open(self.features_path,'rb') as f:
             features = pickle.load(f)
-        with open(labels_path,'rb') as f:
+        with open(self.labels_path,'rb') as f:
             labels = pickle.load(f)
         print('features:',features.shape,'adj',adj.shape,'labels',labels.shape)
         return adj,features,labels
 
@@ -49,10 +42,13 @@ class Dataset:
 class CIC_DDoS2019(Dataset):
     def __init__(self):
         super().__init__(
+            features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','features.pkl'),
+            adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','adjacency.pkl'),
+            labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','labels.pkl'),
             directory=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','original')
         )
 
-    def build(self,n_nodes,n_classes=2,dim=20):
+    def build(self,n_nodes,n_classes=2):
         df = self._create_file_bc(n_nodes,n_classes)
         for column in df.columns:
             max_value = df.loc[df[column] != np.inf, column].max()
@@ -66,7 +62,7 @@ class CIC_DDoS2019(Dataset):
         features = df.to_numpy()
         scaler = MinMaxScaler()
         features = scaler.fit_transform(features)
-        self.save_samples(adj,features,labels,dim)
+        self.save_samples(adj,features,labels)
         return adj, features, labels
 
     def _load_file(self,path,max_per_class,list_classes=[]):
diff --git a/datasets/examples/Darknet/adjacency.pkl b/datasets/examples/Darknet/adjacency.pkl
index fa1f60a41f7d37add126fef2cb685febdd6875e8..17e3b4d1cb240f0f5d6e5a7cc0eaf0236d43c0da 100644
Binary files a/datasets/examples/Darknet/adjacency.pkl and b/datasets/examples/Darknet/adjacency.pkl differ
diff --git a/datasets/examples/Darknet/features.pkl b/datasets/examples/Darknet/features.pkl
index 4ea23f7bf5b38012ff1fb3345c65dca8ad61d382..e9b20fe5f12fed056ee107dbeb727de4fd38045a 100644
Binary files a/datasets/examples/Darknet/features.pkl and b/datasets/examples/Darknet/features.pkl differ
diff --git a/datasets/examples/Darknet/labels.pkl b/datasets/examples/Darknet/labels.pkl
index e08f72b50401b3eef8f68fbc76c609d3df6d03c4..66930070c334ac355b0d98a994ca4780fac9bfad 100644
Binary files a/datasets/examples/Darknet/labels.pkl and b/datasets/examples/Darknet/labels.pkl differ
diff --git a/datasets/test_dataset.py b/datasets/test_dataset.py
index 857e3a9ce7152c9156c92f0e998eed315b2d9051..97c72e89efdca8e20b5d0f25696146b88096ced5 100644
--- a/datasets/test_dataset.py
+++ b/datasets/test_dataset.py
@@ -21,20 +21,27 @@ class Dataset:
     def _get_files(self):
         return [os.path.join(self.directory,file) for file in os.listdir(self.directory) if os.path.isfile(os.path.join(self.directory, file)) and '.gitignore' not in file]
 
-    def save_samples(self,adj,features,labels):
-        with open(self.adj_path,'wb') as f:
+    def save_samples(self,adj,features,labels,dim):
+        features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'features_{dim}.pkl')
+        adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'adjacency_{dim}.pkl')
+        labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'labels_{dim}.pkl')
+
+        with open(adj_path,'wb') as f:
             pickle.dump(adj,f)
-        with open(self.features_path,'wb') as f:
+        with open(features_path,'wb') as f:
             pickle.dump(features,f)
-        with open(self.labels_path,'wb') as f:
+        with open(labels_path,'wb') as f:
             pickle.dump(labels,f)
 
-    def load_samples(self):
-        with open(self.adj_path,'rb') as f:
+    def load_samples(self,dim):
+        features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','features_{}.pkl'.format(dim))
+        adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','adjacency_{}.pkl'.format(dim))
+        labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','labels_{}.pkl'.format(dim))
+        with open(adj_path,'rb') as f:
             adj = pickle.load(f)
-        with open(self.features_path,'rb') as f:
+        with open(features_path,'rb') as f:
             features = pickle.load(f)
-        with open(self.labels_path,'rb') as f:
+        with open(labels_path,'rb') as f:
             labels = pickle.load(f)
         print('features:',features.shape,'adj',adj.shape,'labels',labels.shape)
         return adj,features,labels
 
@@ -42,13 +49,10 @@ class Dataset:
 class CIC_DDoS2019(Dataset):
     def __init__(self):
         super().__init__(
-            features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','features.pkl'),
-            adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','adjacency.pkl'),
-            labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','labels.pkl'),
             directory=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','original')
         )
 
-    def build(self,n_nodes,n_classes=2):
+    def build(self,n_nodes,n_classes=2,dim=20):
         df = self._create_file_bc(n_nodes,n_classes)
         for column in df.columns:
             max_value = df.loc[df[column] != np.inf, column].max()
@@ -62,7 +66,7 @@ class CIC_DDoS2019(Dataset):
         features = df.to_numpy()
         scaler = MinMaxScaler()
         features = scaler.fit_transform(features)
-        self.save_samples(adj,features,labels)
+        self.save_samples(adj,features,labels,dim)
         return adj, features, labels
 
     def _load_file(self,path,max_per_class,list_classes=[]):
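Note: after this patch, `datasets/datasets.py` reads the sample `.pkl` paths from attributes set in the constructor, while `datasets/test_dataset.py` keeps the older `dim`-suffixed variant. A sketch of the intended round-trip for the constructor-path variant; the import path and `n_nodes=1000` are assumptions for illustration, not taken from the patch:

```python
# Sketch: build once (which also pickles adj/features/labels to the fixed
# paths set in CIC_DDoS2019.__init__), then reload without rebuilding.
from Ghypeddings.datasets.datasets import CIC_DDoS2019  # assumed package path

ds = CIC_DDoS2019()
adj, features, labels = ds.build(n_nodes=1000, n_classes=2)  # calls save_samples()
adj, features, labels = ds.load_samples()                    # reads the same pickles back
```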