diff --git a/H2HGCN/h2hgcn.py b/H2HGCN/h2hgcn.py index ce5b879408717446eb8481839d7af23a09136650..4d92fa94092b6f9e1ba716be17264f30d4765d7e 100644 --- a/H2HGCN/h2hgcn.py +++ b/H2HGCN/h2hgcn.py @@ -45,8 +45,8 @@ class H2HGCN: seed=42, log_freq=1, eval_freq=1, - val_prop=.2, - test_prop=0.3, + val_prop=0.15, + test_prop=0.15, double_precision=0, dropout=0.1, normalize_adj=False, diff --git a/HGCAE/hgcae.py b/HGCAE/hgcae.py index e113fd08266c92747dc5ad9dfad6dbab6536a513..614885ff479917f83335decb0a7ceb0a90e54a14 100644 --- a/HGCAE/hgcae.py +++ b/HGCAE/hgcae.py @@ -6,7 +6,7 @@ import os import time from Ghypeddings.HGCAE.utils.train_utils import get_dir_name, format_metrics from Ghypeddings.HGCAE.utils.data_utils import process_data -from Ghypeddings.HGCAE.utils.train_utils import create_args , get_classifier,get_clustering_algorithm,get_anomaly_detection_algorithm +from Ghypeddings.HGCAE.utils.train_utils import create_args , get_classifier ,get_clustering_algorithm,get_anomaly_detection_algorithm import Ghypeddings.HGCAE.optimizers as optimizers from Ghypeddings.HGCAE.utils.data_utils import sparse_mx_to_torch_sparse_tensor @@ -26,7 +26,7 @@ class HGCAE(object): grad_clip=None, optimizer='RiemannianAdam', weight_decay=0.01, - lr=0.01, + lr=0.001, gamma=0.5, lr_reduce_freq=500, cuda=0, @@ -34,9 +34,9 @@ class HGCAE(object): min_epochs=50, patience=None, seed=42, - log_freq=0, + log_freq=1, eval_freq=1, - val_prop=.2, + val_prop=0.0002, test_prop=0.3, double_precision=0, dropout=0.1, @@ -134,6 +134,7 @@ class HGCAE(object): self.optimizer.zero_grad() embeddings = self.model.encode(self.data['features'], self.adj_train_enc) train_metrics = self.model.compute_metrics(embeddings, self.data, 'train', epoch) + print(train_metrics) train_metrics['loss'].backward() if self.args.grad_clip is not None: max_norm = float(self.args.grad_clip) @@ -162,40 +163,41 @@ class HGCAE(object): if (epoch + 1) % self.args.eval_freq == 0: self.model.eval() embeddings = self.model.encode(self.data['features'], self.adj_train_enc) - val_metrics = self.model.compute_metrics(embeddings, self.data, 'val') - val_losses.append(val_metrics['loss'].item()) - if (epoch + 1) % self.args.log_freq == 0: - logging.info(" ".join(['Epoch: {:04d}'.format(epoch + 1), format_metrics(val_metrics, 'val')])) - if self.model.has_improved(best_val_metrics, val_metrics): - self.best_emb = embeddings - best_val_metrics = val_metrics - counter = 0 - else: - counter += 1 - if counter == self.args.patience and epoch > self.args.min_epochs: - logging.info("Early stopping") - break + #val_metrics = self.model.compute_metrics(embeddings, self.data, 'val') + # val_losses.append(val_metrics['loss'].item()) + # if (epoch + 1) % self.args.log_freq == 0: + # logging.info(" ".join(['Epoch: {:04d}'.format(epoch + 1), format_metrics(val_metrics, 'val')])) + # if self.model.has_improved(best_val_metrics, val_metrics): + # self.best_emb = embeddings + # best_val_metrics = val_metrics + # counter = 0 + # else: + # counter += 1 + # if counter == self.args.patience and epoch > self.args.min_epochs: + # logging.info("Early stopping") + # break logging.info("Training Finished!") logging.info("Total time elapsed: {:.4f}s".format(time.time() - t_total)) - train_idx = np.unique(self.data['train_edges'][:,0].cpu().detach().numpy()) - val_idx = np.unique(self.data['val_edges'][:,0].cpu().detach().numpy()) - idx = np.unique(np.concatenate((train_idx,val_idx))) - X = self.model.manifold.logmap0(self.best_emb[idx],self.model.encoder.curvatures[-1]).cpu().detach().numpy() - 
y = self.data['labels'].reshape(-1,1)[idx] - - if(self.args.classifier): - self.cls = get_classifier(self.args, X,y) - acc,f1,recall,precision,roc_auc = calculate_metrics(self.cls,X,y) - elif self.args.clusterer: - y = y.reshape(-1,) - acc,f1,recall,precision,roc_auc = get_clustering_algorithm(self.args.clusterer,X,y)[6:] - elif self.args.anomaly_detector: - y = y.reshape(-1,) - acc,f1,recall,precision,roc_auc = get_anomaly_detection_algorithm(self.args.anomaly_detector,X,y)[6:] + # train_idx = np.unique(self.data['train_edges'][:,0].cpu().detach().numpy()) + # val_idx = np.unique(self.data['val_edges'][:,0].cpu().detach().numpy()) + # idx = np.unique(np.concatenate((train_idx,val_idx))) + # X = self.model.manifold.logmap0(self.best_emb[idx],self.model.encoder.curvatures[-1]).cpu().detach().numpy() + # y = self.data['labels'].reshape(-1,1)[idx] + + # if(self.args.classifier): + # self.cls = get_classifier(self.args, X,y) + # acc,f1,recall,precision,roc_auc = calculate_metrics(self.cls,X,y) + # elif self.args.clusterer: + # y = y.reshape(-1,) + # acc,f1,recall,precision,roc_auc = get_clustering_algorithm(self.args.clusterer,X,y)[6:] + # elif self.args.anomaly_detector: + # y = y.reshape(-1,) + # acc,f1,recall,precision,roc_auc = get_anomaly_detection_algorithm(self.args.anomaly_detector,X,y)[6:] - return {'train':train_losses,'best':best_losses,'val':val_losses},acc,f1,recall,precision,roc_auc , time.time() - t_total + # return {'train':train_losses,'best':best_losses,'val':val_losses},acc,f1,recall,precision,roc_auc , time.time() - t_total + return {'train':train_losses,'best':best_losses,'val':val_losses}, time.time() - t_total def predict(self): self.model.eval() @@ -209,14 +211,20 @@ class HGCAE(object): elif self.args.clusterer: labels = labels.reshape(-1,) acc,f1,recall,precision,roc_auc = get_clustering_algorithm(self.args.clusterer,data,labels)[6:] + elif self.args.anomaly_detector: + labels = labels.reshape(-1,) + acc,f1,recall,precision,roc_auc = get_anomaly_detection_algorithm(self.args.anomaly_detector,data,labels)[6:] + self.tb_embeddings = embeddings return val_metrics['loss'].item(),acc,f1,recall,precision,roc_auc - def save_embeddings(self,directory,prefix): - tb_embeddings_euc = self.model.manifold.logmap0(self.best_emb,self.model.encoder.curvatures[-1]) - for_classification_hyp = np.hstack((self.best_emb.cpu().detach().numpy(),self.data['labels'].reshape(-1,1))) + def save_embeddings(self,directory): + self.model.eval() + embeddings = self.model.encode(self.data['features'], self.adj_train_enc) + tb_embeddings_euc = self.model.manifold.logmap0(embeddings,self.model.encoder.curvatures[-1]) + for_classification_hyp = np.hstack((embeddings.cpu().detach().numpy(),self.data['labels'].reshape(-1,1))) for_classification_euc = np.hstack((tb_embeddings_euc.cpu().detach().numpy(),self.data['labels'].reshape(-1,1))) - hyp_file_path = os.path.join(directory,f'{prefix}_embeddings_hyp.csv') - euc_file_path = os.path.join(directory,f'{prefix}_embeddings_euc.csv') + hyp_file_path = os.path.join(directory,'hgcae_embeddings_hyp.csv') + euc_file_path = os.path.join(directory,'hgcae_embeddings_euc.csv') np.savetxt(hyp_file_path, for_classification_hyp, delimiter=',') np.savetxt(euc_file_path, for_classification_euc, delimiter=',') \ No newline at end of file diff --git a/HGCAE/utils/train_utils.py b/HGCAE/utils/train_utils.py index 41c1eaa1c0b36f53a74856ecc12447bb42364bd2..42026c479f9cbe09851ff95469669c1cd292b1f0 100644 --- a/HGCAE/utils/train_utils.py +++ b/HGCAE/utils/train_utils.py @@ 
-215,5 +215,11 @@ def get_anomaly_detection_algorithm(algorithm,X,y): return isolation_forest(X,y) elif(algorithm == 'one_class_svm'): return one_class_svm(X,y) + elif(algorithm == 'dbscan'): + return dbscan(X,y) + elif(algorithm == 'kmeans'): + return kmeans(X,y) + elif(algorithm == 'local_outlier_factor'): + return local_outlier_factor(X,y) else: raise NotImplementedError \ No newline at end of file diff --git a/HGCN/hgcn.py b/HGCN/hgcn.py index ac5bffb4b27db6b9f934a44e1a21ba8a12441c8e..84c735f9a3aae0aeb53bf20e2c72fc1ad8762f53 100644 --- a/HGCN/hgcn.py +++ b/HGCN/hgcn.py @@ -26,11 +26,11 @@ class HGCN: num_layers=2, bias=True, act='relu', - select_manifold='Hyperboloid', + select_manifold='Euclidean', #Euclidean , Hyperboloid grad_clip=1.0, - optimizer='RiemannianAdam', + optimizer='Adam', #Adam , RiemannianAdam weight_decay=0.01, - lr=0.009, + lr=0.1, #0.009 gamma=0.5, lr_reduce_freq=200, cuda=0, @@ -38,14 +38,14 @@ class HGCN: min_epochs=50, patience=None, seed=42, - log_freq=0, + log_freq=1, eval_freq=1, - val_prop=.2, - test_prop=0.3, + val_prop=0.15, + test_prop=0.15, double_precision=0, dropout=0.1, use_att= True, - alpha=0.2, + alpha=0.5, local_agg = False, normalize_adj=False, normalize_feats=True diff --git a/HGCN/layers/layers.py b/HGCN/layers/layers.py index c7682cfd33b7a7dcd723558c4722e93a92ff4510..c2eeb70eafdda7141eb6047a7ffbb37c14f6a910 100644 --- a/HGCN/layers/layers.py +++ b/HGCN/layers/layers.py @@ -23,4 +23,49 @@ class Linear(Module): hidden = F.dropout(hidden, self.dropout, training=self.training) out = self.act(hidden) return out - \ No newline at end of file + + + +def get_dim_act(args): + """ + Helper function to get dimension and activation at every layer. + :param args: + :return: + """ + if not args.act: + act = lambda x: x + else: + act = getattr(F, args.act) + acts = [act] * (args.num_layers - 1) + dims = [args.feat_dim] + ([args.dim] * (args.num_layers - 1)) + return dims, acts + + +class GraphConvolution(Module): + """ + Simple GCN layer. 
+ """ + + def __init__(self, in_features, out_features, dropout, act, use_bias): + super(GraphConvolution, self).__init__() + self.dropout = dropout + self.linear = nn.Linear(in_features, out_features, use_bias) + self.act = act + self.in_features = in_features + self.out_features = out_features + + def forward(self, input): + x, adj = input + hidden = self.linear.forward(x) + hidden = F.dropout(hidden, self.dropout, training=self.training) + if adj.is_sparse: + support = torch.spmm(adj, hidden) + else: + support = torch.mm(adj, hidden) + output = self.act(support), adj + return output + + def extra_repr(self): + return 'input_dim={}, output_dim={}'.format( + self.in_features, self.out_features + ) diff --git a/HGCN/models/base_models.py b/HGCN/models/base_models.py index e9acf7a144744da2739339a436c9212629053479..00f5628c36362d96a94bd153b0e5abfb44ad20ff 100644 --- a/HGCN/models/base_models.py +++ b/HGCN/models/base_models.py @@ -30,7 +30,7 @@ class BaseModel(nn.Module): if self.manifold.name == 'Hyperboloid': args.feat_dim = args.feat_dim + 1 self.nnodes = args.n_nodes - self.encoder = getattr(encoders, 'HGCN')(self.c, args) + self.encoder = getattr(encoders, args.model)(self.c, args) def encode(self, x, adj): if self.manifold.name == 'Hyperboloid': diff --git a/HGCN/models/encoders.py b/HGCN/models/encoders.py index 344b8dd35f7d76f0783daeddaa6243beb5393680..c82c611b2bae12bbf192813e906daa001eac822f 100644 --- a/HGCN/models/encoders.py +++ b/HGCN/models/encoders.py @@ -9,6 +9,8 @@ import Ghypeddings.HGCN.manifolds as manifolds import Ghypeddings.HGCN.layers.hyp_layers as hyp_layers import Ghypeddings.HGCN.utils.math_utils as pmath +from Ghypeddings.HGCN.layers.layers import GraphConvolution, Linear, get_dim_act +from Ghypeddings.HGCN.layers.att_layers import GraphAttentionLayer class Encoder(nn.Module): """ @@ -27,6 +29,45 @@ class Encoder(nn.Module): output = self.layers.forward(x) return output +class GCN(Encoder): + """ + Graph Convolution Networks. + """ + + def __init__(self, c, args): + super(GCN, self).__init__(c) + assert args.num_layers > 0 + dims, acts = get_dim_act(args) + gc_layers = [] + for i in range(len(dims) - 1): + in_dim, out_dim = dims[i], dims[i + 1] + act = acts[i] + gc_layers.append(GraphConvolution(in_dim, out_dim, args.dropout, act, args.bias)) + self.layers = nn.Sequential(*gc_layers) + self.encode_graph = True + +class GAT(Encoder): + """ + Graph Attention Networks. + """ + + def __init__(self, c, args): + super(GAT, self).__init__(c) + assert args.num_layers > 0 + dims, acts = get_dim_act(args) + gat_layers = [] + for i in range(len(dims) - 1): + in_dim, out_dim = dims[i], dims[i + 1] + act = acts[i] + assert dims[i + 1] % args.n_heads == 0 + out_dim = dims[i + 1] // args.n_heads + concat = True + gat_layers.append( + GraphAttentionLayer(in_dim, out_dim, args.dropout, act, args.alpha, args.n_heads, concat)) + self.layers = nn.Sequential(*gat_layers) + self.encode_graph = True + + class HGCN(Encoder): """ Hyperbolic-GCN. diff --git a/HGCN/utils/data_utils.py b/HGCN/utils/data_utils.py index 3f037648e2dbf1127ac5808707c459fa2d6989d7..5169f98c576ae898769d8d3ffe5f4133283af93f 100644 --- a/HGCN/utils/data_utils.py +++ b/HGCN/utils/data_utils.py @@ -64,8 +64,6 @@ def augment(adj, features, normalize_feats=True): def split_data(labels, val_prop, test_prop, seed): np.random.seed(seed) - nb_nodes = labels.shape[0] - all_idx = np.arange(nb_nodes) pos_idx = labels.nonzero()[0] neg_idx = (1. 
- labels).nonzero()[0] np.random.shuffle(pos_idx) diff --git a/HGCN/utils/train_utils.py b/HGCN/utils/train_utils.py index 6e4385c5c977b1ea47ee9ffb6afe1d7f013c7fcc..296451e4795df11d93ffae8d345da6c1aba740fe 100644 --- a/HGCN/utils/train_utils.py +++ b/HGCN/utils/train_utils.py @@ -41,5 +41,7 @@ def create_args(*args): parser.add_argument('--local_agg', type=bool, default=args[25]) parser.add_argument('--normalize_adj', type=bool, default=args[26]) parser.add_argument('--normalize_feats', type=bool, default=args[27]) + parser.add_argument('--model', type=str, default='GAT') #GCN, GAT,HGCN + parser.add_argument('--n_heads', type=int, default=1) #GCN, GAT,HGCN flags, unknown = parser.parse_known_args() return flags \ No newline at end of file diff --git a/HGNN/hgnn.py b/HGNN/hgnn.py index c5b1fe60d15606a68e9a83a7d3fc4a8ee5d9a2ad..d8702dec1502ad91bef985a6c3d4045929c17ff5 100644 --- a/HGNN/hgnn.py +++ b/HGNN/hgnn.py @@ -33,8 +33,8 @@ class HGNN: seed=42, log_freq=1, eval_freq=1, - val_prop=.2, - test_prop=0.3, + val_prop=0.15, + test_prop=0.15, double_precision=0, dropout=0.01, normalize_adj=False, diff --git a/PVAE/pvae.py b/PVAE/pvae.py index 86b89f05e9b0cbab91f746f1a1a7a6cfe946c482..b1318928f98252ce80bf76225c45cbc63037d794 100644 --- a/PVAE/pvae.py +++ b/PVAE/pvae.py @@ -28,18 +28,18 @@ class PVAE: hidden_dim, num_layers=2, c=1.0, - act='leaky_relu', + act='relu', lr=0.01, cuda=0, epochs=50, seed=42, eval_freq=1, - val_prop=0.2, + val_prop=0., test_prop=0.3, dropout=0.1, beta1=0.9, beta2=.999, - K=10, + K=1, beta=.2, analytical_kl=True, posterior='WrappedNormal', @@ -53,7 +53,7 @@ class PVAE: alpha=0.5, classifier=None, clusterer=None, - log_freq=0, + log_freq=1, normalize_adj=False, normalize_feats=True, anomaly_detector=None @@ -78,7 +78,6 @@ class PVAE: # Choosing and saving a random seed for reproducibility if self.args.seed == 0: self.args.seed = int(torch.randint(0, 2**32 - 1, (1,)).item()) - print('seed', self.args.seed) torch.manual_seed(self.args.seed) np.random.seed(self.args.seed) torch.cuda.manual_seed_all(self.args.seed) @@ -145,7 +144,7 @@ class PVAE: if (epoch + 1) % self.args.log_freq == 0: print('====> Epoch: {:03d} Loss: {:.2f} Recon: {:.2f} KL: {:.2f}'.format(epoch, agg['train_loss'][-1], agg['train_recon'][-1], agg['train_kl'][-1])) - if (epoch + 1) % self.args.eval_freq == 0: + if (epoch + 1) % self.args.eval_freq == 0 and self.args.val_prop: self.model.eval() with torch.no_grad(): qz_x, px_z, lik, kl, loss , embeddings= self.loss_function(self.model,self.data['idx_val'], self.data['features'],self.data['adj_train'], K=self.args.K, beta=self.args.beta, components=True) @@ -178,7 +177,7 @@ class PVAE: acc,f1,recall,precision,roc_auc = get_clustering_algorithm(self.args.clusterer,X,y)[6:] elif self.args.anomaly_detector: y = y.reshape(-1,) - acc,f1,recall,precision,roc_auc = get_anomaly_detection_algorithm(self.args.clusterer,X,y)[6:] + acc,f1,recall,precision,roc_auc = get_anomaly_detection_algorithm(self.args.anomaly_detector,X,y)[6:] return {'train':train_losses,'best':best_losses,'val':val_losses},acc,f1,recall,precision,roc_auc,time.time() - t_total @@ -197,16 +196,16 @@ class PVAE: acc,f1,recall,precision,roc_auc = get_clustering_algorithm(self.args.clusterer,data,labels)[6:] elif self.args.anomaly_detector: labels = labels.reshape(-1,) - acc,f1,recall,precision,roc_auc = get_anomaly_detection_algorithm(self.args.clusterer,data,labels)[6:] - + acc,f1,recall,precision,roc_auc = get_anomaly_detection_algorithm(self.args.anomaly_detector,data,labels)[6:] + 
self.tb_embeddings = embeddings[0] return abs(tt_loss) , acc, f1 , recall,precision,roc_auc - def save_embeddings(self,directory,prefix): + def save_embeddings(self,directory): tb_embeddings_euc = self.model.manifold.logmap0(self.tb_embeddings) for_classification_hyp = np.hstack((self.tb_embeddings.cpu().detach().numpy(),self.data['labels'].reshape(-1,1).cpu())) for_classification_euc = np.hstack((tb_embeddings_euc.cpu().detach().numpy(),self.data['labels'].reshape(-1,1).cpu())) - hyp_file_path = os.path.join(directory,f'{prefix}_embeddings_hyp.csv') - euc_file_path = os.path.join(directory,f'{prefix}_embeddings_euc.csv') + hyp_file_path = os.path.join(directory,'pvae_embeddings_hyp.csv') + euc_file_path = os.path.join(directory,'pvae_embeddings_euc.csv') np.savetxt(hyp_file_path, for_classification_hyp, delimiter=',') np.savetxt(euc_file_path, for_classification_euc, delimiter=',') diff --git a/PVAE/utils.py b/PVAE/utils.py index 36d85574d9584cf5b56c9aa160acce357be1ecfe..2f935958fa8d55d60f320c19fc8f1dc8a183ef6a 100644 --- a/PVAE/utils.py +++ b/PVAE/utils.py @@ -199,7 +199,8 @@ def split_data(labels, test_prop,val_prop): nb_val + nb_test:] idx_val_neg, idx_test_neg, idx_train_neg = neg_idx[:nb_val], neg_idx[nb_val:nb_val + nb_test], neg_idx[ nb_val + nb_test:] - return idx_val_pos + idx_val_neg, idx_test_pos + idx_test_neg, idx_train_pos + idx_train_neg + + return idx_test_pos + idx_test_neg, idx_train_pos + idx_train_neg, idx_val_pos + idx_val_neg, def process_data(args, adj,features,labels): data = process_data_nc(args,adj,features,labels) @@ -344,5 +345,11 @@ def get_anomaly_detection_algorithm(algorithm,X,y): return isolation_forest(X,y) elif(algorithm == 'one_class_svm'): return one_class_svm(X,y) + elif(algorithm == 'dbscan'): + return dbscan(X,y) + elif(algorithm == 'kmeans'): + return kmeans(X,y,n_clusters=2) + elif(algorithm == 'local_outlier_factor'): + return local_outlier_factor(X,y) else: raise NotImplementedError \ No newline at end of file diff --git a/Poincare/poincare.py b/Poincare/poincare.py index c2eeecc00b27433a1f35d964619dd5d230841a21..018bf5b4053a4d26b1051d4df15598e8b95cfacc 100644 --- a/Poincare/poincare.py +++ b/Poincare/poincare.py @@ -31,12 +31,13 @@ class POINCARE: seed=42, log_freq=1, eval_freq=1, - val_prop=0.2, - test_prop=0.3, + val_prop=0.15, + test_prop=0.15, double_precision=0, dropout=0.01, normalize_adj=False, normalize_feats=True): + self.args = create_args(dim,grad_clip,weight_decay,lr,gamma,lr_reduce_freq,cuda,epochs,min_epochs,patience,seed,log_freq,eval_freq,val_prop,test_prop,double_precision,dropout,normalize_adj,normalize_feats) self.args.n_nodes = adj.shape[0] self.args.feat_dim = features.shape[1] diff --git a/README.md b/README.md index f8a1bea3df8f0b83f20034c24584eb4d8bf4243b..929877df60fb0bf94e6a5483c54afbd94e57fde9 100644 --- a/README.md +++ b/README.md @@ -41,10 +41,10 @@ In this library, we provide a variety of binary classifiers, clustering algorith The following intrusion detection datasets were used to test and evaluate the models. Our code includes all the pre-processing steps required to convert these datasets from tabular format into graphs. Due to usage restrictions, this library provides only a single graph of each dataset, with 5,000 nodes, already pre-processed and normalized. 
-| Name | Features | Used features | Hyperbolicity | Ref | -|-----------------|----------|----------------|---------------|-------| -| CIC-DDoS2019 | 80 | 76 | 1.0 | [7] | -| AWID3 | Cell 5 | Cell 6 | Cell 7 | | +| Name | Ref | +|-----------------|-------| +| CIC-DDoS2019 | [7] | +| AWID3 | | diff --git a/__init__.py b/__init__.py index de2f3397b350d33db935f55fb55b45efa4a65367..37936737e7ef84fa8bc1299b9dbd2053d56c48b3 100644 --- a/__init__.py +++ b/__init__.py @@ -6,9 +6,9 @@ from Ghypeddings.Poincare.poincare import POINCARE from Ghypeddings.PVAE.pvae import PVAE from Ghypeddings.datasets.datasets import CIC_DDoS2019 -from Ghypeddings.datasets.datasets import CIC_IDS2018 -from Ghypeddings.datasets.datasets import UNSW_NB15 +from Ghypeddings.datasets.datasets import NF_CIC_IDS2018_v2 +from Ghypeddings.datasets.datasets import NF_UNSW_NB15_v2 from Ghypeddings.datasets.datasets import Darknet from Ghypeddings.datasets.datasets import AWID3 -from Ghypeddings.datasets.datasets import TON_IoT -from Ghypeddings.datasets.datasets import BOT_IoT \ No newline at end of file +from Ghypeddings.datasets.datasets import NF_TON_IoT_v2 +from Ghypeddings.datasets.datasets import NF_BOT_IoT_v2 \ No newline at end of file diff --git a/anomaly_detection/__init__.py b/anomaly_detection/__init__.py index 5fc56d706bb25fdf9eda05d19bfd06260f617f21..41092b1e9bc9d3fa72849637f8a78dcda3a06fcf 100644 --- a/anomaly_detection/__init__.py +++ b/anomaly_detection/__init__.py @@ -1,2 +1,5 @@ from Ghypeddings.anomaly_detection.isolation_forest import isolation_forest -from Ghypeddings.anomaly_detection.one_class_svm import one_class_svm \ No newline at end of file +from Ghypeddings.anomaly_detection.one_class_svm import one_class_svm +from Ghypeddings.anomaly_detection.dbscan import dbscan +from Ghypeddings.anomaly_detection.kmeans import kmeans +from Ghypeddings.anomaly_detection.local_outlier_factor import local_outlier_factor \ No newline at end of file diff --git a/anomaly_detection/dbscan.py b/anomaly_detection/dbscan.py new file mode 100644 index 0000000000000000000000000000000000000000..00bc3d669316442b5edb4595f65872f49361a755 --- /dev/null +++ b/anomaly_detection/dbscan.py @@ -0,0 +1,9 @@ +from sklearn.cluster import DBSCAN +from Ghypeddings.anomaly_detection.utils import calculate_metrics + + +def dbscan(X,y): + dbscan = DBSCAN(eps=0.5, min_samples=5) + labels = dbscan.fit_predict(X) + outliers = labels == -1 + return calculate_metrics(y,outliers) diff --git a/anomaly_detection/isolation_forest.py b/anomaly_detection/isolation_forest.py index f8c81c7809b0cadccb1c86fe6adda2be5cdfa95c..52ea90463b1026ac8d482f240f9bb5b4a64219d4 100644 --- a/anomaly_detection/isolation_forest.py +++ b/anomaly_detection/isolation_forest.py @@ -3,9 +3,10 @@ from Ghypeddings.anomaly_detection.utils import calculate_metrics from sklearn.ensemble import IsolationForest -def isolation_forest(X,y,anomalies_percentage = 0.5): +def isolation_forest(X,y,anomalies_percentage = 0.1): model = IsolationForest(contamination=anomalies_percentage) model.fit(X) y_pred = model.predict(X) - y_pred[y_pred == -1]=0 + y_pred[y_pred == 1] = 0 + y_pred[y_pred == -1]= 1 return calculate_metrics(y,y_pred) \ No newline at end of file diff --git a/anomaly_detection/kmeans.py b/anomaly_detection/kmeans.py new file mode 100644 index 0000000000000000000000000000000000000000..a5fbfc8343bda122043d33743def26ba18dfcd5c --- /dev/null +++ b/anomaly_detection/kmeans.py @@ -0,0 +1,12 @@ +from sklearn.cluster import KMeans +from Ghypeddings.anomaly_detection.utils import 
calculate_metrics +import numpy as np + +def kmeans(X,y,n_clusters,outlier_percentage=.1): + model = KMeans(n_clusters=n_clusters) + model.fit(X) + # y_pred = model.predict(X) + distances = model.transform(X).min(axis=1) + threshold = np.percentile(distances, 100 * (1 - outlier_percentage)) + outliers = distances > threshold + return calculate_metrics(y,outliers) \ No newline at end of file diff --git a/anomaly_detection/local_outlier_factor.py b/anomaly_detection/local_outlier_factor.py new file mode 100644 index 0000000000000000000000000000000000000000..36caa7022fafb9f826a7e3200d0b637fb9cf7679 --- /dev/null +++ b/anomaly_detection/local_outlier_factor.py @@ -0,0 +1,10 @@ +from sklearn.neighbors import LocalOutlierFactor +from Ghypeddings.anomaly_detection.utils import calculate_metrics +import numpy as np + +def local_outlier_factor(X,y,n_neighbors=20,outlier_percentage=.1): + lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=outlier_percentage) + y_pred = lof.fit_predict(X) + y_pred[y_pred == 1] = 0 + y_pred[y_pred == -1] = 1 + return calculate_metrics(y,y_pred) \ No newline at end of file diff --git a/classifiers/random_forest.py b/classifiers/random_forest.py index 1e044f3bdee8aab4377cf55fb9a8b050c03e6caf..24c10c46fe8c1c0d507f0cf621e90c0f642481ae 100644 --- a/classifiers/random_forest.py +++ b/classifiers/random_forest.py @@ -1,5 +1,5 @@ from sklearn.ensemble import RandomForestClassifier -def random_forest(X,y,seed,n_estimators=2,max_depth=2,max_features=None): +def random_forest(X,y,seed,n_estimators=10,max_depth=10,max_features='log2'): clf = RandomForestClassifier(max_features=max_features,n_estimators=n_estimators, max_depth=max_depth, random_state=seed) return clf.fit(X, y) \ No newline at end of file diff --git a/clusterers/kmeans.py b/clusterers/kmeans.py index 59605e7ef034d6ca970e073582ee11f6b5aebbef..848fef469ae29d55cdaf097e4a8df8057f89e2d4 100644 --- a/clusterers/kmeans.py +++ b/clusterers/kmeans.py @@ -3,7 +3,7 @@ from Ghypeddings.clusterers.utils import calculate_metrics from sklearn.cluster import KMeans -def kmeans(X,y,n_clusters=5,n_init=10): +def kmeans(X,y,n_clusters=2,n_init=10): model = KMeans(n_clusters=n_clusters,n_init=n_init) model.fit(X) y_pred = model.labels_ diff --git a/datasets/.gitignore b/datasets/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d22b9a22608a3bacbc730c3ad7c080f98c9ead54 --- /dev/null +++ b/datasets/.gitignore @@ -0,0 +1,3 @@ +outlier_datasets.py + +repetition_datasets.py \ No newline at end of file diff --git a/datasets/datasets.py b/datasets/datasets.py index 857e3a9ce7152c9156c92f0e998eed315b2d9051..db7cd2c8d4b13fe88533e9cfb6219fd5afbcc2c1 100644 --- a/datasets/datasets.py +++ b/datasets/datasets.py @@ -111,7 +111,7 @@ class NetFlowDataset(Dataset): self.file = file def build(self,n_nodes,n_classes=2): - df = pd.read_csv(self.file) + df = pd.read_csv(self.file) df = df.groupby(['Label']).apply(lambda x: x.sample(int(n_nodes/n_classes))).reset_index(drop=True) df = df.sample(frac=1).reset_index(drop=True) adj = self._filling_adjacency_numpy(df) @@ -152,7 +152,7 @@ class NetFlowDataset(Dataset): adjacency[mask] = True return adjacency -class CIC_IDS2018(NetFlowDataset): +class NF_CIC_IDS2018_v2(NetFlowDataset): def __init__(self): super().__init__( features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CIC_IDS2018','features.pkl'), @@ -161,7 +161,7 @@ class CIC_IDS2018(NetFlowDataset): file = 
os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CIC_IDS2018','original','cic_ids2018.csv') ) -class UNSW_NB15(NetFlowDataset): +class NF_UNSW_NB15_v2(NetFlowDataset): def __init__(self): super().__init__( features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','UNSW_NB15','features.pkl'), @@ -198,7 +198,6 @@ class Darknet(Dataset): df.drop(columns_to_exclude, axis=1, inplace=True) features = df.to_numpy() self.save_samples(adj,features,labels) - print('features:',features.shape) return adj,features,labels def _filling_adjacency_numpy(self,data,source_ip_index, destination_ip_index): @@ -213,7 +212,7 @@ class Darknet(Dataset): adjacency[mask] = True return adjacency -class BOT_IoT(NetFlowDataset): +class NF_BOT_IoT_v2(NetFlowDataset): def __init__(self): super().__init__( features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','BOT_IOT','features.pkl'), @@ -222,7 +221,7 @@ class BOT_IoT(NetFlowDataset): file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','BOT_IOT','original','bot_iot.csv') ) -class TON_IoT(NetFlowDataset): +class NF_TON_IoT_v2(NetFlowDataset): def __init__(self): # directory=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','TON_IOT','original'), super().__init__( @@ -274,7 +273,7 @@ class AWID3(Dataset): if(df[c].dtype == 'object' and c!='radiotap.dbm_antsignal'): print(c,df[c].unique(),len(df[c].unique())) df.drop(columns=alone,axis=1,inplace=True) - df['radiotap.dbm_antsignal'] = df['radiotap.dbm_antsignal'].apply(self._config_signal) + df['radiotap.dbm_antsignal'] = df['radiotap.dbm_antsignal'].apply(self._config_signal) # It contains a list labels = df['Label_1'].to_numpy() adj = self._filling_adjacency_numpy(data) df.drop(columns=['frame.time_delta','Label_1'],axis=1,inplace=True) diff --git a/datasets/examples/AWID3/adjacency.pkl b/datasets/examples/AWID3/adjacency.pkl deleted file mode 100644 index a5c68577c90860acf85655f1d89e74cf3d462728..0000000000000000000000000000000000000000 Binary files a/datasets/examples/AWID3/adjacency.pkl and /dev/null differ diff --git a/datasets/examples/AWID3/features.pkl b/datasets/examples/AWID3/features.pkl deleted file mode 100644 index ca50813bf5fa740f9601769af166bf8e64173b35..0000000000000000000000000000000000000000 Binary files a/datasets/examples/AWID3/features.pkl and /dev/null differ diff --git a/datasets/examples/AWID3/labels.pkl b/datasets/examples/AWID3/labels.pkl deleted file mode 100644 index 19c62897db47f98b6a9ecb913a78eaf9837ffad6..0000000000000000000000000000000000000000 Binary files a/datasets/examples/AWID3/labels.pkl and /dev/null differ diff --git a/datasets/examples/BOT_IOT/adjacency.pkl b/datasets/examples/BOT_IOT/adjacency.pkl deleted file mode 100644 index 41d152eba638d00fe1910f6f08c6c2002c422564..0000000000000000000000000000000000000000 Binary files a/datasets/examples/BOT_IOT/adjacency.pkl and /dev/null differ diff --git a/datasets/examples/BOT_IOT/features.pkl b/datasets/examples/BOT_IOT/features.pkl deleted file mode 100644 index b7150b31c3ca033c31801862ab1180ce3eb9ffac..0000000000000000000000000000000000000000 Binary files a/datasets/examples/BOT_IOT/features.pkl and /dev/null differ diff --git a/datasets/examples/BOT_IOT/labels.pkl b/datasets/examples/BOT_IOT/labels.pkl deleted file mode 100644 index fc0994e3acb87f0c08963108b62dac185d310024..0000000000000000000000000000000000000000 Binary files a/datasets/examples/BOT_IOT/labels.pkl and /dev/null differ diff --git 
a/datasets/examples/CICDDoS2019/adjacency.pkl b/datasets/examples/CICDDoS2019/adjacency.pkl deleted file mode 100644 index fb90ecc0ac2651f2ebe51c6e08200173db2fc3fa..0000000000000000000000000000000000000000 Binary files a/datasets/examples/CICDDoS2019/adjacency.pkl and /dev/null differ diff --git a/datasets/examples/CICDDoS2019/features.pkl b/datasets/examples/CICDDoS2019/features.pkl deleted file mode 100644 index 62bec0a41cfa78ae0522fde10b22aa7a96b80dbc..0000000000000000000000000000000000000000 Binary files a/datasets/examples/CICDDoS2019/features.pkl and /dev/null differ diff --git a/datasets/examples/CICDDoS2019/labels.pkl b/datasets/examples/CICDDoS2019/labels.pkl deleted file mode 100644 index e544bb17247e5ba8c6f12b5e975b3f302e815ed0..0000000000000000000000000000000000000000 Binary files a/datasets/examples/CICDDoS2019/labels.pkl and /dev/null differ diff --git a/datasets/examples/CIC_IDS2018/adjacency.pkl b/datasets/examples/CIC_IDS2018/adjacency.pkl deleted file mode 100644 index a5b645030e7ddde2597ae85a9f40b0ae28b944b9..0000000000000000000000000000000000000000 Binary files a/datasets/examples/CIC_IDS2018/adjacency.pkl and /dev/null differ diff --git a/datasets/examples/CIC_IDS2018/features.pkl b/datasets/examples/CIC_IDS2018/features.pkl deleted file mode 100644 index f38c60cce71eb09a0c26b63fc2be001d3bee7b5f..0000000000000000000000000000000000000000 Binary files a/datasets/examples/CIC_IDS2018/features.pkl and /dev/null differ diff --git a/datasets/examples/CIC_IDS2018/labels.pkl b/datasets/examples/CIC_IDS2018/labels.pkl deleted file mode 100644 index f522aee22d8ab127fd585443b0036b43d1b67ce6..0000000000000000000000000000000000000000 Binary files a/datasets/examples/CIC_IDS2018/labels.pkl and /dev/null differ diff --git a/datasets/examples/Darknet/adjacency.pkl b/datasets/examples/Darknet/adjacency.pkl deleted file mode 100644 index 17e3b4d1cb240f0f5d6e5a7cc0eaf0236d43c0da..0000000000000000000000000000000000000000 Binary files a/datasets/examples/Darknet/adjacency.pkl and /dev/null differ diff --git a/datasets/examples/Darknet/features.pkl b/datasets/examples/Darknet/features.pkl deleted file mode 100644 index e9b20fe5f12fed056ee107dbeb727de4fd38045a..0000000000000000000000000000000000000000 Binary files a/datasets/examples/Darknet/features.pkl and /dev/null differ diff --git a/datasets/examples/Darknet/labels.pkl b/datasets/examples/Darknet/labels.pkl deleted file mode 100644 index 66930070c334ac355b0d98a994ca4780fac9bfad..0000000000000000000000000000000000000000 Binary files a/datasets/examples/Darknet/labels.pkl and /dev/null differ diff --git a/datasets/examples/TON_IOT/adjacency.pkl b/datasets/examples/TON_IOT/adjacency.pkl deleted file mode 100644 index af0f05afb3ab0644b3a1f90e1ded51ec3a5d2299..0000000000000000000000000000000000000000 Binary files a/datasets/examples/TON_IOT/adjacency.pkl and /dev/null differ diff --git a/datasets/examples/TON_IOT/features.pkl b/datasets/examples/TON_IOT/features.pkl deleted file mode 100644 index 5da2741d9aa45cd0e33148deb84e5a594a943eb1..0000000000000000000000000000000000000000 Binary files a/datasets/examples/TON_IOT/features.pkl and /dev/null differ diff --git a/datasets/examples/TON_IOT/labels.pkl b/datasets/examples/TON_IOT/labels.pkl deleted file mode 100644 index e25710708f915cd06907902fce4dc9bd80c4937d..0000000000000000000000000000000000000000 Binary files a/datasets/examples/TON_IOT/labels.pkl and /dev/null differ diff --git a/datasets/examples/UNSW_NB15/adjacency.pkl b/datasets/examples/UNSW_NB15/adjacency.pkl deleted file mode 
100644 index f28432bfb630f092a824a2001611062582babdba..0000000000000000000000000000000000000000 Binary files a/datasets/examples/UNSW_NB15/adjacency.pkl and /dev/null differ diff --git a/datasets/examples/UNSW_NB15/features.pkl b/datasets/examples/UNSW_NB15/features.pkl deleted file mode 100644 index 7322f3a5730b2a77f0df3c0c5b47b0d8d65e62c4..0000000000000000000000000000000000000000 Binary files a/datasets/examples/UNSW_NB15/features.pkl and /dev/null differ diff --git a/datasets/examples/UNSW_NB15/labels.pkl b/datasets/examples/UNSW_NB15/labels.pkl deleted file mode 100644 index de373d5412775e6f7d186578fa1790b46751e1f1..0000000000000000000000000000000000000000 Binary files a/datasets/examples/UNSW_NB15/labels.pkl and /dev/null differ diff --git a/datasets/examples/UNSW_NB15/original/.gitignore b/datasets/examples/UNSW_NB15/original/.gitignore deleted file mode 100644 index f59ec20aabf5842d237244ece8c81ab184faeac1..0000000000000000000000000000000000000000 --- a/datasets/examples/UNSW_NB15/original/.gitignore +++ /dev/null @@ -1 +0,0 @@ -* \ No newline at end of file diff --git a/datasets/outlier_datasets.py b/datasets/outlier_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..10c13045e61c03151708b4fd59779543b47047dd --- /dev/null +++ b/datasets/outlier_datasets.py @@ -0,0 +1,314 @@ +import os + +import pandas as pd +import numpy as np +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import StandardScaler +import pickle +from sklearn.preprocessing import LabelEncoder +import time +import datetime +import category_encoders as ce + +class Dataset: + def __init__(self,features_path='',adj_path='',labels_path='',directory=''): + self.features_path = features_path + self.adj_path = adj_path + self.labels_path = labels_path + self.directory = directory + + def _get_files(self): + return [os.path.join(self.directory,file) for file in os.listdir(self.directory) if os.path.isfile(os.path.join(self.directory, file)) and '.gitignore' not in file] + + def save_samples(self,adj,features,labels): + + with open(self.adj_path,'wb') as f: + pickle.dump(adj,f) + with open(self.features_path,'wb') as f: + pickle.dump(features,f) + with open(self.labels_path,'wb') as f: + pickle.dump(labels,f) + + def load_samples(self): + with open(self.adj_path,'rb') as f: + adj = pickle.load(f) + with open(self.features_path,'rb') as f: + features = pickle.load(f) + with open(self.labels_path,'rb') as f: + labels = pickle.load(f) + print('features:',features.shape,'adj',adj.shape,'labels',labels.shape) + return adj,features,labels + +class CIC_DDoS2019(Dataset): + def __init__(self): + super().__init__( + features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','CICDDoS2019','features.pkl'), + adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','CICDDoS2019','adjacency.pkl'), + labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','CICDDoS2019','labels.pkl'), + directory=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','original') + ) + + def build(self,n_nodes,n_classes=2): + df = self._create_file_bc(n_nodes,n_classes) + for column in df.columns: + max_value = df.loc[df[column] != np.inf, column].max() + min_value = df.loc[df[column] != -np.inf, column].min() + df.loc[df[column] == np.inf, column] = max_value + df.loc[df[column] == -np.inf, column] = min_value + adj = self._filling_adjacency_numpy(df) + labels = df[' 
Label'].apply(lambda x: 0 if x == 'BENIGN' else 1).to_numpy() + columns_to_exclude = ['Unnamed: 0', 'Flow ID', ' Source IP',' Source Port',' Destination Port',' Flow Duration',' Protocol', ' Destination IP', ' Timestamp', 'SimillarHTTP',' Inbound',' Label'] + df.drop(columns_to_exclude, axis=1, inplace=True) + features = df.to_numpy() + scaler = MinMaxScaler() + features = scaler.fit_transform(features) + self.save_samples(adj,features,labels) + return adj, features, labels + + def _load_file(self,path,max_per_class,list_classes=[]): + df = pd.read_csv(path,low_memory=False) + df.dropna(axis=0, inplace=True) + normal_df = df[df[' Label'] == 'BENIGN'] + if(len(list_classes)): + df = df[df[' Label'].isin(list_classes)] + df = df.groupby([' Label']).apply(lambda x: x.sample(max_per_class)).reset_index(drop=True) + return df , normal_df + + def _create_file_bc(self,n_nodes,n_classes): + outlier_percentage = .1 + file_paths = self._get_files() + max_per_class = int(n_nodes * outlier_percentage / len(file_paths)) +1 + df_list = [] + benign_df = pd.DataFrame([]) + for path in file_paths: + class_name = path.split('\\')[-1].split('.')[0] + tmp = self._load_file(path,max_per_class,[class_name]) + df_list.append(tmp[0]) + benign_df = pd.concat([benign_df,tmp[1]],ignore_index=True) + print('finishing loading the file : {}'.format(path)) + df = pd.concat(df_list,ignore_index=True) + print(df.shape) + print(benign_df.shape) + benign_df = benign_df.sample(n=int(n_nodes * (1-outlier_percentage))).reset_index(drop=True) + print(benign_df.shape) + df = pd.concat([benign_df,df],ignore_index=True) + print(df.shape) + df = df.sample(n=n_nodes).reset_index(drop=True) + print(df.shape) + # print(df[' Label'].value_counts()) + # df = pd.read_csv(os.path.join(self.directory,'all.csv'),low_memory=False) + # df[' Label'] = df[' Label'].apply(lambda x: 0 if x == 'BENIGN' else 1) + # node_per_class = int(n_nodes/n_classes) + # df = df.groupby([' Label']).apply(lambda x: x.sample(node_per_class)).reset_index(drop=True) + return df + + def _filling_adjacency_numpy(self,data): + N = data.shape[0] + try: + adjacency = np.zeros((N,N), dtype=bool) + except Exception as e: + print(f"An error occurred: {e}") + + source_ips = data[' Source IP'].to_numpy() + destination_ips = data[' Destination IP'].to_numpy() + mask = ((source_ips[:, np.newaxis] == source_ips) | (source_ips[:, np.newaxis] == destination_ips) | (destination_ips[:, np.newaxis] == source_ips)| (destination_ips[:, np.newaxis] == destination_ips) ) + adjacency[mask] = True + return adjacency + +class NetFlowDataset(Dataset): + def __init__(self,features_path,adj_path,labels_path,file): + super().__init__(features_path,adj_path,labels_path) + self.file = file + + def build(self,n_nodes): + outlier_percentage = .1 + df = pd.read_csv(self.file) + df = df.groupby(['Label']).apply(lambda x: x.sample(int(n_nodes * (1-outlier_percentage))) if pd.unique(x['Label'])[0] == 0 else x.sample(int(n_nodes * outlier_percentage))).reset_index(drop=True) + df = df.sample(frac=1).reset_index(drop=True) + print(df['Label'].value_counts()) + adj = self._filling_adjacency_numpy(df) + labels = df['Label'].to_numpy() + labels = labels.astype(np.bool_) + df.drop(['IPV4_SRC_ADDR','IPV4_DST_ADDR','Attack','Label','L4_SRC_PORT','L4_DST_PORT'],axis=1,inplace=True) + #df = pd.get_dummies(df,columns=['PROTOCOL','DNS_QUERY_TYPE','FTP_COMMAND_RET_CODE']) + + encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL']) + encoder.fit(df,labels) + df = encoder.transform(df) + + features = 
df.to_numpy() + scaler = MinMaxScaler() + features = scaler.fit_transform(features) + print("features:",features.shape) + self.save_samples(adj,features,labels) + return adj,features,labels + + def _filling_adjacency_numpy(self,data): + N = data.shape[0] + try: + adjacency = np.zeros((N,N), dtype=bool) + except Exception as e: + print(f"An error occurred: {e}") + + if 'bot_iot' in self.file: + data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR'].apply(str) + data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR'].apply(str) + data['L4_SRC_PORT'] = data['L4_SRC_PORT'].apply(str) + data['L4_DST_PORT'] = data['L4_DST_PORT'].apply(str) + data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR']+':'+data['L4_SRC_PORT'] + data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR']+':'+data['L4_DST_PORT'] + + source_ips = data['IPV4_SRC_ADDR'].to_numpy() + destination_ips = data['IPV4_DST_ADDR'].to_numpy() + mask = ((source_ips[:, np.newaxis] == source_ips) | (source_ips[:, np.newaxis] == destination_ips) | (destination_ips[:, np.newaxis] == source_ips) | (destination_ips[:, np.newaxis] == destination_ips)) + adjacency[mask] = True + return adjacency + +class NF_CIC_IDS2018_v2(NetFlowDataset): + def __init__(self): + super().__init__( + features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','CIC_IDS2018','features.pkl'), + adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','CIC_IDS2018','adjacency.pkl'), + labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','CIC_IDS2018','labels.pkl'), + file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CIC_IDS2018','original','cic_ids2018.csv') + ) + +class NF_UNSW_NB15_v2(NetFlowDataset): + def __init__(self): + super().__init__( + features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','UNSW_NB15','features.pkl'), + adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','UNSW_NB15','adjacency.pkl'), + labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','UNSW_NB15','labels.pkl'), + file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'UNSW_NB15','original','unsw_nb15.csv') + ) + +class Darknet(Dataset): + def __init__(self): + super().__init__( + features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','Darknet','features.pkl'), + adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','Darknet','adjacency.pkl'), + labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','Darknet','labels.pkl') + ) + self.file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','Darknet','original','Darknet.csv') + + def _to_binary_classification(self,x): + if 'Non' in x: + return 0 + else: + return 1 + + def build(self,n_nodes,n_classes=2): + df = pd.read_csv(self.file) + df.dropna(axis=0, inplace=True) + df['Label'] = df['Label'].apply(self._to_binary_classification) + df = df.groupby(['Label']).apply(lambda x: x.sample(int(n_nodes/n_classes))).reset_index(drop=True) + df = df.sample(n=n_nodes).reset_index(drop=True) + data = df.to_numpy() + adj = self._filling_adjacency_numpy(data,1,3) + labels = df['Label'].to_numpy() + columns_to_exclude = ['Flow ID', 'Src IP','Src Port', 'Dst IP','Dst Port', 'Timestamp','Label','Label.1','Protocol','Flow Duration'] + df.drop(columns_to_exclude, axis=1, inplace=True) + features = df.to_numpy() + 
self.save_samples(adj,features,labels) + print('features:',features.shape) + return adj,features,labels + + def _filling_adjacency_numpy(self,data,source_ip_index, destination_ip_index): + N = data.shape[0] + try: + adjacency = np.zeros((N,N), dtype=bool) + except Exception as e: + print(f"An error occurred: {e}") + source_ips = data[:, source_ip_index] + destination_ips = data[:, destination_ip_index] + mask = ((source_ips[:, np.newaxis] == source_ips) | (source_ips[:, np.newaxis] == destination_ips) | (destination_ips[:, np.newaxis] == source_ips) | (destination_ips[:, np.newaxis] == destination_ips)) + adjacency[mask] = True + return adjacency + +class NF_BOT_IoT_v2(NetFlowDataset): + def __init__(self): + super().__init__( + features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','BOT_IOT','features.pkl'), + adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','BOT_IOT','adjacency.pkl'), + labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','BOT_IOT','labels.pkl'), + file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','BOT_IOT','original','bot_iot.csv') + ) + +class NF_TON_IoT_v2(NetFlowDataset): + def __init__(self): + # directory=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','TON_IOT','original'), + super().__init__( + features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','TON_IOT','features.pkl'), + adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','TON_IOT','adjacency.pkl'), + labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','TON_IOT','labels.pkl'), + file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','TON_IOT','original','ton_iot.csv') + ) + +class AWID3(Dataset): + def __init__(self): + super().__init__( + features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','AWID3','features.pkl'), + adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','AWID3','adjacency.pkl'), + labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'outlier','AWID3','labels.pkl'), + directory=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples', 'AWID3','original') + ) + + def _config_signal(self,x): + words = str(x).split('-') + return np.mean([float(i)*-1 for i in words if i!='']) + + def build(self,n_nodes): + outlier_percentage = .1 + path = os.path.join(os.getcwd(),'Ghypeddings','datasets','examples','AWID3','original','awid3.csv') + df = pd.read_csv(path) + df['Label'] = df['Label'].apply(lambda x: 0 if 'Normal' in x else 1) + df = df.groupby(['Label']).apply(lambda x: x.sample(int(n_nodes*(1-outlier_percentage))) if pd.unique(x['Label'])[0] == 0 else x.sample(int(n_nodes*outlier_percentage)) ).reset_index(drop=True) + print(df['Label'].value_counts()) + df = df.sample(frac=1).reset_index(drop=True) + data=df[['ip.src','ip.dst']] + df.dropna(axis=1, inplace=True) + to_drop = ['frame.number','frame.time','radiotap.timestamp.ts','frame.time_delta_displayed','frame.time_epoch','frame.time_relative','wlan.duration','wlan.ra'] + df.drop(columns=to_drop,axis=1,inplace=True) + alone = [] + for c in df.columns: + if(len(df[c].unique()) == 1): + alone.append(c) + elif len(df[c].unique()) == 2: + df = pd.get_dummies(df,columns=[c],drop_first=True) + elif len(df[c].unique()) <=8: + df = 
pd.get_dummies(df,columns=[c]) + elif len(df[c].unique()) <=15: + labels = df['Label'] + df.drop(columns=['Label'],axis=1,inplace=True) + encoder = ce.TargetEncoder(cols=[c]) + encoder.fit(df,labels) + df = encoder.transform(df) + df['Label']=labels + else: + if(df[c].dtype == 'object' and c!='radiotap.dbm_antsignal'): + print(c,df[c].unique(),len(df[c].unique())) + df.drop(columns=alone,axis=1,inplace=True) + df['radiotap.dbm_antsignal'] = df['radiotap.dbm_antsignal'].apply(self._config_signal) + labels = df['Label_1'].to_numpy() + adj = self._filling_adjacency_numpy(data) + df.drop(columns=['frame.time_delta','Label_1'],axis=1,inplace=True) + features = df.to_numpy() + scaler = StandardScaler() + features = scaler.fit_transform(features) + # scaler = MinMaxScaler() + # features = scaler.fit_transform(features) + self.save_samples(adj=adj,features=features,labels=labels) + return adj,features,labels + + def _filling_adjacency_numpy(self,data): + N = data.shape[0] + try: + adjacency = np.zeros((N,N), dtype=bool) + except Exception as e: + print(f"An error occurred: {e}") + source_ips = data['ip.src'].to_numpy() + destination_ips = data['ip.dst'].to_numpy() + mask = ((source_ips[:, np.newaxis] == source_ips) | (source_ips[:, np.newaxis] == destination_ips) | (destination_ips[:, np.newaxis] == source_ips) | (destination_ips[:, np.newaxis] == destination_ips) ) + adjacency[mask] = True + np.fill_diagonal(adjacency, True) + return adjacency \ No newline at end of file diff --git a/datasets/test_dataset.py b/datasets/repetition_datasets.py similarity index 90% rename from datasets/test_dataset.py rename to datasets/repetition_datasets.py index 97c72e89efdca8e20b5d0f25696146b88096ced5..14b125651b8695dbe55af39c24aa011ca5ec4cf6 100644 --- a/datasets/test_dataset.py +++ b/datasets/repetition_datasets.py @@ -8,7 +8,6 @@ import pickle from sklearn.preprocessing import LabelEncoder import time import datetime -import progressbar import category_encoders as ce class Dataset: @@ -21,10 +20,15 @@ class Dataset: def _get_files(self): return [os.path.join(self.directory,file) for file in os.listdir(self.directory) if os.path.isfile(os.path.join(self.directory, file)) and '.gitignore' not in file] - def save_samples(self,adj,features,labels,dim): - features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'features_{dim}.pkl') - adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'adjacency_{dim}.pkl') - labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'labels_{dim}.pkl') + def save_samples(self,adj,features,labels,repetition): + # features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'features_{repetition}.pkl') + # adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'adjacency_{repetition}.pkl') + # labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'labels_{repetition}.pkl') + + features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'features_{repetition}.pkl') + adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'adjacency_{repetition}.pkl') + labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019',f'labels_{repetition}.pkl') + with open(adj_path,'wb') as f: pickle.dump(adj,f) @@ -33,10 +37,10 @@ class Dataset: with open(labels_path,'wb') as 
f: pickle.dump(labels,f) - def load_samples(self,dim): - features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','features_{}.pkl'.format(dim)) - adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','adjacency_{}.pkl'.format(dim)) - labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','labels_{}.pkl'.format(dim)) + def load_samples(self,repetition): + features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','TON_IOT',f'features_{repetition}.pkl') + adj_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','TON_IOT',f'adjacency_{repetition}.pkl') + labels_path= os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','TON_IOT',f'labels_{repetition}.pkl') with open(adj_path,'rb') as f: adj = pickle.load(f) with open(features_path,'rb') as f: @@ -52,7 +56,7 @@ class CIC_DDoS2019(Dataset): directory=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CICDDoS2019','original') ) - def build(self,n_nodes,n_classes=2,dim=20): + def build(self,n_nodes,n_classes=2,repetition=1): df = self._create_file_bc(n_nodes,n_classes) for column in df.columns: max_value = df.loc[df[column] != np.inf, column].max() @@ -66,7 +70,7 @@ class CIC_DDoS2019(Dataset): features = df.to_numpy() scaler = MinMaxScaler() features = scaler.fit_transform(features) - self.save_samples(adj,features,labels,dim) + self.save_samples(adj,features,labels,repetition) return adj, features, labels def _load_file(self,path,max_per_class,list_classes=[]): @@ -114,8 +118,8 @@ class NetFlowDataset(Dataset): super().__init__(features_path,adj_path,labels_path) self.file = file - def build(self,n_nodes,n_classes=2): - df = pd.read_csv(self.file) + def build(self,n_nodes,n_classes=2,repetition=1): + df = pd.read_csv(self.file) df = df.groupby(['Label']).apply(lambda x: x.sample(int(n_nodes/n_classes))).reset_index(drop=True) df = df.sample(frac=1).reset_index(drop=True) adj = self._filling_adjacency_numpy(df) @@ -132,7 +136,7 @@ class NetFlowDataset(Dataset): scaler = MinMaxScaler() features = scaler.fit_transform(features) print("features:",features.shape) - self.save_samples(adj,features,labels) + self.save_samples(adj,features,labels,repetition) return adj,features,labels def _filling_adjacency_numpy(self,data): @@ -156,7 +160,7 @@ class NetFlowDataset(Dataset): adjacency[mask] = True return adjacency -class CIC_IDS2018(NetFlowDataset): +class NF_CIC_IDS2018_v2(NetFlowDataset): def __init__(self): super().__init__( features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CIC_IDS2018','features.pkl'), @@ -165,7 +169,7 @@ class CIC_IDS2018(NetFlowDataset): file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','CIC_IDS2018','original','cic_ids2018.csv') ) -class UNSW_NB15(NetFlowDataset): +class NF_UNSW_NB15_v2(NetFlowDataset): def __init__(self): super().__init__( features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','UNSW_NB15','features.pkl'), @@ -189,7 +193,7 @@ class Darknet(Dataset): else: return 1 - def build(self,n_nodes,n_classes=2): + def build(self,n_nodes,n_classes=2,repetition=1): df = pd.read_csv(self.file) df.dropna(axis=0, inplace=True) df['Label'] = df['Label'].apply(self._to_binary_classification) @@ -201,7 +205,7 @@ class Darknet(Dataset): columns_to_exclude = ['Flow ID', 'Src IP','Src Port', 'Dst IP','Dst Port', 'Timestamp','Label','Label.1','Protocol','Flow Duration'] 
df.drop(columns_to_exclude, axis=1, inplace=True) features = df.to_numpy() - self.save_samples(adj,features,labels) + self.save_samples(adj,features,labels,repetition) print('features:',features.shape) return adj,features,labels @@ -217,7 +221,7 @@ class Darknet(Dataset): adjacency[mask] = True return adjacency -class BOT_IoT(NetFlowDataset): +class NF_BOT_IoT_v2(NetFlowDataset): def __init__(self): super().__init__( features_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','BOT_IOT','features.pkl'), @@ -226,7 +230,7 @@ class BOT_IoT(NetFlowDataset): file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','BOT_IOT','original','bot_iot.csv') ) -class TON_IoT(NetFlowDataset): +class NF_TON_IoT_v2(NetFlowDataset): def __init__(self): # directory=os.path.join(os.path.dirname(os.path.abspath(__file__)),'examples','TON_IOT','original'), super().__init__( @@ -249,7 +253,7 @@ class AWID3(Dataset): words = str(x).split('-') return np.mean([float(i)*-1 for i in words if i!='']) - def build(self,n_nodes): + def build(self,n_nodes,repetition): path = os.path.join(os.getcwd(),'Ghypeddings','datasets','examples','AWID3','original','awid3.csv') df = pd.read_csv(path) df['Label'] = df['Label'].apply(lambda x: 0 if 'Normal' in x else 1) @@ -287,7 +291,7 @@ class AWID3(Dataset): features = scaler.fit_transform(features) # scaler = MinMaxScaler() # features = scaler.fit_transform(features) - self.save_samples(adj=adj,features=features,labels=labels) + self.save_samples(adj=adj,features=features,labels=labels,repetition=repetition) return adj,features,labels def _filling_adjacency_numpy(self,data): diff --git a/datasets/utils.py b/datasets/utils.py index cab0b352d0ee12f4fbb580e29583e2e676adfb81..a65a154f3cda8bc37e70344a727c1e8c690b6904 100644 --- a/datasets/utils.py +++ b/datasets/utils.py @@ -11,7 +11,7 @@ def hyperbolicity(adj, num_samples): curr_time = time.time() hyps = [] G = nx.from_numpy_array(adj) - for i in tqdm(range(num_samples)): + for _ in tqdm(range(num_samples)): node_tuple = np.random.choice(G.nodes(), 4, replace=False) s = [] try: @@ -28,5 +28,5 @@ def hyperbolicity(adj, num_samples): hyps.append((s[-1] - s[-2]) / 2) except Exception as e: continue - print('Time for hyp: ', time.time() - curr_time) + print('Time for hyp: ', time.time() - curr_time , 'hyp:', max(hyps)) return max(hyps)
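Note on the hyperbolicity estimate touched in datasets/utils.py above: it samples node 4-tuples and applies the Gromov four-point condition, keeping half the gap between the two largest pairwise-distance sums and returning the maximum over samples. A minimal standalone sketch of that computation follows (illustrative only; the function and variable names here are assumptions, not part of the library):

import numpy as np
import networkx as nx

def sampled_hyperbolicity(G, num_samples=1000):
    """Estimate the Gromov delta-hyperbolicity of graph G by sampling node 4-tuples."""
    nodes = list(G.nodes())
    hyps = []
    for _ in range(num_samples):
        a, b, c, d = np.random.choice(nodes, 4, replace=False)
        try:
            # Pairwise shortest-path distances within the sampled 4-tuple.
            d_ab = nx.shortest_path_length(G, a, b)
            d_cd = nx.shortest_path_length(G, c, d)
            d_ac = nx.shortest_path_length(G, a, c)
            d_bd = nx.shortest_path_length(G, b, d)
            d_ad = nx.shortest_path_length(G, a, d)
            d_bc = nx.shortest_path_length(G, b, c)
        except nx.NetworkXNoPath:
            continue  # skip 4-tuples that are not mutually reachable
        # Four-point condition: half the gap between the two largest distance sums.
        s = sorted([d_ab + d_cd, d_ac + d_bd, d_ad + d_bc])
        hyps.append((s[-1] - s[-2]) / 2)
    return max(hyps) if hyps else 0.0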