Commit 0ca7ed7f authored by yacinetouahria

final push

parent 989d2e17
%% Cell type:code id:8620fcf3 tags:
``` python
from Ghypeddings import *
import pandas as pd
import numpy as np
import pickle
```
%% Cell type:code id:f9f909f0 tags:
``` python
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
```
%% Cell type:code id:2ffdf671 tags:
``` python
data = np.genfromtxt('hgcae_unsw_nb_5_embeddings_euc.csv', delimiter=',')
X = data[:,:-1]
y = data[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
```
%% Cell type:code id:0024ea51 tags:
``` python
def best_score_params(estimator, params):
    grid_search = GridSearchCV(estimator=estimator, param_grid=params, cv=5)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print(accuracy, f1, recall, precision, roc_auc)
    cv_results = grid_search.cv_results_
    df = pd.DataFrame(cv_results)
    return df
```
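%% Cell type:markdown id:1a2b3c4d tags:
A minimal usage sketch (not in the original notebook): grid-searching a random-forest classifier with the helper above. The estimator and parameter grid here are illustrative assumptions, not part of the original experiments.
%% Cell type:code id:5e6f7a8b tags:
``` python
# Hypothetical example: any sklearn estimator and matching grid would work here.
from sklearn.ensemble import RandomForestClassifier

param_grid = {'n_estimators': [50, 100], 'max_depth': [5, 10]}
cv_df = best_score_params(RandomForestClassifier(random_state=0), param_grid)
cv_df.head()
```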
%% Cell type:code id:d1333b28 tags:
``` python
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
def calculate_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_pred)
    return acc, f1, rec, pre, roc
```
%% Cell type:code id:efe4163c tags:
``` python
from sklearn.cluster import KMeans
def kmeans(X, y, n_clusters, outlier_percentage=.1):
    model = KMeans(n_clusters=n_clusters)
    model.fit(X)
    y_pred = model.predict(X)
    # distance of each point to its nearest centroid
    distances = model.transform(X).min(axis=1)
    # flag the farthest `outlier_percentage` of points as outliers
    threshold = np.percentile(distances, 100 * (1 - outlier_percentage))
    outliers = distances > threshold
    return calculate_metrics(y, outliers)

print('2:')
print(kmeans(X, y, 2))
print('3:')
print(kmeans(X, y, 3))
print('5:')
print(kmeans(X, y, 5))
print('10:')
print(kmeans(X, y, 10))
print('20:')
print(kmeans(X, y, 20))
print('40:')
print(kmeans(X, y, 40))
print('50:')
print(kmeans(X, y, 50))
```
%% Output
2:
(0.9492, 0.746, 0.746, 0.746, 0.8588888888888888)
3:
(0.9288, 0.644, 0.644, 0.644, 0.8022222222222223)
5:
(0.9548, 0.774, 0.774, 0.774, 0.8744444444444445)
10:
(0.932, 0.66, 0.66, 0.66, 0.8111111111111111)
20:
(0.9428, 0.714, 0.714, 0.714, 0.841111111111111)
40:
(0.9448, 0.724, 0.724, 0.724, 0.8466666666666667)
50:
(0.9412, 0.706, 0.706, 0.706, 0.8366666666666667)
%% Cell type:markdown id:170e9b21 tags:
# The rest
%% Cell type:code id:da99d4c2 tags:
``` python
from sklearn.cluster import DBSCAN
def dbscan(X, y, min_samples=5):
    model = DBSCAN(eps=0.1, min_samples=min_samples)
    labels = model.fit_predict(X)
    # DBSCAN labels noise points as -1; treat them as outliers
    outliers = labels == -1
    return calculate_metrics(y, outliers)

print('5:')
print(dbscan(X, y, min_samples=5))
print('10:')
print(dbscan(X, y, min_samples=10))
print('20:')
print(dbscan(X, y, min_samples=20))
print('50:')
print(dbscan(X, y, min_samples=50))
print('100:')
print(dbscan(X, y, min_samples=100))
```
%% Output
5:
(0.8908, 0.024999999999999998, 0.014, 0.11666666666666667, 0.5011111111111112)
10:
(0.8926, 0.1408, 0.088, 0.352, 0.535)
20:
(0.8868, 0.23097826086956522, 0.17, 0.3601694915254237, 0.5682222222222222)
50:
(0.8664, 0.2707423580786026, 0.248, 0.2980769230769231, 0.5915555555555555)
100:
(0.789, 0.23606082548877624, 0.326, 0.18501702610669693, 0.5832222222222222)
%% Cell type:code id:54cd340d tags:
``` python
from sklearn.ensemble import IsolationForest
def isolation_forest(X, y, anomalies_percentage=0.1):
    model = IsolationForest(contamination=anomalies_percentage)
    model.fit(X)
    y_pred = model.predict(X)
    # map IsolationForest's output (1 = inlier, -1 = outlier) to 0/1 labels
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    return calculate_metrics(y, y_pred)

isolation_forest(X, y)
```
%% Output
(0.8586, 0.2922922922922923, 0.292, 0.2925851703406814, 0.6067777777777777)
%% Cell type:code id:f63108dd tags:
``` python
from sklearn.cluster import KMeans
def kmeans(X, y, n_clusters, outlier_percentage=.1):
    model = KMeans(n_clusters=n_clusters)
    model.fit(X)
    y_pred = model.predict(X)
    # distance of each point to its nearest centroid
    distances = model.transform(X).min(axis=1)
    # flag the farthest `outlier_percentage` of points as outliers
    threshold = np.percentile(distances, 100 * (1 - outlier_percentage))
    outliers = distances > threshold
    return calculate_metrics(y, outliers)

print('2:')
print(kmeans(X, y, 2))
print('3:')
print(kmeans(X, y, 3))
print('5:')
print(kmeans(X, y, 5))
print('10:')
print(kmeans(X, y, 10))
print('20:')
print(kmeans(X, y, 20))
print('40:')
print(kmeans(X, y, 40))
print('50:')
print(kmeans(X, y, 50))
```
%% Output
2:
(0.8486, 0.24072216649949846, 0.24, 0.2414486921529175, 0.5781111111111111)
3:
(0.8476, 0.23647294589178355, 0.236, 0.23694779116465864, 0.5757777777777777)
5:
(0.8516, 0.258, 0.258, 0.258, 0.5877777777777777)
10:
(0.8248, 0.124, 0.124, 0.124, 0.5133333333333334)
20:
(0.8352, 0.176, 0.176, 0.176, 0.5422222222222222)
40:
(0.84, 0.20000000000000004, 0.2, 0.2, 0.5555555555555555)
50:
(0.842, 0.20999999999999996, 0.21, 0.21, 0.5611111111111111)
%% Cell type:code id:7abc7f6e tags:
``` python
from sklearn.neighbors import LocalOutlierFactor
def local_outlier_factor(X, y, n_neighbors=20, outlier_percentage=.1):
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=outlier_percentage)
    y_pred = lof.fit_predict(X)
    # map LOF's output (1 = inlier, -1 = outlier) to 0/1 labels
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    return calculate_metrics(y, y_pred)

print('5:')
print(local_outlier_factor(X, y, n_neighbors=5))
print('10:')
print(local_outlier_factor(X, y, n_neighbors=10))
print('15:')
print(local_outlier_factor(X, y, n_neighbors=15))
print('20:')
print(local_outlier_factor(X, y, n_neighbors=20))
print('30:')
print(local_outlier_factor(X, y, n_neighbors=30))
print('50:')
print(local_outlier_factor(X, y, n_neighbors=50))
```
%% Output
5:
(0.8272, 0.136, 0.136, 0.136, 0.52)
10:
(0.8272, 0.136, 0.136, 0.136, 0.52)
15:
(0.8272, 0.136, 0.136, 0.136, 0.52)
20
(0.83, 0.15, 0.15, 0.15, 0.5277777777777778)
30
(0.83, 0.15, 0.15, 0.15, 0.5277777777777778)
50
(0.8572, 0.286, 0.286, 0.286, 0.6033333333333333)
%% Cell type:code id:40d8f286 tags:
``` python
from sklearn.svm import OneClassSVM
def one_class_svm(X, y, kernel='rbf', nu=0.1):
    model = OneClassSVM(kernel=kernel, nu=nu)
    model.fit(X)
    y_pred = model.predict(X)
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    return calculate_metrics(y, y_pred)

print('rbf:')
print(one_class_svm(X, y, kernel='rbf'))
print('linear:')
print(one_class_svm(X, y, kernel='linear'))
print('poly:')
print(one_class_svm(X, y, kernel='poly'))
print('sigmoid:')
print(one_class_svm(X, y, kernel='sigmoid'))
```
%% Output
rbf:
(0.7028, 0.23638232271325796, 0.46, 0.1590594744121715, 0.594888888888889)
linear:
(0.1028, 0.17868912486268765, 0.976, 0.09834744054816606, 0.4908888888888889)
poly:
(0.1, 0.18181818181818182, 1.0, 0.1, 0.5)
sigmoid:
(0.424, 0.17383820998278832, 0.606, 0.10147354320160751, 0.5048888888888889)
%% Cell type:code id:bf96a516 tags:
``` python
import numpy as np
import os
import pandas as pd
```
%% Cell type:code id:3399745c tags:
``` python
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
def calculate_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_pred)
    return [round(acc, 4), round(f1, 4), round(rec, 4), round(pre, 4), round(roc, 4)]
```
%% Cell type:code id:f2790afd tags:
``` python
def group_clusters(y_true, y_pred):
    # count how often each (true label, cluster id) pair occurs
    occurrences = {}
    for k, v in zip(y_true, y_pred):
        pair = (int(k), v)
        occurrences[pair] = occurrences.get(pair, 0) + 1
    # visit pairs from most to least frequent and assign each cluster
    # to the true class it co-occurs with most
    a = sorted(occurrences.items(), key=lambda item: item[1], reverse=True)
    normal, attack = [], []
    for item in a:
        if item[0][1] not in normal and item[0][1] not in attack:
            if item[0][0] == 0:
                normal.append(item[0][1])
            else:
                attack.append(item[0][1])
    # build the binary labels from masks computed up front, so values
    # already mapped to 0/1 cannot be accidentally remapped by a later pass
    out = np.zeros_like(y_pred)
    for j in attack:
        out[y_pred == j] = 1
    return out
```
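%% Cell type:markdown id:9c0d1e2f tags:
A toy check of `group_clusters` (not in the original notebook): clusters 0 and 2 co-occur with attacks and cluster 1 with normal traffic, so the relabeling should map those cluster ids to 1, 0, and 1 respectively.
%% Cell type:code id:3f4e5d6c tags:
``` python
# Illustrative toy arrays, not drawn from the real datasets.
y_true_toy = np.array([1, 1, 0, 0, 1, 1])
y_pred_toy = np.array([0, 0, 1, 1, 2, 2])  # raw cluster ids
print(group_clusters(y_true_toy, y_pred_toy))  # expected: [1 1 0 0 1 1]
```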
%% Cell type:code id:49759d76 tags:
``` python
from sklearn.cluster import AgglomerativeClustering
def agglomerative_clustering(X, y, n_clusters=100):
    model = AgglomerativeClustering(n_clusters=n_clusters)
    labels = model.fit_predict(X)
    labels = group_clusters(y, labels)
    return calculate_metrics(y, labels)
```
%% Cell type:code id:b4bbd3f7 tags:
``` python
from sklearn.cluster import DBSCAN
def dbscan(X, y, eps=1e-2, min_samples=20):
    model = DBSCAN(eps=eps, min_samples=min_samples)
    y_pred = model.fit_predict(X)
    # drop noise points (-1) before matching clusters to classes
    mask = y_pred != -1
    y_true_filtered = y[mask]
    y_pred_filtered = y_pred[mask]
    y_pred_filtered = group_clusters(y_true_filtered, y_pred_filtered)
    return calculate_metrics(y_true_filtered, y_pred_filtered)
```
%% Cell type:code id:05d6a66a tags:
``` python
import skfuzzy as fuzz
def fuzzy_c_mean(X, y, n_clusters=10, power=2, error=0.01, maxiter=1000, init=None):
    # skfuzzy expects the data as (features, samples)
    X_transposed = np.transpose(X)
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(X_transposed, n_clusters, power, error=error, maxiter=maxiter, init=init)
    # hard-assign each sample to its highest-membership cluster
    y_pred = np.argmax(u, axis=0)
    y_pred = group_clusters(y, y_pred)
    return calculate_metrics(y, y_pred)
```
%% Cell type:code id:0c26e329 tags:
``` python
from sklearn.mixture import GaussianMixture
def gaussian_mixture(X, y, n_components=20):
    model = GaussianMixture(n_components=n_components)
    y_pred = model.fit_predict(X)
    y_pred = group_clusters(y, y_pred)
    return calculate_metrics(y, y_pred)
```
%% Cell type:code id:f4226733 tags:
``` python
from sklearn.cluster import KMeans
def kmeans(X, y, n_clusters=10, n_init=10):
    model = KMeans(n_clusters=n_clusters, n_init=n_init)
    model.fit(X)
    y_pred = model.labels_
    y_pred = group_clusters(y, y_pred)
    return calculate_metrics(y, y_pred)
```
%% Cell type:code id:6a07e67b tags:
``` python
from sklearn.cluster import MeanShift
def mean_shift(X, y):
    y_pred = MeanShift(n_jobs=-1, max_iter=10).fit_predict(X)
    y_pred = group_clusters(y, y_pred)
    return calculate_metrics(y, y_pred)
```
%% Cell type:code id:23bac324 tags:
``` python
datasets = ['awid3','bot_iot','ddos2019','darknet','ids2018','ton_iot','unsw_nb15']
models = ['hgcae','pvae']
dim = 20
```
%% Cell type:code id:15cd3d01 tags:
``` python
for model in models:
    for dataset in datasets:
        print(dataset, "--------------------------")
        file = os.path.join(os.getcwd(), 'embeddings', f'{model}_{dataset}_{dim}_embeddings_euc.csv')
        data = np.genfromtxt(file, delimiter=",", usemask=True)
        X = data[:, :-1]
        y = data[:, -1]
        results = []
        results.append(agglomerative_clustering(X, y))
        print('agglomerative', results[-1])
        results.append(dbscan(X, y))
        print('dbscan', results[-1])
        results.append(fuzzy_c_mean(X, y))
        print('fuzzy c mean', results[-1])
        results.append(gaussian_mixture(X, y))
        print('gaussian', results[-1])
        results.append(kmeans(X, y))
        print('kmeans', results[-1])
        results.append(mean_shift(X, y))
        print('mean shift', results[-1])
        df = pd.DataFrame(np.array(results))
        df.to_csv(f'{model}_{dataset}.csv', index=False)
```
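%% Cell type:markdown id:7b8a9c0d tags:
A small follow-up sketch (not in the original notebook): gather the per-run CSV files written above into one labeled summary table. The method names follow the append order in the loop; the metric names follow `calculate_metrics`.
%% Cell type:code id:2d3c4b5a tags:
``` python
# Assumes the result files produced by the loop above are on disk.
methods = ['agglomerative', 'dbscan', 'fuzzy_c_mean', 'gaussian_mixture', 'kmeans', 'mean_shift']
metrics = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc']
frames = []
for model in models:
    for dataset in datasets:
        res = pd.read_csv(f'{model}_{dataset}.csv')
        res.columns = metrics
        res.index = methods
        res['model'], res['dataset'] = model, dataset
        frames.append(res)
summary = pd.concat(frames)
summary
```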
%% Cell type:code id:ab2d406e tags:
``` python
dataset = 'ton_iot'
model = 'hgcae'
dim = 20
file = os.path.join(os.getcwd(), 'embeddings', f'{model}_{dataset}_{dim}_embeddings_euc.csv')
data = np.genfromtxt(file, delimiter=",", usemask=True)
X = data[:, :-1]
y = data[:, -1]
results = []
# c = [60, 70, 80]
# for i in c:
#     results.append(agglomerative_clustering(X, y, n_clusters=i))
#     print('agglomerative', results[-1])
# results.append(dbscan(X, y))
# print('dbscan', results[-1])
# c = [2, 5, 10, 20, 50, 70, 100, 150, 200]
# for i in c:
#     results.append(fuzzy_c_mean(X, y, n_clusters=i))
#     print('fuzzy c mean', results[-1])
# c = [2, 5, 10, 20, 50, 100, 200]
# for i in c:
#     results.append(gaussian_mixture(X, y, n_components=i))
#     print('gaussian', results[-1])
# results.append(kmeans(X, y))
# print('kmeans', results[-1])
# results.append(mean_shift(X, y))
# print('mean shift', results[-1])
%% Output
dbscan [0.4744, 0.6435, 1.0, 0.4744, 0.5]
# NOTE: this file computes the hyperbolicity of all the datasets
import numpy as np
from Ghypeddings.datasets.utils import hyperbolicity
from Ghypeddings.datasets.datasets import CIC_DDoS2019, AWID3, NF_CIC_IDS2018_v2, Darknet, NF_TON_IoT_v2, NF_BOT_IoT_v2, NF_UNSW_NB15_v2

datasets = [['ddos2019', CIC_DDoS2019],
            ['awid3', AWID3],
            ['ids2018', NF_CIC_IDS2018_v2],
            ['darknet', Darknet],
            ['ton_iot', NF_TON_IoT_v2],
            ['bot_iot', NF_BOT_IoT_v2],
            ['unsw_nb', NF_UNSW_NB15_v2]]

# average the hyperbolicity estimate over five sampled snapshots per dataset
for dataset in datasets:
    h_mean = []
    for i in range(5):
        adj, _, _ = dataset[1]().load_samples(repetition=0)
        h = hyperbolicity(adj, num_samples=10)
        h_mean.append(h)
    print(dataset[0], np.mean(h_mean))
# NOTE: place this .py file inside the folder containing all the files of a single dataset and run it
# to concatenate those files into one. This helps with sampling representative data.
# This is the first of the OTHER scripts to run after downloading the datasets.
import os
import pandas as pd

def run():
    directory = os.getcwd()
    files = [f for f in os.listdir(directory) if os.path.isfile(f) and '.py' not in f]
    frames = []
    for file in files:
        df = pd.read_csv(file, low_memory=False)
        frames.append(df)
    data = pd.concat(frames)
    # change the name of the dataset here
    path = os.path.join(directory, 'ton_iot.csv')
    data.to_csv(path, index=False)

if __name__ == "__main__":
    run()
In order to generate representative snapshots we had to reorganize the dataset files.
Execute the scripts in the following order (a driver sketch follows this list):
1- group attack: groups all the attack classes into a single file
2- group normal attack: merges the attack file with the normal file
3- the snapshot script corresponding to your dataset (one of the three provided)
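A minimal driver sketch of that order; the script filenames below are assumptions based on the descriptions above, so substitute the repository's actual names:

``` python
# Hypothetical driver: runs the three stages in sequence.
# The filenames are assumptions, not the repository's confirmed names.
import subprocess

for script in ['group_attack.py', 'group_normal_attack.py', 'snapshots_ton_iot.py']:
    subprocess.run(['python', script], check=True)
```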
# NOTE: This file generates the snapshots from the darknet dataset. It does everything from cleaning through data splitting.
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pickle

def _to_binary_classification(x):
    if 'Non' in x:
        return 0
    else:
        return 1

def _filling_adjacency_numpy(data):
    # connect two flows whenever they share an endpoint IP (source or destination)
    N = data.shape[0]
    try:
        adjacency = np.zeros((N, N), dtype=bool)
    except Exception as e:
        print(f"An error occurred: {e}")
    source_ips = data['Src IP'].to_numpy()
    destination_ips = data['Dst IP'].to_numpy()
    mask = ((source_ips[:, np.newaxis] == source_ips) |
            (source_ips[:, np.newaxis] == destination_ips) |
            (destination_ips[:, np.newaxis] == source_ips) |
            (destination_ips[:, np.newaxis] == destination_ips))
    adjacency[mask] = True
    return adjacency

def save_samples(adj, features, labels, adj_path, features_path, labels_path):
    with open(adj_path, 'wb') as f:
        pickle.dump(adj, f)
    with open(features_path, 'wb') as f:
        pickle.dump(features, f)
    with open(labels_path, 'wb') as f:
        pickle.dump(labels, f)

nnodes = 1000
overlap = 0.25
directory = os.path.join(os.getcwd(), f'darknet_snapshots_{int(overlap*100)}_{nnodes}')
os.makedirs(directory)

df = pd.read_csv('other/darknet/all.csv')
df.dropna(axis=0, inplace=True)
df = df.reset_index(drop=True)
df['Label'] = df['Label'].apply(_to_binary_classification)
columns_to_exclude = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp', 'Label', 'Label.1', 'Protocol', 'Flow Duration']
columns_to_normalize = [x for x in df.columns if x not in columns_to_exclude]
scaler = MinMaxScaler()
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

i = 0
j = 0
while i < df.shape[0]:
    print('snapshot:', j)
    if df.shape[0] > nnodes:
        data = df.iloc[:nnodes, :].copy()
        adj = _filling_adjacency_numpy(data)
        labels = data['Label'].to_numpy()
        data.drop(columns_to_exclude, axis=1, inplace=True)
        features = data.to_numpy()
        save_samples(adj, features, labels, os.path.join(directory, f'adjacency_{j}.pkl'), os.path.join(directory, f'features_{j}.pkl'), os.path.join(directory, f'labels_{j}.pkl'))
        j += 1
        i += nnodes
        # drop all but the trailing `overlap` fraction so consecutive snapshots overlap
        df.drop(range(nnodes - int(nnodes*overlap)), axis=0, inplace=True)
        df = df.reset_index(drop=True)
        print(np.sum(labels), len(labels) - np.sum(labels))
    else:
        # fewer than nnodes rows remain: build one last snapshot from what is left
        data = df.copy()
        adj = _filling_adjacency_numpy(data)
        labels = data['Label'].to_numpy()
        data.drop(columns_to_exclude, axis=1, inplace=True)
        features = data.to_numpy()
        save_samples(adj, features, labels, os.path.join(directory, f'adjacency_{j}.pkl'), os.path.join(directory, f'features_{j}.pkl'), os.path.join(directory, f'labels_{j}.pkl'))
        j += 1
        i += df.shape[0]
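
# Hypothetical sanity check (an addition, not in the original script): reload the
# first saved snapshot and confirm that adjacency, features, and labels agree in size.
with open(os.path.join(directory, 'adjacency_0.pkl'), 'rb') as f:
    adj0 = pickle.load(f)
with open(os.path.join(directory, 'features_0.pkl'), 'rb') as f:
    features0 = pickle.load(f)
with open(os.path.join(directory, 'labels_0.pkl'), 'rb') as f:
    labels0 = pickle.load(f)
assert adj0.shape == (features0.shape[0], features0.shape[0])
assert labels0.shape[0] == features0.shape[0]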
# NOTE: This file generates the snapshots from the ddos2019 dataset. It does everything from cleaning through data splitting.
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pickle

def _filling_adjacency_numpy(data):
    # connect two flows whenever they share an endpoint IP (source or destination)
    N = data.shape[0]
    try:
        adjacency = np.zeros((N, N), dtype=bool)
    except Exception as e:
        print(f"An error occurred: {e}")
    source_ips = data[' Source IP'].to_numpy()
    destination_ips = data[' Destination IP'].to_numpy()
    mask = ((source_ips[:, np.newaxis] == source_ips) |
            (source_ips[:, np.newaxis] == destination_ips) |
            (destination_ips[:, np.newaxis] == source_ips) |
            (destination_ips[:, np.newaxis] == destination_ips))
    adjacency[mask] = True
    return adjacency

def save_samples(adj, features, labels, adj_path, features_path, labels_path):
    with open(adj_path, 'wb') as f:
        pickle.dump(adj, f)
    with open(features_path, 'wb') as f:
        pickle.dump(features, f)
    with open(labels_path, 'wb') as f:
        pickle.dump(labels, f)

nnodes = 1000
overlap = 0.25
directory = os.path.join(os.getcwd(), f'ddos2019_snapshots_{int(overlap*100)}_{nnodes}')
os.makedirs(directory)

df = pd.read_csv('other/ddos/all.csv')
df.dropna(axis=0, inplace=True)
df = df.reset_index(drop=True)
# clamp +/- infinities to the finite extremes of each column
for column in df.columns:
    max_value = df.loc[df[column] != np.inf, column].max()
    min_value = df.loc[df[column] != -np.inf, column].min()
    df.loc[df[column] == np.inf, column] = max_value
    df.loc[df[column] == -np.inf, column] = min_value
columns_to_exclude = ['Flow ID', ' Source IP', ' Source Port', ' Destination Port', ' Flow Duration', ' Protocol', ' Destination IP', ' Timestamp', 'SimillarHTTP', ' Inbound', ' Label']
columns_to_normalize = [x for x in df.columns if x not in columns_to_exclude]
scaler = MinMaxScaler()
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

i = 0
j = 0
while i < df.shape[0]:
    print('snapshot:', j)
    if df.shape[0] > nnodes:
        data = df.iloc[:nnodes, :].copy()
        adj = _filling_adjacency_numpy(data)
        labels = data[' Label'].to_numpy()
        data.drop(columns_to_exclude, axis=1, inplace=True)
        features = data.to_numpy()
        save_samples(adj, features, labels, os.path.join(directory, f'adjacency_{j}.pkl'), os.path.join(directory, f'features_{j}.pkl'), os.path.join(directory, f'labels_{j}.pkl'))
        j += 1
        i += nnodes
        # drop all but the trailing `overlap` fraction so consecutive snapshots overlap
        df.drop(range(nnodes - int(nnodes*overlap)), axis=0, inplace=True)
        df = df.reset_index(drop=True)
        print(np.sum(labels), len(labels) - np.sum(labels))
    else:
        # fewer than nnodes rows remain: build one last snapshot from what is left
        data = df.copy()
        adj = _filling_adjacency_numpy(data)
        labels = data[' Label'].to_numpy()
        data.drop(columns_to_exclude, axis=1, inplace=True)
        features = data.to_numpy()
        save_samples(adj, features, labels, os.path.join(directory, f'adjacency_{j}.pkl'), os.path.join(directory, f'features_{j}.pkl'), os.path.join(directory, f'labels_{j}.pkl'))
        j += 1
        i += df.shape[0]
# NOTE: here we only have the script for the TON_IoT dataset, but the other datasets follow the same logic.
# It starts by extracting the attack classes and the normal behaviour, each sorted by timestamp within its class,
# and then repeatedly takes randomly sized chunks of rows from each category in turn.
import pandas as pd
import os
import random

def _to_binary_classification(x):
    if 'Non' in x:
        return 0
    else:
        return 1

backdoor = pd.read_csv(os.path.join('ton_iot', 'ton_iot_backdoor.csv'))
ddos = pd.read_csv(os.path.join('ton_iot', 'ton_iot_ddos.csv'))
dos = pd.read_csv(os.path.join('ton_iot', 'ton_iot_dos.csv'))
mitm = pd.read_csv(os.path.join('ton_iot', 'ton_iot_mitm.csv'))
password = pd.read_csv(os.path.join('ton_iot', 'ton_iot_password.csv'))
ransomware = pd.read_csv(os.path.join('ton_iot', 'ton_iot_ransomware.csv'))
scanning = pd.read_csv(os.path.join('ton_iot', 'ton_iot_scanning.csv'))
xss = pd.read_csv(os.path.join('ton_iot', 'ton_iot_xss.csv'))
injection = pd.read_csv(os.path.join('ton_iot', 'ton_iot_injection.csv'))
print('backdoor', backdoor.shape[0])
print('ddos', ddos.shape[0])
print('dos', dos.shape[0])
print('mitm', mitm.shape[0])
print('password', password.shape[0])
print('ransomware', ransomware.shape[0])
print('scanning', scanning.shape[0])
print('xss', xss.shape[0])
print('injection', injection.shape[0])

# first pass: interleave random chunks (1 to 10 rows) of every attack category into one attacks file
m = (backdoor.shape[0] + ddos.shape[0] + dos.shape[0] + mitm.shape[0] + password.shape[0]
     + ransomware.shape[0] + scanning.shape[0] + xss.shape[0] + injection.shape[0])
categories = {'backdoor': backdoor, 'ddos': ddos, 'dos': dos, 'mitm': mitm, 'password': password,
              'ransomware': ransomware, 'scanning': scanning, 'xss': xss, 'injection': injection}
attacks = pd.DataFrame()
i = 0
while i < m:
    print(i, "/", m)
    for name in categories:
        frame = categories[name]
        if frame.shape[0] > 0:
            k = random.randint(1, 10)
            if frame.shape[0] > k:
                attacks = pd.concat([attacks, frame.iloc[:k, :]], axis=0)
                categories[name] = frame.drop(range(k), axis=0).reset_index(drop=True)
                i += k
            else:
                attacks = pd.concat([attacks, frame], axis=0)
                i += frame.shape[0]
                categories[name] = pd.DataFrame()
attacks.to_csv('ton_iot/attacks.csv', index=False)

# second pass: interleave random chunks (10 to 20 rows) of normal traffic with the attacks file built above
normal = pd.read_csv('ton_iot/normal.csv')
attack = pd.read_csv('ton_iot/attacks.csv')
m = normal.shape[0] + attack.shape[0]
combined = pd.DataFrame()
i = 0
while i < m:
    print(i, "-", m)
    if normal.shape[0] > 0:
        k = random.randint(10, 20)
        if normal.shape[0] > k:
            combined = pd.concat([combined, normal.iloc[:k, :]], axis=0)
            normal.drop(range(k), axis=0, inplace=True)
            normal = normal.reset_index(drop=True)
            i += k
        else:
            combined = pd.concat([combined, normal], axis=0)
            i += normal.shape[0]
            normal = pd.DataFrame()
    if attack.shape[0] > 0:
        k = random.randint(10, 20)
        if attack.shape[0] > k:
            combined = pd.concat([combined, attack.iloc[:k, :]], axis=0)
            attack.drop(range(k), axis=0, inplace=True)
            attack = attack.reset_index(drop=True)
            i += k
        else:
            combined = pd.concat([combined, attack], axis=0)
            i += attack.shape[0]
            attack = pd.DataFrame()
combined.to_csv('ton_iot/all.csv', index=False)
# NOTE: same logic as the group_attack.py file: it randomly selects rows from normal or attack while keeping the order inside each class
import pandas as pd
import os
import numpy as np

# file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Ghypeddings', 'datasets', 'examples', 'CICDDoS2019', 'original', 'all.csv')
file_normal = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Ghypeddings', 'datasets', 'examples', 'CICDDoS2019', 'original', 'normal.csv')
file_attack = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Ghypeddings', 'datasets', 'examples', 'CICDDoS2019', 'original', 'attack.csv')
for_snapshotting = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Ghypeddings', 'datasets', 'examples', 'CICDDoS2019', 'original', 'for_snapshotting.csv')

# The commented block below was used once to split all.csv into balanced,
# timestamp-sorted normal.csv and attack.csv files.
# df = pd.read_csv(file, low_memory=False)
# print('>> shape:', df.shape)
# df.dropna(axis=0, inplace=True)
# normal = df[df[' Label'] == 'BENIGN']
# print('>> normal shape:', normal.shape)
# normal.to_csv(file_normal, index=False)
# attack = df[df[' Label'] != 'BENIGN']
# print('>> attack shape:', attack.shape)
# smallest = min(attack.shape[0], normal.shape[0])
# if normal.shape[0] <= attack.shape[0]:
#     normal = normal.sort_values(by=' Timestamp')
#     normal.to_csv(file_normal)
#     attack = attack.sample(n=normal.shape[0]).reset_index(drop=True)
#     attack = attack.sort_values(by=' Timestamp')
#     attack.to_csv(file_attack)
# else:
#     attack = attack.sort_values(by=' Timestamp')
#     attack.to_csv(file_attack)
#     normal = normal.sample(n=attack.shape[0]).reset_index(drop=True)
#     normal = normal.sort_values(by=' Timestamp')
#     normal.to_csv(file_normal)

df = pd.DataFrame()
normal = pd.read_csv(file_normal, low_memory=False)
attack = pd.read_csv(file_attack, low_memory=False)
attack[' Label'] = 1
normal[' Label'] = 0
nnodes = normal.shape[0]  # normal and attack are balanced by the splitting step above
i, j = 0, 0
stop = False
while not stop:
    if i < nnodes:
        k = np.random.randint(1, 20)
        if i + k > nnodes:
            k = nnodes - i
        print('Normal: [{},{}]'.format(i, i + k))
        df = pd.concat([df, normal.iloc[i:i + k, :]], ignore_index=True)
        i += k
    if j < nnodes:
        k = np.random.randint(1, 20)
        if j + k > nnodes:
            k = nnodes - j
        print('Attack: [{},{}]'.format(j, j + k))
        df = pd.concat([df, attack.iloc[j:j + k, :]], ignore_index=True)
        j += k
    if i == j == nnodes:
        stop = True
df.to_csv(for_snapshotting, index=False)
# NOTE: This file generates the snapshots from the ton_iot dataset. It does everything from cleaning through data splitting.
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pickle

def _filling_adjacency_numpy(data):
    # connect two flows whenever they share an endpoint IP (source or destination)
    N = data.shape[0]
    try:
        adjacency = np.zeros((N, N), dtype=bool)
    except Exception as e:
        print(f"An error occurred: {e}")
    source_ips = data['Src IP'].to_numpy()
    destination_ips = data['Dst IP'].to_numpy()
    mask = ((source_ips[:, np.newaxis] == source_ips) |
            (source_ips[:, np.newaxis] == destination_ips) |
            (destination_ips[:, np.newaxis] == source_ips) |
            (destination_ips[:, np.newaxis] == destination_ips))
    adjacency[mask] = True
    return adjacency

def save_samples(adj, features, labels, adj_path, features_path, labels_path):
    with open(adj_path, 'wb') as f:
        pickle.dump(adj, f)
    with open(features_path, 'wb') as f:
        pickle.dump(features, f)
    with open(labels_path, 'wb') as f:
        pickle.dump(labels, f)

nnodes = 1000
overlap = 0.25
directory = os.path.join(os.getcwd(), f'ton_iot_snapshots_{int(overlap*100)}_{nnodes}')
os.makedirs(directory)

df = pd.read_csv('other/ton_iot/all.csv')
df.dropna(axis=0, inplace=True)
df = df.reset_index(drop=True)
columns_to_exclude = ['Src IP', 'Dst IP', 'Label']
columns_to_normalize = [x for x in df.columns if x not in columns_to_exclude]
scaler = MinMaxScaler()
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

i = 0
j = 0
while i < df.shape[0]:
    print('snapshot:', j)
    if df.shape[0] > nnodes:
        data = df.iloc[:nnodes, :].copy()
        adj = _filling_adjacency_numpy(data)
        labels = data['Label'].to_numpy()
        data.drop(columns_to_exclude, axis=1, inplace=True)
        features = data.to_numpy()
        save_samples(adj, features, labels, os.path.join(directory, f'adjacency_{j}.pkl'), os.path.join(directory, f'features_{j}.pkl'), os.path.join(directory, f'labels_{j}.pkl'))
        j += 1
        i += nnodes
        # drop all but the trailing `overlap` fraction so consecutive snapshots overlap
        df.drop(range(nnodes - int(nnodes*overlap)), axis=0, inplace=True)
        df = df.reset_index(drop=True)
        print(np.sum(labels), len(labels) - np.sum(labels))
    else:
        # fewer than nnodes rows remain: build one last snapshot from what is left
        data = df.copy()
        adj = _filling_adjacency_numpy(data)
        labels = data['Label'].to_numpy()
        data.drop(columns_to_exclude, axis=1, inplace=True)
        features = data.to_numpy()
        save_samples(adj, features, labels, os.path.join(directory, f'adjacency_{j}.pkl'), os.path.join(directory, f'features_{j}.pkl'), os.path.join(directory, f'labels_{j}.pkl'))
        j += 1
        i += df.shape[0]