# HADAPT.py
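"""Streaming anomaly detection over StreamSpot-style provenance graphs.

Edges are consumed as an interleaved stream across graphs, encoded as short
temporal/type strings, and hashed into fixed-size per-graph count vectors
(sketches). An Isolation Forest is fitted on the training sketches, a score
threshold is selected by maximising F1 on a held-out validation split, and the
test graphs are re-scored every 10000 streamed edges. Each checkpoint's AUC,
balanced accuracy, average precision, FPR and FNR are written to a CSV file.
"""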
import argparse
import csv
import random

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    average_precision_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
def universal_hash(x, a=123456789, b=987654321, p=2**31 - 1, m=4):
    """Map an integer x to a bucket index in [0, m) with a universal hash."""
    return (a * x + b) % p % m

def str_to_int(s):
    """Interpret a string's bytes as a little-endian integer."""
    return int.from_bytes(s.encode(), 'little')

def update_vector(v, s, hash_size, N=1):
    """Hash the first hash_size characters of s and increment that bucket of v."""
    idx = universal_hash(str_to_int(s[:hash_size]), m=N)
    v[idx] += 1
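
# Illustrative sketch update (the chunk value is made up for the example): with
# v = np.zeros(128), update_vector(v, "0abc", hash_size=4, N=128) hashes the
# 4-character chunk "0abc" into one of the 128 buckets of v and increments it.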
def load_data(data_path):
    """Read the tab-separated edge list into a DataFrame."""
    return pd.read_csv(data_path, sep='\t', header=None, names=['src_id', 'src_type', 'dst_id', 'dst_type', 'e_type', 'graph_id'])
def get_train_test_ids(train_id_file, test_id_file):
    """Return train/test graph ids plus the benign and anomalous id ranges."""
    # Graph ids in the anomalous ranges are treated as attack graphs; the rest are benign.
    benign_ranges = [(0, 299), (400, 599)]
    anomalous_ranges = [(300, 399), (600, 699)]
    benign_graph_ids = []
    anomalous_graph_ids = []
    for start, end in benign_ranges:
        benign_graph_ids.extend(range(start, end + 1))
    for start, end in anomalous_ranges:
        anomalous_graph_ids.extend(range(start, end + 1))
    train_graph_ids = pd.read_csv(train_id_file, header=None).iloc[:, 0].tolist()
    all_graph_ids = pd.read_csv(test_id_file, header=None).iloc[:, 0].tolist()
    test_graph_ids = list(set(all_graph_ids) - set(train_graph_ids))
    return train_graph_ids, test_graph_ids, benign_graph_ids, anomalous_graph_ids
def train(train_graph_ids, df, vector_size, string_size):
    """Build per-graph sketch vectors from the edge stream and fit an Isolation Forest.

    Returns the fitted model and a decision-function threshold selected by
    maximising F1 on a held-out validation split of the training sketches.
    """
    train_vectors = np.zeros((len(train_graph_ids), vector_size))
    edge_count = 0
    edges_dict = {gid: df[df['graph_id'] == gid].to_dict('records') for gid in train_graph_ids}
    edge_offset = {graph_id: 0 for graph_id in train_graph_ids}
    prev_rows = {}
    group_copy = train_graph_ids.copy()
    graph_edge_strings = {gid: "" for gid in train_graph_ids}
    # Interleave the graphs' edges to simulate a stream: pick a graph at random,
    # consume its next edge, and encode it as a (temporal code, src/dst/edge type) string.
    while group_copy:
        gid = random.choice(group_copy)
        row = edges_dict[gid][edge_offset[gid]]
        prev_row = prev_rows.get(gid)
        if prev_row is None:
            edge_string = '0' + row['src_type'] + row['dst_type'] + row['e_type']
        else:
            # Temporal encoding: 0 = same endpoints as this graph's previous edge,
            # 1 = one endpoint changed, 2 = both endpoints changed.
            temporal_encoding = 0
            if row['src_id'] != prev_row['src_id'] and row['dst_id'] != prev_row['dst_id']:
                temporal_encoding = 2
            elif row['src_id'] != prev_row['src_id'] or row['dst_id'] != prev_row['dst_id']:
                temporal_encoding = 1
            edge_string = str(temporal_encoding) + row['src_type'] + row['dst_type'] + row['e_type']
        i = train_graph_ids.index(gid)
        graph_edge_strings[gid] += edge_string
        if len(graph_edge_strings[gid]) >= string_size:
            update_vector(train_vectors[i], graph_edge_strings[gid][:string_size],
                          hash_size=string_size, N=vector_size)
            graph_edge_strings[gid] = ""  # Reset the accumulated string for this graph
        edge_count += 1
        prev_rows[gid] = row
        edge_offset[gid] += 1
        if edge_offset[gid] >= len(edges_dict[gid]):
            group_copy.remove(gid)
    # Hold out 20% of the training sketches for threshold selection.
    train_vectors, val_vectors = train_test_split(train_vectors, test_size=0.2, random_state=42)
    # Initialize and fit the IsolationForest model on the training sketches.
    clf = IsolationForest(contamination=0.2)
    clf.fit(train_vectors)
    # All validation sketches come from training graphs and are labelled as normal (1);
    # pick the decision-function threshold that maximises F1 against that labelling.
    y_true_val = np.ones(val_vectors.shape[0], dtype=int)
    best_threshold = 0
    best_f1 = 0
    for threshold in np.linspace(-1.0, 1.0, 100):
        y_pred = (clf.decision_function(val_vectors) >= threshold).astype(int)
        current_f1 = f1_score(y_true_val, y_pred)
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_threshold = threshold
    return clf, best_threshold
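
# Illustrative use of train() (names follow main() below; values match the CLI defaults):
#   clf, th = train(train_graph_ids, df, vector_size=128, string_size=4)
# returns the fitted IsolationForest together with the validation-selected score threshold.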
def test(clf, test_graph_ids, df, benign_graph_ids, vector_size, string_size, threshold):
    """Stream the test graphs' edges, updating their sketches, and evaluate periodically.

    Every 10000 processed edges the current sketches are scored with the trained
    model and (AUC, balanced accuracy, average precision, FPR, FNR) are recorded.
    """
    test_vectors = np.zeros((len(test_graph_ids), vector_size))
    edge_count = 0
    results = []
    edges_dict = {gid: df[df['graph_id'] == gid].to_dict('records') for gid in test_graph_ids}
    edge_offset = {graph_id: 0 for graph_id in test_graph_ids}
    prev_rows = {}
    group_copy = test_graph_ids.copy()
    graph_edge_strings = {gid: "" for gid in test_graph_ids}
    while group_copy:
        gid = random.choice(group_copy)
        row = edges_dict[gid][edge_offset[gid]]
        prev_row = prev_rows.get(gid)
        if prev_row is None:
            edge_string = '0' + row['src_type'] + row['dst_type'] + row['e_type']
        else:
            temporal_encoding = 0
            if row['src_id'] != prev_row['src_id'] and row['dst_id'] != prev_row['dst_id']:
                temporal_encoding = 2
            elif row['src_id'] != prev_row['src_id'] or row['dst_id'] != prev_row['dst_id']:
                temporal_encoding = 1
            edge_string = str(temporal_encoding) + row['src_type'] + row['dst_type'] + row['e_type']
        graph_edge_strings[gid] += edge_string
        i = test_graph_ids.index(gid)
        if len(graph_edge_strings[gid]) >= string_size:
            update_vector(test_vectors[i], graph_edge_strings[gid][:string_size],
                          hash_size=string_size, N=vector_size)
            graph_edge_strings[gid] = ""  # Reset the accumulated string for this graph
        edge_count += 1
        prev_rows[gid] = row
        edge_offset[gid] += 1
        if edge_offset[gid] >= len(edges_dict[gid]):
            group_copy.remove(gid)
        if edge_count % 10000 == 0:
            # Higher score = more anomalous (the decision function is negated).
            anomalous_scores = -clf.decision_function(test_vectors)
            true_labels = [0 if graph_id in benign_graph_ids else 1 for graph_id in test_graph_ids]
            auc_score = roc_auc_score(true_labels, anomalous_scores)
            optimal_threshold = threshold
            binary_predictions = [1 if score >= optimal_threshold else 0 for score in anomalous_scores]
            balanced_acc = balanced_accuracy_score(true_labels, binary_predictions)
            avg_precision = average_precision_score(true_labels, anomalous_scores)
            # Classify with the threshold learned during training, then derive FPR/FNR
            # from the confusion matrix.
            predicted_labels = (anomalous_scores >= optimal_threshold).astype(int)
            tn, fp, fn, tp = confusion_matrix(true_labels, predicted_labels).ravel()
            fpr_optimal = fp / (fp + tn)
            fnr_optimal = fn / (fn + tp)
            print(auc_score, balanced_acc, avg_precision, fpr_optimal, fnr_optimal)
            results.append((auc_score, balanced_acc, avg_precision, fpr_optimal, fnr_optimal))
    return results
def main(data_path, vector_size, train_path, test_path, output, string_size):
    df = load_data(data_path)
    train_graph_ids, test_graph_ids, benign_graph_ids, anomalous_graph_ids = get_train_test_ids(train_path, test_path)
    clf, th = train(train_graph_ids, df, vector_size, string_size)
    results = test(clf, test_graph_ids, df, benign_graph_ids, vector_size, string_size, th)
    with open(output, "w", newline='') as file:
        writer = csv.writer(file, delimiter=';')
        # Header matches the five values recorded per evaluation checkpoint.
        writer.writerow(["AUC", "Balanced Accuracy", "Average Precision", "FPR", "FNR"])
        for auc_score, balanced_acc, avg_precision, fpr_optimal, fnr_optimal in results:
            writer.writerow([auc_score, balanced_acc, avg_precision, fpr_optimal, fnr_optimal])
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Streamspot Anomaly Detection")
    parser.add_argument('--data_path', type=str, required=True, help="Path to the dataset")
    parser.add_argument('--train_ids', type=str, required=True, help="Path to the train graph ids")
    parser.add_argument('--test_ids', type=str, required=True, help="Path to the test graph ids")
    parser.add_argument('--vector_size', type=int, default=2**7, help="Size of the hash vector")
    parser.add_argument('--string_size', type=int, default=4, help="Length of the edge-string chunk that is hashed")
    parser.add_argument('--output', type=str, required=True, help="Path to the results")
    args = parser.parse_args()
    main(args.data_path, args.vector_size, args.train_ids, args.test_ids, args.output, args.string_size)
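
# Example invocation (the file paths are placeholders, not files bundled with this script):
#   python HADAPT.py --data_path edges.tsv --train_ids train_ids.txt --test_ids all_ids.txt \
#       --output results.csv --vector_size 128 --string_size 4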