# HADAPT.py
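"""Streaming graph anomaly detection on a StreamSpot-style edge stream.

Each graph's edges are encoded as short strings (a temporal code plus source type,
destination type, and edge type), buffered into fixed-length chunks, and hashed into a
per-graph count vector. An Isolation Forest is fit on the training vectors, a score
threshold is selected on a held-out validation split, and the test graphs are scored
periodically as their edges stream in (AUC, balanced accuracy, average precision,
FPR, FNR).
"""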
import argparse
import csv
import random

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    average_precision_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split


def universal_hash(x, a=123456789, b=987654321, p=2**31 - 1, m=4):
    """Map an integer x to a bucket in [0, m) using the universal hash (a*x + b) mod p mod m."""
    return (a * x + b) % p % m


def str_to_int(s):
    """Interpret the bytes of a string as a little-endian integer."""
    return int.from_bytes(s.encode(), 'little')


def update_vector(v, s, hash_size, N=1):
    """Hash the first hash_size characters of s into one of N buckets and increment that count in v."""
    idx = universal_hash(str_to_int(s[:hash_size]), m=N)
    v[idx] += 1
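
# A minimal sketch of how the helpers above fit together (illustrative only; the
# vector size and chunk length used here are arbitrary assumptions, not defaults
# taken from the rest of this script):
#
#     v = np.zeros(8)                               # 8 hash buckets
#     update_vector(v, "0abc", hash_size=4, N=8)    # hash one 4-character chunk
#     # exactly one entry of v is now 1: the bucket picked by universal_hash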

def load_data(data_path):
    """Load the edge list: a tab-separated file with one edge per line and no header."""
    return pd.read_csv(data_path, sep='\t', header=None,
                       names=['src_id', 'src_type', 'dst_id', 'dst_type', 'e_type', 'graph_id'])

def get_train_test_ids(train_id_file, test_id_file):
    """Read the train/test graph id lists and return them together with the benign/anomalous id ranges."""
    benign_ranges = [(0, 299), (400, 599)]
    anomalous_ranges = [(300, 399), (600, 699)]

    benign_graph_ids = []
    anomalous_graph_ids = []

    for start, end in benign_ranges:
        benign_graph_ids.extend(range(start, end+1))

    for start, end in anomalous_ranges:
        anomalous_graph_ids.extend(range(start, end+1))

    train_graph_ids = pd.read_csv(train_id_file, header=None).iloc[:, 0].tolist()
    all_graph_ids = pd.read_csv(test_id_file, header=None).iloc[:, 0].tolist()

    test_graph_ids = list(set(all_graph_ids) - set(train_graph_ids))

    return train_graph_ids, test_graph_ids, benign_graph_ids, anomalous_graph_ids

def train(train_graph_ids, df, vector_size, string_size):
    """Stream the training edges in interleaved graph order, hash them into per-graph count
    vectors, fit an Isolation Forest, and pick a score threshold on a held-out validation split."""
    train_vectors = np.zeros((len(train_graph_ids), vector_size))
    graph_index = {gid: i for i, gid in enumerate(train_graph_ids)}

    edges_dict = {gid: df[df['graph_id'] == gid].to_dict('records') for gid in train_graph_ids}
    edge_offset = {gid: 0 for gid in train_graph_ids}
    prev_rows = {}
    group_copy = train_graph_ids.copy()
    graph_edge_strings = {gid: "" for gid in train_graph_ids}

    # Interleave the graphs: repeatedly pick a random graph that still has edges left
    # and consume its next edge, mimicking a streaming arrival order.
    while group_copy:
        gid = random.choice(group_copy)
        row = edges_dict[gid][edge_offset[gid]]
        prev_row = prev_rows.get(gid)
        if prev_row is None:
            edge_string = '0' + row['src_type'] + row['dst_type'] + row['e_type']
        else:
            # Temporal encoding: 0 if the edge shares both endpoints with this graph's
            # previous edge, 1 if it shares exactly one, 2 if it shares neither.
            temporal_encoding = 0
            if row['src_id'] != prev_row['src_id'] and row['dst_id'] != prev_row['dst_id']:
                temporal_encoding = 2
            elif row['src_id'] != prev_row['src_id'] or row['dst_id'] != prev_row['dst_id']:
                temporal_encoding = 1
            edge_string = str(temporal_encoding) + row['src_type'] + row['dst_type'] + row['e_type']

        i = graph_index[gid]
        graph_edge_strings[gid] += edge_string
        if len(graph_edge_strings[gid]) >= string_size:
            # Once enough characters have accumulated, hash one chunk into the graph's vector.
            update_vector(train_vectors[i], graph_edge_strings[gid][:string_size],
                          hash_size=string_size, N=vector_size)
            graph_edge_strings[gid] = ""  # Reset the buffered string for this graph

        prev_rows[gid] = row
        edge_offset[gid] += 1
        if edge_offset[gid] >= len(edges_dict[gid]):
            # All edges of this graph have been consumed.
            group_copy.remove(gid)

    # Hold out part of the training vectors for threshold selection.
    train_vectors, val_vectors = train_test_split(train_vectors, test_size=0.2, random_state=42)

    # Fit the Isolation Forest on the training vectors.
    clf = IsolationForest(contamination=0.2)
    clf.fit(train_vectors)

    # All validation vectors come from the training split, so they are treated as benign (label 1).
    y_true_val = np.ones(val_vectors.shape[0], dtype=int)

    # Scan candidate thresholds over the decision scores and keep the one with the best F1
    # on the benign validation set.
    val_scores = clf.decision_function(val_vectors)
    best_threshold = 0
    best_f1 = 0
    for threshold in np.linspace(-1.0, 1.0, 100):
        y_pred = (val_scores >= threshold).astype(int)
        current_f1 = f1_score(y_true_val, y_pred)
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_threshold = threshold

    return clf, best_threshold

def test(clf, test_graph_ids, df, benign_graph_ids, vector_size, string_size, threshold):
    """Stream the test edges, hash them into per-graph vectors, and periodically score the
    vectors with the trained Isolation Forest, recording AUC, balanced accuracy,
    average precision, FPR and FNR at each checkpoint."""
    test_vectors = np.zeros((len(test_graph_ids), vector_size))
    graph_index = {gid: i for i, gid in enumerate(test_graph_ids)}
    benign_set = set(benign_graph_ids)
    edge_count = 0
    results = []

    edges_dict = {gid: df[df['graph_id'] == gid].to_dict('records') for gid in test_graph_ids}
    edge_offset = {gid: 0 for gid in test_graph_ids}

    prev_rows = {}
    group_copy = test_graph_ids.copy()
    graph_edge_strings = {gid: "" for gid in test_graph_ids}
    # Same interleaved streaming as in train(): pick a random graph with edges left,
    # encode its next edge, and hash completed chunks into its vector.
    while group_copy:
        gid = random.choice(group_copy)
        row = edges_dict[gid][edge_offset[gid]]
        prev_row = prev_rows.get(gid)
        if prev_row is None:
            edge_string = '0' + row['src_type'] + row['dst_type'] + row['e_type']
        else:
            temporal_encoding = 0
            if row['src_id'] != prev_row['src_id'] and row['dst_id'] != prev_row['dst_id']:
                temporal_encoding = 2
            elif row['src_id'] != prev_row['src_id'] or row['dst_id'] != prev_row['dst_id']:
                temporal_encoding = 1
            edge_string = str(temporal_encoding) + row['src_type'] + row['dst_type'] + row['e_type']

        i = graph_index[gid]
        graph_edge_strings[gid] += edge_string
        if len(graph_edge_strings[gid]) >= string_size:
            update_vector(test_vectors[i], graph_edge_strings[gid][:string_size],
                          hash_size=string_size, N=vector_size)
            graph_edge_strings[gid] = ""  # Reset the buffered string for this graph

        edge_count += 1
        prev_rows[gid] = row
        edge_offset[gid] += 1
        if edge_offset[gid] >= len(edges_dict[gid]):
            group_copy.remove(gid)

        # Evaluate the detector on the current test vectors every 10,000 processed edges.
        if edge_count % 10000 == 0:
            # Higher score means more anomalous (negated Isolation Forest decision function).
            anomalous_scores = -clf.decision_function(test_vectors)
            true_labels = [0 if graph_id in benign_set else 1 for graph_id in test_graph_ids]
            auc_score = roc_auc_score(true_labels, anomalous_scores)

            # Classify with the threshold selected on the validation split.
            predicted_labels = (anomalous_scores >= threshold).astype(int)
            balanced_acc = balanced_accuracy_score(true_labels, predicted_labels)
            avg_precision = average_precision_score(true_labels, anomalous_scores)

            # False positive and false negative rates at the chosen threshold.
            tn, fp, fn, tp = confusion_matrix(true_labels, predicted_labels, labels=[0, 1]).ravel()
            fpr_optimal = fp / (fp + tn)
            fnr_optimal = fn / (fn + tp)

            print(auc_score, balanced_acc, avg_precision, fpr_optimal, fnr_optimal)
            results.append((auc_score, balanced_acc, avg_precision, fpr_optimal, fnr_optimal))

    return results

def main(data_path, vector_size, train_path, test_path, output, string_size):
    df = load_data(data_path)
    train_graph_ids, test_graph_ids, benign_graph_ids, anomalous_graph_ids = get_train_test_ids(train_path, test_path)
    clf, th = train(train_graph_ids, df, vector_size, string_size)
    results = test(clf, test_graph_ids, df, benign_graph_ids, vector_size, string_size, th)

    with open(output, "w", newline='') as file:
        writer = csv.writer(file, delimiter=';')
        # Header matches the five values recorded at each evaluation checkpoint.
        writer.writerow(["AUC", "Balanced Accuracy", "Average Precision", "FPR", "FNR"])
        for auc_score, balanced_acc, avg_precision, fpr_optimal, fnr_optimal in results:
            writer.writerow([auc_score, balanced_acc, avg_precision, fpr_optimal, fnr_optimal])

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Streamspot Anomaly Detection")
    parser.add_argument('--data_path', type=str, required=True, help="Path to the dataset")
    parser.add_argument('--train_ids', type=str, required=True, help="Path to the train graph ids")
    parser.add_argument('--test_ids', type=str, required=True, help="Path to the test graph ids")
    parser.add_argument('--vector_size', type=int, default=2**7, help="Size of the hash vector")
    parser.add_argument('--string_size', type=int, default=4, help="Number of characters hashed per edge-string chunk")
    parser.add_argument('--output', type=str, required=True, help="Path to the results file")

    args = parser.parse_args()

    main(args.data_path, args.vector_size, args.train_ids, args.test_ids, args.output, args.string_size)
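
# Example invocation (illustrative only; the file names below are placeholders,
# not files shipped with this snippet):
#
#   python HADAPT.py --data_path streamspot_edges.tsv \
#       --train_ids train_ids.txt --test_ids all_ids.txt \
#       --vector_size 128 --string_size 4 --output results.csv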