Gladis / HADAPT · Commit 31acc200 ("Upload New File")
Authored 1 year ago by Walid Megherbi · parent 71a7e897
1 changed file: HADAPT.py (new file, mode 100644), 221 additions, 0 deletions

HADAPT.py
import argparse
import csv
import random

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    roc_auc_score,
    balanced_accuracy_score,
    average_precision_score,
    roc_curve,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import train_test_split

threshold = 0
def universal_hash(x, a=123456789, b=987654321, p=2**31 - 1, m=4):
    # Universal hash of an integer x into one of m buckets.
    return (a * x + b) % p % m


def str_to_int(s):
    # Interpret the bytes of s as a little-endian integer.
    return int.from_bytes(s.encode(), 'little')


def update_vector(v, s, hash_size, N=1):
    # Hash the first hash_size characters of s into one of N buckets and
    # increment the corresponding entry of the count vector v.
    idx = universal_hash(str_to_int(s[:hash_size]), m=N)
    v[idx] += 1
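

# --- Illustrative note (not part of the original file) ---
# A minimal sketch of how the helpers above compose, with the vector size and
# chunk chosen purely for this example: a 4-character chunk such as "0abc"
# (temporal code + src/dst/edge types) is turned into a little-endian integer
# by str_to_int, mapped by universal_hash into one of N buckets, and the
# matching entry of the count vector is incremented:
#
#     v = np.zeros(8)
#     update_vector(v, "0abc", hash_size=4, N=8)   # exactly one bucket of v becomes 1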
def load_data(data_path):
    # Tab-separated edge list: source/destination ids and types, edge type, graph id.
    return pd.read_csv(
        data_path,
        sep='\t',
        header=None,
        names=['src_id', 'src_type', 'dst_id', 'dst_type', 'e_type', 'graph_id'],
    )
def get_train_test_ids(train_id_file, test_id_file):
    # Graph ids 0-299 and 400-599 are benign; 300-399 and 600-699 are anomalous.
    benign_ranges = [(0, 299), (400, 599)]
    anomalous_ranges = [(300, 399), (600, 699)]
    benign_graph_ids = []
    anomalous_graph_ids = []
    for start, end in benign_ranges:
        benign_graph_ids.extend(range(start, end + 1))
    for start, end in anomalous_ranges:
        anomalous_graph_ids.extend(range(start, end + 1))
    #train_id_file = "D:\\AD Survey\\datasets\\streamspot\\all1_train.txt"
    #test_id_file = "D:\\AD Survey\\datasets\\streamspot\\all1.txt"
    train_graph_ids = pd.read_csv(train_id_file, header=None).iloc[:, 0].tolist()
    all_graph_ids = pd.read_csv(test_id_file, header=None).iloc[:, 0].tolist()
    test_graph_ids = list(set(all_graph_ids) - set(train_graph_ids))
    return train_graph_ids, test_graph_ids, benign_graph_ids, anomalous_graph_ids
def train(train_graph_ids, df, vector_size, string_size):
    train_vectors = np.zeros((len(train_graph_ids), vector_size))
    all_train_vector = np.array([]).reshape(0, vector_size)
    edge_count = 0
    edges_dict = {gid: df[df['graph_id'] == gid].to_dict('records') for gid in train_graph_ids}
    edge_offset = {graph_id: 0 for graph_id in train_graph_ids}
    clfs = []
    prev_rows = {}
    group_copy = train_graph_ids.copy()
    graph_edge_strings = {gid: "" for gid in train_graph_ids}
    # Interleave the training graphs and stream their edges one at a time.
    while group_copy:
        gid = random.choice(group_copy)
        row = edges_dict[gid][edge_offset[gid]]
        prev_row = prev_rows.get(gid)
        if prev_row is None:
            # First edge of the graph: temporal encoding defaults to 0.
            edge_string = '0' + row['src_type'] + row['dst_type'] + row['e_type']
        else:
            # Temporal encoding: 2 if both endpoints changed since the previous
            # edge of this graph, 1 if exactly one changed, 0 otherwise.
            temporal_encoding = 0
            if row['src_id'] != prev_row['src_id'] and row['dst_id'] != prev_row['dst_id']:
                temporal_encoding = 2
            elif row['src_id'] != prev_row['src_id'] or row['dst_id'] != prev_row['dst_id']:
                temporal_encoding = 1
            edge_string = str(temporal_encoding) + row['src_type'] + row['dst_type'] + row['e_type']
        i = train_graph_ids.index(gid)
        graph_edge_strings[gid] += edge_string
        if len(graph_edge_strings[gid]) >= string_size:
            update_vector(train_vectors[i], graph_edge_strings[gid][:string_size],
                          hash_size=string_size, N=vector_size)
            graph_edge_strings[gid] = ""  # Reset the string for the graph
        #update_vector(train_vectors[i], edge_string)
        edge_count += 1
        prev_rows[gid] = row
        edge_offset[gid] += 1
        if edge_offset[gid] >= len(edges_dict[gid]):
            group_copy.remove(gid)
            all_train_vector = np.vstack((all_train_vector, train_vectors))
        #if edge_count % 3000 == 0:
        #    all_train_vector = np.vstack((all_train_vector, train_vectors))
    all_train_vector = np.vstack((all_train_vector, train_vectors))
    train_vectors, val_vectors = train_test_split(train_vectors, test_size=0.2, random_state=42)
    # Initialize the IsolationForest model
    clf = IsolationForest(contamination=0.2)
    # Fit the model to the training data
    clf.fit(train_vectors)
    n_samples = val_vectors.shape[0]
    # The held-out vectors all come from training graphs, so label them 1 (inlier).
    y_true_val = np.ones(n_samples, dtype=int)
    # Grid-search a decision threshold on the validation scores by F1.
    best_threshold = 0
    best_f1 = 0
    for threshold in np.linspace(-1.0, 1.0, 100):
        y_pred = (clf.decision_function(val_vectors) >= threshold).astype(int)
        current_f1 = f1_score(y_true_val, y_pred)
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_threshold = threshold
    '''
    anomaly_scores = clf.decision_function(val_vectors)
    # Compute the threshold on the validation data
    threshold = np.percentile(anomaly_scores, 1)  # 1% quantile
    '''
    '''
    clf = IsolationForest(contamination=0.002)
    clf.fit(train_vectors)
    anomaly_scores = clf.decision_function(all_train_vector)
    threshold = np.percentile(anomaly_scores, 1)  # 1% quantile
    '''
    return clf, best_threshold
def test(clf, test_graph_ids, df, benign_graph_ids, vector_size, string_size, threshold):
    test_vectors = np.zeros((len(test_graph_ids), vector_size))
    edge_count = 0
    results = []
    nb_vectors = 0
    edges_dict = {gid: df[df['graph_id'] == gid].to_dict('records') for gid in test_graph_ids}
    edge_offset = {graph_id: 0 for graph_id in test_graph_ids}
    prev_rows = {}
    group_copy = test_graph_ids.copy()
    graph_edge_strings = {gid: "" for gid in test_graph_ids}
    buffer_size = 10000  # or whatever size you deem appropriate
    data_buffer = []
    # Stream the test edges exactly as in training, evaluating every 10000 edges.
    while group_copy:
        gid = random.choice(group_copy)
        row = edges_dict[gid][edge_offset[gid]]
        prev_row = prev_rows.get(gid)
        if prev_row is None:
            edge_string = '0' + row['src_type'] + row['dst_type'] + row['e_type']
        else:
            temporal_encoding = 0
            if row['src_id'] != prev_row['src_id'] and row['dst_id'] != prev_row['dst_id']:
                temporal_encoding = 2
            elif row['src_id'] != prev_row['src_id'] or row['dst_id'] != prev_row['dst_id']:
                temporal_encoding = 1
            edge_string = str(temporal_encoding) + row['src_type'] + row['dst_type'] + row['e_type']
        graph_edge_strings[gid] += edge_string
        i = test_graph_ids.index(gid)
        if len(graph_edge_strings[gid]) >= string_size:
            update_vector(test_vectors[i], graph_edge_strings[gid][:string_size],
                          hash_size=string_size, N=vector_size)
            graph_edge_strings[gid] = ""  # Reset the string for the graph
        #update_vector(test_vectors[i], edge_string)
        edge_count += 1
        prev_rows[gid] = row
        edge_offset[gid] += 1
        if edge_offset[gid] >= len(edges_dict[gid]):
            group_copy.remove(gid)
        if edge_count % 10000 == 0:
            # Negate decision_function so that higher scores mean more anomalous.
            anomalous_scores = -clf.decision_function(test_vectors)
            true_labels = [0 if graph_id in benign_graph_ids else 1 for graph_id in test_graph_ids]
            auc_score = roc_auc_score(true_labels, anomalous_scores)
            optimal_threshold = threshold
            binary_predictions = [1 if score >= optimal_threshold else 0 for score in anomalous_scores]
            balanced_acc = balanced_accuracy_score(true_labels, binary_predictions)
            avg_precision = average_precision_score(true_labels, anomalous_scores)
            # Using the optimal threshold to classify the scores
            predicted_labels = (anomalous_scores >= optimal_threshold).astype(int)
            # Compute the confusion matrix
            tn, fp, fn, tp = confusion_matrix(true_labels, predicted_labels).ravel()
            # Compute FPR and FNR
            fpr_optimal = fp / (fp + tn)
            fnr_optimal = fn / (fn + tp)
            print(auc_score, balanced_acc, avg_precision, fpr_optimal, fnr_optimal)
            #print(auc_score,fpr_optimal,fnr_optimal,balanced_acc,avg_precision,pr)
            results.append((auc_score, balanced_acc, avg_precision, fpr_optimal, fnr_optimal))
    return results
def main(data_path, vector_size, train_path, test_path, output, string_size):
    df = load_data(data_path)
    train_graph_ids, test_graph_ids, benign_graph_ids, anomalous_graph_ids = get_train_test_ids(train_path, test_path)
    clf, th = train(train_graph_ids, df, vector_size, string_size)
    results = test(clf, test_graph_ids, df, benign_graph_ids, vector_size, string_size, th)
    with open(output, "w", newline='') as file:
        writer = csv.writer(file, delimiter=';')
        # Writing the header (one column per reported metric)
        writer.writerow(["AUC", "Balanced Accuracy", "Average Precision", "FPR", "FNR"])
        for auc_score, balanced_acc, avg_precision, fpr_optimal, fnr_optimal in results:
            writer.writerow([auc_score, balanced_acc, avg_precision, fpr_optimal, fnr_optimal])
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Streamspot Anomaly Detection")
    parser.add_argument('--data_path', type=str, required=True, help="Path to the dataset")
    parser.add_argument('--train_ids', type=str, required=True, help="Path to the train graph ids")
    parser.add_argument('--test_ids', type=str, required=True, help="Path to the test graph ids")
    parser.add_argument('--vector_size', type=int, default=2**7, help="Size of the hash vector")
    parser.add_argument('--string_size', type=int, default=4, help="Length of the hashed edge-string chunk")
    parser.add_argument('--output', type=str, required=True, help="Path to the results")
    args = parser.parse_args()
    main(args.data_path, args.vector_size, args.train_ids, args.test_ids, args.output, args.string_size)
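
A minimal usage sketch, assuming the file above is saved as HADAPT.py and that StreamSpot-style input files exist at the hypothetical paths below (the file names are placeholders, not part of the commit); the numeric values mirror the argparse defaults defined above.

    from HADAPT import main

    # Hypothetical input paths -- replace with local copies of the dataset.
    main(
        data_path="streamspot_edges.tsv",  # tab-separated: src_id, src_type, dst_id, dst_type, e_type, graph_id
        vector_size=2**7,                  # matches the --vector_size default
        train_path="train_ids.txt",        # graph ids used for training, one per line
        test_path="all_ids.txt",           # all graph ids; the test set is all ids minus the train ids
        output="results.csv",              # semicolon-separated metrics, one row per evaluation step
        string_size=4,                     # matches the --string_size default
    )

Equivalently, from the command line: python HADAPT.py --data_path streamspot_edges.tsv --train_ids train_ids.txt --test_ids all_ids.txt --output results.csv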