Gladis / Fedhe-graph · Commits · bb4498d9

Commit bb4498d9 — "Upload New File"
Authored 10 months ago by Athmane Mansour Bahar
Parent: c8d80239
1 changed file: utils/trace_parser.py (new file, mode 100644) — 288 additions, 0 deletions
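The uploaded file is a standalone preprocessing script for DARPA Transparent Computing Engagement 3 audit logs in CDM18 format. It converts the raw JSON records of the trace, theia, and cadets datasets into tab-separated edge lists, builds NetworkX directed provenance graphs for the train/test splits, and pickles the graphs together with the labeled malicious entities.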
import argparse
import json
import os
import pickle as pkl
import random
import re

import networkx as nx
from tqdm import tqdm
node_type_dict = {}
edge_type_dict = {}
node_type_cnt = 0
edge_type_cnt = 0

metadata = {
    'trace': {
        'train': ['ta1-trace-e3-official-1.json.0', 'ta1-trace-e3-official-1.json.1',
                  'ta1-trace-e3-official-1.json.2', 'ta1-trace-e3-official-1.json.3'],
        'test': ['ta1-trace-e3-official-1.json.0', 'ta1-trace-e3-official-1.json.1',
                 'ta1-trace-e3-official-1.json.2', 'ta1-trace-e3-official-1.json.3',
                 'ta1-trace-e3-official-1.json.4'],
    },
    'theia': {
        'train': ['ta1-theia-e3-official-6r.json', 'ta1-theia-e3-official-6r.json.1',
                  'ta1-theia-e3-official-6r.json.2', 'ta1-theia-e3-official-6r.json.3'],
        'test': ['ta1-theia-e3-official-6r.json.8'],
    },
    'cadets': {
        'train': ['ta1-cadets-e3-official.json', 'ta1-cadets-e3-official.json.1',
                  'ta1-cadets-e3-official.json.2', 'ta1-cadets-e3-official-2.json.1'],
        'test': ['ta1-cadets-e3-official-2.json'],
    },
}
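# The entries above are file-name stems of the raw CDM18 JSON shards shipped
# with each dataset; preprocess_dataset() converts every shard listed here
# into a parsed '<stem>.txt' edge list before the graphs are built.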
pattern_uuid = re.compile(r'uuid\":\"(.*?)\"')
pattern_src = re.compile(r'subject\":{\"com.bbn.tc.schema.avro.cdm18.UUID\":\"(.*?)\"}')
pattern_dst1 = re.compile(r'predicateObject\":{\"com.bbn.tc.schema.avro.cdm18.UUID\":\"(.*?)\"}')
pattern_dst2 = re.compile(r'predicateObject2\":{\"com.bbn.tc.schema.avro.cdm18.UUID\":\"(.*?)\"}')
pattern_type = re.compile(r'type\":\"(.*?)\"')
pattern_time = re.compile(r'timestampNanos\":(.*?),')
pattern_file_name = re.compile(r'map\":\{\"path\":\"(.*?)\"')
pattern_process_name = re.compile(r'map\":\{\"name\":\"(.*?)\"')
pattern_netflow_object_name = re.compile(r'remoteAddress\":\"(.*?)\"')

adversarial = 'None'
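# Illustrative sketch only: a minimal, made-up CDM18 Event record of the
# shape these regexes target, with a self-check of what each one extracts.
# All field values below are invented for illustration.
_sample_event = ('{"datum":{"com.bbn.tc.schema.avro.cdm18.Event":{'
                 '"uuid":"abc-123","type":"EVENT_READ",'
                 '"timestampNanos":1523627587000000000,'
                 '"subject":{"com.bbn.tc.schema.avro.cdm18.UUID":"proc-1"},'
                 '"predicateObject":{"com.bbn.tc.schema.avro.cdm18.UUID":"file-7"}}}}')
assert pattern_uuid.findall(_sample_event)[0] == 'abc-123'
assert pattern_type.findall(_sample_event)[0] == 'EVENT_READ'
assert pattern_time.findall(_sample_event)[0] == '1523627587000000000'
assert pattern_src.findall(_sample_event)[0] == 'proc-1'
assert pattern_dst1.findall(_sample_event)[0] == 'file-7'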
def read_single_graph(dataset, malicious, path, test=False):
    global node_type_cnt, edge_type_cnt
    g = nx.DiGraph()
    print('converting {} ...'.format(path))
    path = '../data/{}/'.format(dataset) + path + '.txt'
    f = open(path, 'r')
    lines = []
    for l in f.readlines():
        split_line = l.split('\t')
        src, src_type, dst, dst_type, edge_type, ts = split_line
        ts = int(ts)
        if not test:
            # Drop training events that touch labeled malicious entities,
            # unless the malicious endpoint is a MemoryObject.
            if src in malicious or dst in malicious:
                if src in malicious and src_type != 'MemoryObject':
                    continue
                if dst in malicious and dst_type != 'MemoryObject':
                    continue
        if src_type not in node_type_dict:
            node_type_dict[src_type] = node_type_cnt
            node_type_cnt += 1
        if dst_type not in node_type_dict:
            node_type_dict[dst_type] = node_type_cnt
            node_type_cnt += 1
        if edge_type not in edge_type_dict:
            edge_type_dict[edge_type] = edge_type_cnt
            edge_type_cnt += 1
        # Reverse READ/RECV/LOAD events so every edge points in the direction
        # of data flow (from the object read/received/loaded into the subject).
        if 'READ' in edge_type or 'RECV' in edge_type or 'LOAD' in edge_type:
            lines.append([dst, src, dst_type, src_type, edge_type, ts])
        else:
            lines.append([src, dst, src_type, dst_type, edge_type, ts])
    f.close()
    # Process events in timestamp order.
    lines.sort(key=lambda l: l[5])
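    # Second pass: assign consecutive integer node ids in timestamp order and
    # add typed nodes/edges. Under the adversarial settings, node types are
    # perturbed: 'MFE'/'MCE' relabel malicious test nodes as FILE_OBJECT_FILE,
    # while 'BFP' randomly relabels ~5% (1 in 20) of benign training nodes as
    # NetFlowObject.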
    node_map = {}
    node_type_map = {}
    node_cnt = 0
    node_list = []
    for l in lines:
        src, dst, src_type, dst_type, edge_type = l[:5]
        src_type_id = node_type_dict[src_type]
        dst_type_id = node_type_dict[dst_type]
        edge_type_id = edge_type_dict[edge_type]
        if src not in node_map:
            if test:
                if adversarial == 'MFE' or adversarial == 'MCE':
                    if src in malicious:
                        src_type_id = node_type_dict['FILE_OBJECT_FILE']
            if not test:
                if adversarial == 'BFP':
                    if src not in malicious:
                        i = random.randint(1, 20)
                        if i == 1:
                            src_type_id = node_type_dict['NetFlowObject']
            node_map[src] = node_cnt
            g.add_node(node_cnt, type=src_type_id)
            node_list.append(src)
            node_type_map[src] = src_type
            node_cnt += 1
        if dst not in node_map:
            if test:
                if adversarial == 'MFE' or adversarial == 'MCE':
                    if dst in malicious:
                        dst_type_id = node_type_dict['FILE_OBJECT_FILE']
            if not test:
                if adversarial == 'BFP':
                    if dst not in malicious:
                        i = random.randint(1, 20)
                        if i == 1:
                            dst_type_id = node_type_dict['NetFlowObject']
            node_map[dst] = node_cnt
            g.add_node(node_cnt, type=dst_type_id)
            node_type_map[dst] = dst_type
            node_list.append(dst)
            node_cnt += 1
        if not g.has_edge(node_map[src], node_map[dst]):
            g.add_edge(node_map[src], node_map[dst], type=edge_type_id)
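    # Under 'MSE'/'MCE' at test time, attach one plausible benign-looking edge
    # to every malicious node (a process->file write, a file->process read, or
    # a process->netflow connect) to camouflage it among normal structure.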
    if (adversarial == 'MSE' or adversarial == 'MCE') and test:
        for i, node in enumerate(node_list):
            if node in malicious:
                while True:
                    another_node = random.choice(node_list)
                    if 'FILE_OBJECT' in node_type_map[node] and node_type_map[another_node] == 'SUBJECT_PROCESS':
                        g.add_edge(node_map[another_node], node_map[node], type=edge_type_dict['EVENT_WRITE'])
                        print(node, another_node)
                        break
                    if node_type_map[node] == 'SUBJECT_PROCESS' and node_type_map[another_node] == 'FILE_OBJECT_FILE':
                        g.add_edge(node_map[another_node], node_map[node], type=edge_type_dict['EVENT_READ'])
                        print(node, another_node)
                        break
                    if node_type_map[node] == 'NetFlowObject' and node_type_map[another_node] == 'SUBJECT_PROCESS':
                        g.add_edge(node_map[another_node], node_map[node], type=edge_type_dict['EVENT_CONNECT'])
                        print(node, another_node)
                        break
                    if 'FILE_OBJECT' not in node_type_map[node] and node_type_map[node] != 'SUBJECT_PROCESS' and node_type_map[node] != 'NetFlowObject':
                        break
    return node_map, g
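# preprocess_dataset makes two passes over the raw CDM JSON shards: the first
# indexes every non-Event record's UUID to its node type (and, where
# available, a human-readable name); the second streams the Event records and
# writes one tab-separated edge per line to '<shard>.txt'.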
def preprocess_dataset(dataset):
    id_nodetype_map = {}
    id_nodename_map = {}
    for file in os.listdir('../data/{}/'.format(dataset)):
        if 'json' in file and '.txt' not in file and 'names' not in file and 'types' not in file and 'metadata' not in file:
            print('reading {} ...'.format(file))
            f = open('../data/{}/'.format(dataset) + file, 'r', encoding='utf-8')
            for line in tqdm(f):
                if 'com.bbn.tc.schema.avro.cdm18.Event' in line or 'com.bbn.tc.schema.avro.cdm18.Host' in line:
                    continue
                if 'com.bbn.tc.schema.avro.cdm18.TimeMarker' in line or 'com.bbn.tc.schema.avro.cdm18.StartMarker' in line:
                    continue
                if 'com.bbn.tc.schema.avro.cdm18.UnitDependency' in line or 'com.bbn.tc.schema.avro.cdm18.EndMarker' in line:
                    continue
                if len(pattern_uuid.findall(line)) == 0:
                    print(line)
                uuid = pattern_uuid.findall(line)[0]
                subject_type = pattern_type.findall(line)
                if len(subject_type) < 1:
                    # Some record kinds carry no explicit type field; fall back
                    # to the CDM record name.
                    if 'com.bbn.tc.schema.avro.cdm18.MemoryObject' in line:
                        subject_type = 'MemoryObject'
                    if 'com.bbn.tc.schema.avro.cdm18.NetFlowObject' in line:
                        subject_type = 'NetFlowObject'
                    if 'com.bbn.tc.schema.avro.cdm18.UnnamedPipeObject' in line:
                        subject_type = 'UnnamedPipeObject'
                else:
                    subject_type = subject_type[0]
                if uuid == '00000000-0000-0000-0000-000000000000' or subject_type in ['SUBJECT_UNIT']:
                    continue
                id_nodetype_map[uuid] = subject_type
                if 'FILE' in subject_type and len(pattern_file_name.findall(line)) > 0:
                    id_nodename_map[uuid] = pattern_file_name.findall(line)[0]
                elif subject_type == 'SUBJECT_PROCESS' and len(pattern_process_name.findall(line)) > 0:
                    id_nodename_map[uuid] = pattern_process_name.findall(line)[0]
                elif subject_type == 'NetFlowObject' and len(pattern_netflow_object_name.findall(line)) > 0:
                    id_nodename_map[uuid] = pattern_netflow_object_name.findall(line)[0]
            f.close()
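    # Second pass: convert each listed shard into a '<shard>.txt' edge list,
    # skipping shards already converted on a previous run.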
    for key in metadata[dataset]:
        for file in metadata[dataset][key]:
            if os.path.exists('../data/{}/'.format(dataset) + file + '.txt'):
                continue
            f = open('../data/{}/'.format(dataset) + file, 'r', encoding='utf-8')
            fw = open('../data/{}/'.format(dataset) + file + '.txt', 'w', encoding='utf-8')
            print('processing {} ...'.format(file))
            for line in tqdm(f):
                if 'com.bbn.tc.schema.avro.cdm18.Event' in line:
                    edgeType = pattern_type.findall(line)[0]
                    timestamp = pattern_time.findall(line)[0]
                    srcId = pattern_src.findall(line)
                    if len(srcId) == 0:
                        continue
                    srcId = srcId[0]
                    if srcId not in id_nodetype_map:
                        continue
                    srcType = id_nodetype_map[srcId]
                    dstId1 = pattern_dst1.findall(line)
                    if len(dstId1) > 0 and dstId1[0] != 'null':
                        dstId1 = dstId1[0]
                        if dstId1 not in id_nodetype_map:
                            continue
                        dstType1 = id_nodetype_map[dstId1]
                        this_edge1 = '\t'.join([str(srcId), str(srcType), str(dstId1), str(dstType1), str(edgeType), str(timestamp)]) + '\n'
                        fw.write(this_edge1)
                    dstId2 = pattern_dst2.findall(line)
                    if len(dstId2) > 0 and dstId2[0] != 'null':
                        dstId2 = dstId2[0]
                        if dstId2 not in id_nodetype_map:
                            continue
                        dstType2 = id_nodetype_map[dstId2]
                        this_edge2 = '\t'.join([str(srcId), str(srcType), str(dstId2), str(dstType2), str(edgeType), str(timestamp)]) + '\n'
                        fw.write(this_edge2)
            fw.close()
            f.close()
    if len(id_nodename_map) != 0:
        fw = open('../data/{}/'.format(dataset) + 'names.json', 'w', encoding='utf-8')
        json.dump(id_nodename_map, fw)
        fw.close()
    if len(id_nodetype_map) != 0:
        fw = open('../data/{}/'.format(dataset) + 'types.json', 'w', encoding='utf-8')
        json.dump(id_nodetype_map, fw)
        fw.close()
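# read_graphs drives the full pipeline: load the labeled malicious entity
# UUIDs from '../data/<dataset>/<dataset>.txt', run preprocess_dataset, build
# one graph per train/test shard, and pickle everything under
# '../data/<dataset>/'.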
def read_graphs(dataset):
    malicious_entities = '../data/{}/{}.txt'.format(dataset, dataset)
    f = open(malicious_entities, 'r')
    malicious_entities = set()
    for l in f.readlines():
        malicious_entities.add(l.strip())
    f.close()
    preprocess_dataset(dataset)
    train_gs = []
    for file in metadata[dataset]['train']:
        _, train_g = read_single_graph(dataset, malicious_entities, file, False)
        train_gs.append(train_g)
    test_gs = []
    test_node_map = {}
    count_node = 0
    for file in metadata[dataset]['test']:
        node_map, test_g = read_single_graph(dataset, malicious_entities, file, True)
        assert len(node_map) == test_g.number_of_nodes()
        test_gs.append(test_g)
        for key in node_map:
            if key not in test_node_map:
                test_node_map[key] = node_map[key] + count_node
        count_node += test_g.number_of_nodes()
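    # test_node_map offsets each shard's local node ids by the number of nodes
    # seen so far, giving every test entity a unique global id; the malicious
    # UUIDs are translated into those global ids below.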
    if os.path.exists('../data/{}/names.json'.format(dataset)) and os.path.exists('../data/{}/types.json'.format(dataset)):
        with open('../data/{}/names.json'.format(dataset), 'r', encoding='utf-8') as f:
            id_nodename_map = json.load(f)
        with open('../data/{}/types.json'.format(dataset), 'r', encoding='utf-8') as f:
            id_nodetype_map = json.load(f)
        f = open('../data/{}/malicious_names.txt'.format(dataset), 'w', encoding='utf-8')
        final_malicious_entities = []
        malicious_names = []
        for e in malicious_entities:
            if e in test_node_map and e in id_nodetype_map and id_nodetype_map[e] != 'MemoryObject' and id_nodetype_map[e] != 'UnnamedPipeObject':
                final_malicious_entities.append(test_node_map[e])
                if e in id_nodename_map:
                    malicious_names.append(id_nodename_map[e])
                    f.write('{}\t{}\n'.format(e, id_nodename_map[e]))
                else:
                    malicious_names.append(e)
                    f.write('{}\t{}\n'.format(e, e))
        f.close()
    else:
        f = open('../data/{}/malicious_names.txt'.format(dataset), 'w', encoding='utf-8')
        final_malicious_entities = []
        malicious_names = []
        for e in malicious_entities:
            if e in test_node_map:
                final_malicious_entities.append(test_node_map[e])
                malicious_names.append(e)
                f.write('{}\t{}\n'.format(e, e))
        f.close()
    pkl.dump((final_malicious_entities, malicious_names), open('../data/{}/malicious.pkl'.format(dataset), 'wb'))
    pkl.dump([nx.node_link_data(train_g) for train_g in train_gs], open('../data/{}/train.pkl'.format(dataset), 'wb'))
    pkl.dump([nx.node_link_data(test_g) for test_g in test_gs], open('../data/{}/test.pkl'.format(dataset), 'wb'))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='CDM Parser')
    parser.add_argument('--dataset', type=str, default='trace')
    args = parser.parse_args()
    if args.dataset not in ['trace', 'theia', 'cadets']:
        raise NotImplementedError
    read_graphs(args.dataset)
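Usage note: judging from the relative '../data/<dataset>/' paths, the script is meant to be run from the utils/ directory, with the raw CDM JSON shards and the '<dataset>.txt' malicious-entity list already in place, e.g.:

    python trace_parser.py --dataset trace

This writes the intermediate '<shard>.txt' edge lists (plus names.json and types.json when the name/type maps are non-empty) next to the raw logs, and produces train.pkl, test.pkl, and malicious.pkl in '../data/trace/'.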