# Copyright 2020 Universitat Politècnica de Catalunya
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert IDS 2017 flow CSV files into graph samples stored as JSON.

Every window of consecutive CSV rows ("traces") becomes one directed graph:
one node per distinct IP address, one node per connection, and a pair of
opposite edges between a connection and each of its two endpoints.  Graphs
are serialised with ``networkx.readwrite.json_graph.node_link_data`` and
written to ``data_<n>.json`` files.

Two INI files are read from the current working directory at import time:

``./migrate.ini``
    ``[DIRECTORIES]`` section with ``original_dataset_path`` (directory that
    holds the ``*.csv`` files) and ``output_path`` (where the JSON goes).

``./normalization_parameters.ini``
    ``[PARAMS]`` section with ``<feature>_mean`` / ``<feature>_std`` entries
    used to standardise the connection features.
"""
import configparser
import csv
import functools
import glob
import json
import math
import os
import sys
import tarfile
from random import random

# NOTE: several of these imports (tarfile, random, sys, matplotlib,
# tensorflow, numpy) are not used by the code below; they are kept so the
# module namespace and its import-time behaviour stay as they were.
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import tensorflow as tf
from networkx.readwrite import json_graph

# ConfigParser.read() silently skips missing files, so a missing INI file only
# surfaces later as a missing section.
params_norm = configparser.ConfigParser(
    interpolation=configparser.ExtendedInterpolation())
params_norm.read('./normalization_parameters.ini')

params = configparser.ConfigParser(
    interpolation=configparser.ExtendedInterpolation())
params.read('./migrate.ini')


# --------------------------------------
# IDS 2017

# Column names of a trace, in CSV order.
features = [
    'Flow ID', 'Source IP', 'Source Port', 'Destination IP',
    'Destination Port', 'Protocol', 'Timestamp', 'Flow Duration',
    'Total Fwd Packets', 'Total Backward Packets',
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
    'Fwd Packet Length Max', 'Fwd Packet Length Min',
    'Fwd Packet Length Mean', 'Fwd Packet Length Std',
    'Bwd Packet Length Max', 'Bwd Packet Length Min',
    'Bwd Packet Length Mean', 'Bwd Packet Length Std',
    'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std',
    'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean',
    'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total',
    'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
    'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
    'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
    'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length',
    'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
    'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count',
    'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count',
    'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd Segment Size',
    'Avg Bwd Segment Size', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk',
    'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate', 'Subflow Fwd Packets', 'Subflow Fwd Bytes',
    'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init_Win_bytes_forward',
    'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward',
    'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean',
    'Idle Std', 'Idle Max', 'Idle Min', 'Label',
]
# Feature name -> column position inside a trace.
features_dict = {name: position for position, name in enumerate(features)}


# Class names; the position in this list is the index of the one-hot label.
attack_names = ['HTTP Get Flood', 'ICMP Frag Flood', 'TCP Flood', 'UDP Flood',
                'Port Scanning', 'Brute Force', 'Normal']
attacks_dict = {name: position for position, name in enumerate(attack_names)}


# Features copied (in this order) into every connection node.
# NOTE(review): 'Total Length of Fwd Packets' is listed twice, so the vector
# has 27 entries of which two are identical.  One of them was presumably meant
# to be another column (e.g. 'Total Length of Bwd Packets'); it is left as is
# because changing it alters the feature layout downstream consumers read.
# NOTE(review): the normalization_parameters.ini added together with this
# script has no '<name>_mean' entry for 'Subflow Fwd Packets',
# 'Subflow Fwd Bytes', 'Init_Win_bytes_forward' and 'Fwd PSH Flags', so those
# four are emitted unnormalised - confirm that this is intended.
chosen_connection_features = [
    'Source Port', 'Destination Port', 'Protocol', 'Bwd Packet Length Min',
    'Subflow Fwd Packets', 'Total Length of Fwd Packets',
    'Fwd Packet Length Mean', 'Total Length of Fwd Packets',
    'Fwd Packet Length Std', 'Fwd IAT Min', 'Flow IAT Min', 'Flow IAT Mean',
    'Bwd Packet Length Std', 'Subflow Fwd Bytes', 'Flow Duration',
    'Flow IAT Std', 'Active Min', 'Active Mean', 'Bwd IAT Mean',
    'Subflow Bwd Bytes', 'Init_Win_bytes_forward', 'ACK Flag Count',
    'Fwd PSH Flags', 'SYN Flag Count', 'Flow Packets/s', 'PSH Flag Count',
    'Average Packet Size',
]
# Feature name -> position in the vector (a duplicated name keeps its last
# position).
chosen_features_dict = {
    name: position for position, name in enumerate(chosen_connection_features)
}

# Length of the placeholder feature vector attached to every IP node.
IP_FEATURE_SIZE = 128
# Number of traces grouped into one graph unless the caller says otherwise.
DEFAULT_WINDOW_SIZE = 100
# Number of digits produced by transform_ips (4 octets x 3 digits).
IP_DIGITS = 12

# --------------------------------------


@functools.lru_cache(maxsize=None)
def _normalization_stats(name):
    """Return ``(mean, std)`` for *name*, or ``None`` when it is not usable.

    The pair is unusable when either entry is missing from the ``[PARAMS]``
    section, when the mean is 0 (such features were never normalised) or when
    the std is 0 (dividing by it would fail).  Results are cached because the
    lookup is repeated for every value of every trace; the cache assumes
    ``params_norm`` is not modified after the first call.

    Raises:
        KeyError: if ``params_norm`` has no ``[PARAMS]`` section at all.
    """
    section = params_norm['PARAMS']
    mean_key = name + '_mean'
    std_key = name + '_std'
    if mean_key not in section or std_key not in section:
        return None
    mean = float(section[mean_key])
    std = float(section[std_key])
    if mean == 0 or std == 0:
        return None
    return mean, std


def normalization_function(feature, name):
    """Standardise *feature* as ``(feature - mean) / std``.

    Args:
        feature: numeric value of the feature.
        name: feature name used to look up ``<name>_mean`` / ``<name>_std``.

    Returns:
        The standardised value, or *feature* unchanged when *name* is not one
        of ``chosen_connection_features`` or has no usable statistics.
    """
    if name not in chosen_features_dict:
        return feature
    stats = _normalization_stats(name)
    if stats is None:
        return feature
    mean, std = stats
    return (feature - mean) / std


def transform_ips(ip):
    """Encode a dotted IPv4 string as a list of 12 floats, one per digit.

    Each octet is left-padded with zeros to three characters, e.g.
    ``'10.0.0.1'`` -> ``'010000000001'`` -> ``[0.0, 1.0, 0.0, ...]``.

    Returns:
        A list of exactly 12 floats.  Anything that is not four dot-separated
        groups of one to three digits yields twelve zeros, so the result
        always has the same length.
    """
    fallback = [0.0] * IP_DIGITS
    octets = ip.split('.')
    if len(octets) != 4 or any(not 1 <= len(octet) <= 3 for octet in octets):
        return fallback

    digits = ''.join(octet.rjust(3, '0') for octet in octets)
    try:
        return [float(digit) for digit in digits]
    except ValueError:
        # A non-numeric character somewhere in the address.
        return fallback


def get_feature(trace, feature_name, parse=True):
    """Read one feature from a trace (a CSV row given as a list of strings).

    Args:
        trace: the row.
        feature_name: one of the names in ``features``.
        parse: when False the raw cell is returned untouched.

    Returns:
        With ``parse=True``:
          * ``'Label'``: index of the last cell in ``attack_names``, or
            ``None`` when the label is unknown;
          * names containing ``'ID'``: the raw string;
          * names containing ``'IP'``: the ``transform_ips`` encoding;
          * anything else: the cell as a float, with 0.0 substituted for
            cells that are not numbers or are NaN / infinite.

    Raises:
        KeyError: unknown *feature_name*.
        IndexError: *trace* is shorter than the requested column.
    """
    if not parse:
        return trace[features_dict[feature_name]]

    if feature_name == 'Label':
        return attacks_dict.get(trace[-1])

    raw = trace[features_dict[feature_name]]
    if 'ID' in feature_name:
        return raw
    if 'IP' in feature_name:
        return transform_ips(raw)

    try:
        value = float(raw)
    except (TypeError, ValueError):
        return 0.0
    # NaN never compares equal to anything, so it has to be detected with
    # isfinite(); json.dump would otherwise emit the non-standard token NaN.
    return value if math.isfinite(value) else 0.0


def get_connection_features(trace, final_feature):
    """Build the attribute dict of a connection node.

    Returns:
        ``{'Label': final_feature, 'conect_feats': [...]}`` where the list
        holds the normalised value of every entry of
        ``chosen_connection_features``, in that order.
    """
    normalised = [
        normalization_function(get_feature(trace, name), name)
        for name in chosen_connection_features
    ]
    return {'Label': final_feature, 'conect_feats': normalised}


def traces_to_graph(traces):
    """Turn a window of traces into a ``networkx.DiGraph``.

    Nodes:
      * one per distinct source / destination IP string, with
        ``entity='ip'`` and a placeholder ``ip_feats`` vector of ones;
      * one per trace, named ``con_<position>``, with ``entity='connection'``,
        the one-hot ``Label`` and the ``conect_feats`` vector.

    Each connection is linked to both of its endpoints in both directions.
    """
    G = nx.DiGraph()
    for position, trace in enumerate(traces):
        dst_ip = get_feature(trace, 'Destination IP', parse=False)
        src_ip = get_feature(trace, 'Source IP', parse=False)

        # For now the IP features are a constant vector of ones.
        for ip in (dst_ip, src_ip):
            if ip not in G:
                G.add_node(ip, entity='ip',
                           ip_feats=[1.0] * IP_FEATURE_SIZE)

        # One-hot label.  An unknown label comes back as None and must leave
        # the vector all zeros: indexing a NumPy array with None (as the
        # previous implementation did) selects every element instead.
        label_index = get_feature(trace, 'Label')
        one_hot = [0.0] * len(attack_names)
        if label_index is not None and label_index != -1:
            one_hot[label_index] = 1.0

        connection = 'con_' + str(position)
        attributes = get_connection_features(trace, one_hot)
        attributes['entity'] = 'connection'
        G.add_node(connection, **attributes)

        # Connection <-> endpoint edges, in both directions.
        G.add_edge(connection, dst_ip)
        G.add_edge(connection, src_ip)
        G.add_edge(dst_ip, connection)
        G.add_edge(src_ip, connection)
    return G


def generator(path, window_size=DEFAULT_WINDOW_SIZE, drop_remainder=True):
    """Yield one graph per window of rows for every ``*.csv`` file in *path*.

    Args:
        path: directory that contains the CSV files.
        window_size: number of rows grouped into one graph.
        drop_remainder: when True (default) trailing rows of a file that do
            not fill a whole window are discarded; when False they are
            emitted as a final, smaller graph.

    Files are visited in sorted order so the output is reproducible.  Rows
    with fewer than two cells are skipped.  A window is emitted as soon as it
    is full, so a file whose row count is an exact multiple of *window_size*
    no longer loses its last window.

    Raises:
        ValueError: if *window_size* is smaller than 1.
    """
    if window_size < 1:
        raise ValueError('window_size must be at least 1')

    for csv_path in sorted(glob.glob(os.path.join(path, '*.csv'))):
        print("Processing file:", csv_path)
        # newline='' is what the csv module asks for on files it reads.
        with open(csv_path, encoding="utf8", errors='ignore',
                  newline='') as csvfile:
            window = []
            for row in csv.reader(csvfile, delimiter=',', quotechar='|'):
                if len(row) <= 1:
                    continue
                window.append(row)
                if len(window) >= window_size:
                    yield traces_to_graph(window)
                    window = []
            if window and not drop_remainder:
                yield traces_to_graph(window)


def _write_batch(graphs, output_path, file_index):
    """Dump *graphs* to ``<output_path>/data_<file_index>.json``."""
    target = os.path.join(output_path, 'data_' + str(file_index) + '.json')
    with open(target, 'w') as json_file:
        json.dump(graphs, json_file)


def migrate_dataset(input_path, output_path, max_per_file=999999):
    """Convert every CSV under *input_path* into JSON files of graphs.

    Args:
        input_path: directory with the source ``*.csv`` files.
        output_path: directory for the ``data_<n>.json`` files; created when
            missing.
        max_per_file: maximum number of graphs per JSON file; ``None`` puts
            everything into a single file.  Values below 1 behave like 1.

    At least one file is always written, so an input without any complete
    window produces ``data_0.json`` containing ``[]``.  Errors raised while
    reading or converting the data propagate to the caller instead of being
    mistaken for the end of the input.
    """
    print("Starting to do the migration...")
    os.makedirs(output_path, exist_ok=True)

    batch = []
    file_ctr = 0
    for G in generator(input_path):
        batch.append(json_graph.node_link_data(G))
        if max_per_file is not None and len(batch) >= max_per_file:
            _write_batch(batch, output_path, file_ctr)
            batch = []
            file_ctr += 1

    # Save whatever is left; skip an empty trailing file unless nothing has
    # been written at all.
    if batch or file_ctr == 0:
        _write_batch(batch, output_path, file_ctr)


def main():
    """Run the migration with the directories configured in ./migrate.ini."""
    if not params.has_section('DIRECTORIES'):
        raise SystemExit(
            "./migrate.ini was not found or has no [DIRECTORIES] section")

    directories = params['DIRECTORIES']
    input_path = os.path.abspath(directories['original_dataset_path'])
    output_path = os.path.abspath(directories['output_path'])

    # Create the output directory if necessary.
    os.makedirs(output_path, exist_ok=True)

    migrate_dataset(input_path, output_path)


if __name__ == "__main__":
    main()