{ "cells": [ { "cell_type": "markdown", "id": "driving-spider", "metadata": {}, "source": [ "####
KDD CUP 1999
" ] }, { "cell_type": "markdown", "id": "filled-objective", "metadata": {}, "source": [ "#### Load Training and Test Datta" ] }, { "cell_type": "code", "execution_count": 1, "id": "underlying-performer", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "training_data = pd.read_csv('train.txt', names = [\"duration\",\"protocol_type\",\"service\",\"flag\",\"src_bytes\",\"dst_bytes\",\"land\",\n", "\"wrong_fragment\",\"urgent\",\"hot\",\"num_failed_logins\",\"logged_in\",\n", "\"num_compromised\",\"root_shell\",\"su_attempted\",\"num_root\",\"num_file_creations\",\n", "\"num_shells\",\"num_access_files\",\"num_outbound_cmds\",\"is_host_login\",\n", "\"is_guest_login\",\"count\",\"srv_count\",\"serror_rate\", \"srv_serror_rate\",\n", "\"rerror_rate\",\"srv_rerror_rate\",\"same_srv_rate\", \"diff_srv_rate\", \"srv_diff_host_rate\",\"dst_host_count\",\"dst_host_srv_count\",\"dst_host_same_srv_rate\",\n", "\"dst_host_diff_srv_rate\",\"dst_host_same_src_port_rate\",\n", "\"dst_host_srv_diff_host_rate\",\"dst_host_serror_rate\",\"dst_host_srv_serror_rate\",\n", "\"dst_host_rerror_rate\",\"dst_host_srv_rerror_rate\",\"attack\", \"last_flag\"])\n", "\n", "test_data = pd.read_csv('test.txt', names = [\"duration\",\"protocol_type\",\"service\",\"flag\",\"src_bytes\",\"dst_bytes\",\"land\",\n", "\"wrong_fragment\",\"urgent\",\"hot\",\"num_failed_logins\",\"logged_in\",\n", "\"num_compromised\",\"root_shell\",\"su_attempted\",\"num_root\",\"num_file_creations\",\n", "\"num_shells\",\"num_access_files\",\"num_outbound_cmds\",\"is_host_login\",\n", "\"is_guest_login\",\"count\",\"srv_count\",\"serror_rate\", \"srv_serror_rate\",\n", "\"rerror_rate\",\"srv_rerror_rate\",\"same_srv_rate\", \"diff_srv_rate\", \"srv_diff_host_rate\",\"dst_host_count\",\"dst_host_srv_count\",\"dst_host_same_srv_rate\",\n", "\"dst_host_diff_srv_rate\",\"dst_host_same_src_port_rate\",\n", "\"dst_host_srv_diff_host_rate\",\"dst_host_serror_rate\",\"dst_host_srv_serror_rate\",\n", "\"dst_host_rerror_rate\",\"dst_host_srv_rerror_rate\",\"attack\", \"last_flag\"])" ] }, { "cell_type": "code", "execution_count": 2, "id": "announced-munich", "metadata": {}, "outputs": [], "source": [ "def get_label(label):\n", " if label == 'normal':\n", " return 1;\n", " else:\n", " return -1;" ] }, { "cell_type": "code", "execution_count": 3, "id": "junior-integrity", "metadata": {}, "outputs": [], "source": [ "train_y = training_data['attack'].apply(get_label)\n", "test_y = test_data['attack'].apply(get_label)" ] }, { "cell_type": "code", "execution_count": 4, "id": "referenced-cleveland", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
durationprotocol_typeserviceflagsrc_bytesdst_byteslandwrong_fragmenturgenthot...dst_host_same_srv_ratedst_host_diff_srv_ratedst_host_same_src_port_ratedst_host_srv_diff_host_ratedst_host_serror_ratedst_host_srv_serror_ratedst_host_rerror_ratedst_host_srv_rerror_rateattacklast_flag
00tcpftp_dataSF49100000...0.170.030.170.000.000.000.050.00normal20
10udpotherSF14600000...0.000.600.880.000.000.000.000.00normal15
20tcpprivateS0000000...0.100.050.000.001.001.000.000.00neptune19
30tcphttpSF23281530000...1.000.000.030.040.030.010.000.01normal21
40tcphttpSF1994200000...1.000.000.000.000.000.000.000.00normal21
\n", "

5 rows × 43 columns

\n", "
" ], "text/plain": [ " duration protocol_type service flag src_bytes dst_bytes land \\\n", "0 0 tcp ftp_data SF 491 0 0 \n", "1 0 udp other SF 146 0 0 \n", "2 0 tcp private S0 0 0 0 \n", "3 0 tcp http SF 232 8153 0 \n", "4 0 tcp http SF 199 420 0 \n", "\n", " wrong_fragment urgent hot ... dst_host_same_srv_rate \\\n", "0 0 0 0 ... 0.17 \n", "1 0 0 0 ... 0.00 \n", "2 0 0 0 ... 0.10 \n", "3 0 0 0 ... 1.00 \n", "4 0 0 0 ... 1.00 \n", "\n", " dst_host_diff_srv_rate dst_host_same_src_port_rate \\\n", "0 0.03 0.17 \n", "1 0.60 0.88 \n", "2 0.05 0.00 \n", "3 0.00 0.03 \n", "4 0.00 0.00 \n", "\n", " dst_host_srv_diff_host_rate dst_host_serror_rate \\\n", "0 0.00 0.00 \n", "1 0.00 0.00 \n", "2 0.00 1.00 \n", "3 0.04 0.03 \n", "4 0.00 0.00 \n", "\n", " dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate \\\n", "0 0.00 0.05 0.00 \n", "1 0.00 0.00 0.00 \n", "2 1.00 0.00 0.00 \n", "3 0.01 0.00 0.01 \n", "4 0.00 0.00 0.00 \n", "\n", " attack last_flag \n", "0 normal 20 \n", "1 normal 15 \n", "2 neptune 19 \n", "3 normal 21 \n", "4 normal 21 \n", "\n", "[5 rows x 43 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_data.head()" ] }, { "cell_type": "markdown", "id": "laden-inspiration", "metadata": {}, "source": [ "#### Visualization" ] }, { "cell_type": "code", "execution_count": 5, "id": "fuzzy-complaint", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 125973 entries, 0 to 125972\n", "Data columns (total 43 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 duration 125973 non-null int64 \n", " 1 protocol_type 125973 non-null object \n", " 2 service 125973 non-null object \n", " 3 flag 125973 non-null object \n", " 4 src_bytes 125973 non-null int64 \n", " 5 dst_bytes 125973 non-null int64 \n", " 6 land 125973 non-null int64 \n", " 7 wrong_fragment 125973 non-null int64 \n", " 8 urgent 125973 non-null int64 \n", " 9 hot 125973 non-null int64 \n", " 10 num_failed_logins 125973 non-null int64 \n", " 11 logged_in 125973 non-null int64 \n", " 12 num_compromised 125973 non-null int64 \n", " 13 root_shell 125973 non-null int64 \n", " 14 su_attempted 125973 non-null int64 \n", " 15 num_root 125973 non-null int64 \n", " 16 num_file_creations 125973 non-null int64 \n", " 17 num_shells 125973 non-null int64 \n", " 18 num_access_files 125973 non-null int64 \n", " 19 num_outbound_cmds 125973 non-null int64 \n", " 20 is_host_login 125973 non-null int64 \n", " 21 is_guest_login 125973 non-null int64 \n", " 22 count 125973 non-null int64 \n", " 23 srv_count 125973 non-null int64 \n", " 24 serror_rate 125973 non-null float64\n", " 25 srv_serror_rate 125973 non-null float64\n", " 26 rerror_rate 125973 non-null float64\n", " 27 srv_rerror_rate 125973 non-null float64\n", " 28 same_srv_rate 125973 non-null float64\n", " 29 diff_srv_rate 125973 non-null float64\n", " 30 srv_diff_host_rate 125973 non-null float64\n", " 31 dst_host_count 125973 non-null int64 \n", " 32 dst_host_srv_count 125973 non-null int64 \n", " 33 dst_host_same_srv_rate 125973 non-null float64\n", " 34 dst_host_diff_srv_rate 125973 non-null float64\n", " 35 dst_host_same_src_port_rate 125973 non-null float64\n", " 36 dst_host_srv_diff_host_rate 125973 non-null float64\n", " 37 dst_host_serror_rate 125973 non-null float64\n", " 38 dst_host_srv_serror_rate 125973 non-null float64\n", " 39 dst_host_rerror_rate 125973 non-null float64\n", " 40 dst_host_srv_rerror_rate 125973 non-null float64\n", " 41 attack 125973 non-null object \n", " 42 last_flag 125973 non-null int64 \n", "dtypes: float64(15), int64(24), object(4)\n", "memory usage: 41.3+ MB\n" ] } ], "source": [ "training_data.info()" ] }, { "cell_type": "code", "execution_count": 6, "id": "independent-booking", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
durationsrc_bytesdst_byteslandwrong_fragmenturgenthotnum_failed_loginslogged_innum_compromised...dst_host_srv_countdst_host_same_srv_ratedst_host_diff_srv_ratedst_host_same_src_port_ratedst_host_srv_diff_host_ratedst_host_serror_ratedst_host_srv_serror_ratedst_host_rerror_ratedst_host_srv_rerror_ratelast_flag
count125973.000001.259730e+051.259730e+05125973.000000125973.000000125973.000000125973.000000125973.000000125973.000000125973.000000...125973.000000125973.000000125973.000000125973.000000125973.000000125973.000000125973.000000125973.000000125973.000000125973.000000
mean287.144654.556674e+041.977911e+040.0001980.0226870.0001110.2044090.0012220.3957360.279250...115.6530050.5212420.0829510.1483790.0325420.2844520.2784850.1188320.12024019.504060
std2604.515315.870331e+064.021269e+060.0140860.2535300.0143662.1499680.0452390.48901023.942042...110.7027410.4489490.1889220.3089970.1125640.4447840.4456690.3065570.3194592.291503
min0.000000.000000e+000.000000e+000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%0.000000.000000e+000.000000e+000.0000000.0000000.0000000.0000000.0000000.0000000.000000...10.0000000.0500000.0000000.0000000.0000000.0000000.0000000.0000000.00000018.000000
50%0.000004.400000e+010.000000e+000.0000000.0000000.0000000.0000000.0000000.0000000.000000...63.0000000.5100000.0200000.0000000.0000000.0000000.0000000.0000000.00000020.000000
75%0.000002.760000e+025.160000e+020.0000000.0000000.0000000.0000000.0000001.0000000.000000...255.0000001.0000000.0700000.0600000.0200001.0000001.0000000.0000000.00000021.000000
max42908.000001.379964e+091.309937e+091.0000003.0000003.00000077.0000005.0000001.0000007479.000000...255.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.00000021.000000
\n", "

8 rows × 39 columns

\n", "
" ], "text/plain": [ " duration src_bytes dst_bytes land \\\n", "count 125973.00000 1.259730e+05 1.259730e+05 125973.000000 \n", "mean 287.14465 4.556674e+04 1.977911e+04 0.000198 \n", "std 2604.51531 5.870331e+06 4.021269e+06 0.014086 \n", "min 0.00000 0.000000e+00 0.000000e+00 0.000000 \n", "25% 0.00000 0.000000e+00 0.000000e+00 0.000000 \n", "50% 0.00000 4.400000e+01 0.000000e+00 0.000000 \n", "75% 0.00000 2.760000e+02 5.160000e+02 0.000000 \n", "max 42908.00000 1.379964e+09 1.309937e+09 1.000000 \n", "\n", " wrong_fragment urgent hot num_failed_logins \\\n", "count 125973.000000 125973.000000 125973.000000 125973.000000 \n", "mean 0.022687 0.000111 0.204409 0.001222 \n", "std 0.253530 0.014366 2.149968 0.045239 \n", "min 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 0.000000 \n", "50% 0.000000 0.000000 0.000000 0.000000 \n", "75% 0.000000 0.000000 0.000000 0.000000 \n", "max 3.000000 3.000000 77.000000 5.000000 \n", "\n", " logged_in num_compromised ... dst_host_srv_count \\\n", "count 125973.000000 125973.000000 ... 125973.000000 \n", "mean 0.395736 0.279250 ... 115.653005 \n", "std 0.489010 23.942042 ... 110.702741 \n", "min 0.000000 0.000000 ... 0.000000 \n", "25% 0.000000 0.000000 ... 10.000000 \n", "50% 0.000000 0.000000 ... 63.000000 \n", "75% 1.000000 0.000000 ... 255.000000 \n", "max 1.000000 7479.000000 ... 255.000000 \n", "\n", " dst_host_same_srv_rate dst_host_diff_srv_rate \\\n", "count 125973.000000 125973.000000 \n", "mean 0.521242 0.082951 \n", "std 0.448949 0.188922 \n", "min 0.000000 0.000000 \n", "25% 0.050000 0.000000 \n", "50% 0.510000 0.020000 \n", "75% 1.000000 0.070000 \n", "max 1.000000 1.000000 \n", "\n", " dst_host_same_src_port_rate dst_host_srv_diff_host_rate \\\n", "count 125973.000000 125973.000000 \n", "mean 0.148379 0.032542 \n", "std 0.308997 0.112564 \n", "min 0.000000 0.000000 \n", "25% 0.000000 0.000000 \n", "50% 0.000000 0.000000 \n", "75% 0.060000 0.020000 \n", "max 1.000000 1.000000 \n", "\n", " dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate \\\n", "count 125973.000000 125973.000000 125973.000000 \n", "mean 0.284452 0.278485 0.118832 \n", "std 0.444784 0.445669 0.306557 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 \n", "50% 0.000000 0.000000 0.000000 \n", "75% 1.000000 1.000000 0.000000 \n", "max 1.000000 1.000000 1.000000 \n", "\n", " dst_host_srv_rerror_rate last_flag \n", "count 125973.000000 125973.000000 \n", "mean 0.120240 19.504060 \n", "std 0.319459 2.291503 \n", "min 0.000000 0.000000 \n", "25% 0.000000 18.000000 \n", "50% 0.000000 20.000000 \n", "75% 0.000000 21.000000 \n", "max 1.000000 21.000000 \n", "\n", "[8 rows x 39 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_data.describe()" ] }, { "cell_type": "code", "execution_count": 7, "id": "developing-greeting", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ],\n", " [,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ],\n", " [,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ],\n", " [,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ],\n", " [,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ],\n", " [,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ],\n", " [,\n", " ,\n", " , ,\n", " , ]], dtype=object)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "training_data.hist(bins = 50, figsize = (20, 15))" ] }, { "cell_type": "code", "execution_count": 8, "id": "advisory-boost", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',\n", " 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',\n", " 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',\n", " 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',\n", " 'num_access_files', 'num_outbound_cmds', 'is_host_login',\n", " 'is_guest_login', 'count', 'srv_count', 'serror_rate',\n", " 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',\n", " 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',\n", " 'dst_host_srv_count', 'dst_host_same_srv_rate',\n", " 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',\n", " 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',\n", " 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',\n", " 'dst_host_srv_rerror_rate', 'attack', 'last_flag'],\n", " dtype='object')" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_data.columns" ] }, { "cell_type": "code", "execution_count": 9, "id": "informal-advertiser", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "normal 67343\n", "neptune 41214\n", "satan 3633\n", "ipsweep 3599\n", "portsweep 2931\n", "smurf 2646\n", "nmap 1493\n", "back 956\n", "teardrop 892\n", "warezclient 890\n", "pod 201\n", "guess_passwd 53\n", "buffer_overflow 30\n", "warezmaster 20\n", "land 18\n", "imap 11\n", "rootkit 10\n", "loadmodule 9\n", "ftp_write 8\n", "multihop 7\n", "phf 4\n", "perl 3\n", "spy 2\n", "Name: attack, dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_data['attack'].value_counts()" ] }, { "cell_type": "markdown", "id": "outer-indicator", "metadata": {}, "source": [ "#### Preprocessing" ] }, { "cell_type": "code", "execution_count": 10, "id": "intellectual-picnic", "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.preprocessing import QuantileTransformer\n", "from sklearn.feature_extraction.text import CountVectorizer" ] }, { "cell_type": "code", "execution_count": 11, "id": "minimal-deputy", "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", "\n", "class Get_top_categories(BaseEstimator, TransformerMixin):\n", " \"\"\"Create a class to keep the top categories, the rest categories are labeled as 'other'\n", " \"\"\"\n", " \n", " def __init__(self, top_num = 10): # no *args or **kargs\n", " \"\"\"Create a class\n", " \n", " Arg:\n", " top_num (int), the number of top categories kept, default number is 10\n", " \"\"\"\n", " self.top_num = top_num\n", " \n", " def fit(self, X, y = None):\n", " \"\"\"Fit the class\n", " \n", " Arg:\n", " X (Pandas.Series), a column of a Pandas.DataFrame\n", " y (None), not used\n", " \"\"\"\n", " temp = X.value_counts()\n", " self.columns = list(temp[:self.top_num].index)\n", " return self\n", " \n", " def containe(self, s):\n", " \"\"\"Process record\n", " \n", " Arg:\n", " s (str), a recod in the categorical column\n", " \n", " Return:\n", " str, return the same string is a recod in the top category list; otherwise, return 'other'\n", " \"\"\"\n", " if s in self.columns:\n", " return s\n", " else:\n", " return 'other_category'\n", " \n", " def transform(self, X):\n", " \"\"\"Convert a specific categorical column\n", " \n", " Arg:\n", " X (Pandas.Series), a column of a Pandas.DataFrame\n", " \n", " Return:\n", " Pandas.Series, processed column\n", " \"\"\"\n", " temp = X.apply(self.containe)\n", " return temp" ] }, { "cell_type": "code", "execution_count": 12, "id": "saved-musical", "metadata": {}, "outputs": [], "source": [ "class DoNothing(BaseEstimator, TransformerMixin):\n", " \"\"\"Do not change anything\"\"\"\n", " def __init__(self):\n", " pass\n", " def fit(self, X, y=None):\n", " return self\n", " def transform(self, X):\n", " temp = X.copy()\n", " return temp" ] }, { "cell_type": "code", "execution_count": 13, "id": "toxic-certification", "metadata": {}, "outputs": [], "source": [ "# process numerical features\n", "num_pipeline = Pipeline([\n", " ('std_scaler', StandardScaler()),\n", "])\n", "\n", "num_pipeline_gaussian = Pipeline([\n", " ('quantile', QuantileTransformer(output_distribution='normal', random_state=0)),\n", " #('std_scaler', StandardScaler()), \n", "])" ] }, { "cell_type": "code", "execution_count": 14, "id": "unable-image", "metadata": {}, "outputs": [], "source": [ "# process categorical features with bag of words\n", "cat_pipeline = Pipeline([\n", " ('bag_of_words', CountVectorizer()),\n", "])\n", "\n", "cat_pipeline_five = Pipeline([\n", " ('more_than_five', Get_top_categories(top_num=5)),\n", " ('bag_of_words', CountVectorizer()), \n", "])\n", "\n", "cat_pipeline_ten = Pipeline([\n", " ('more_than_ten', Get_top_categories()),\n", " ('bag_of_words', CountVectorizer()), \n", "])" ] }, { "cell_type": "code", "execution_count": 15, "id": "charged-applicant", "metadata": {}, "outputs": [], "source": [ "# do not change features\n", "do_nothing_pipeline = Pipeline([\n", " ('do_nothing', DoNothing())\n", "])" ] }, { "cell_type": "code", "execution_count": 16, "id": "straight-intake", "metadata": {}, "outputs": [], "source": [ "from sklearn.compose import ColumnTransformer\n", "\n", "preprocess_pipeline = ColumnTransformer([\n", " (\"num_pipeline_guassion\", num_pipeline_gaussian, ['duration', 'src_bytes', 'dst_bytes', 'hot', 'num_compromised', 'num_root', 'num_file_creations', 'num_access_files', 'count', 'srv_count', 'dst_host_count', 'dst_host_srv_count']), # 3, pass a DataFrame to num_pipeline\n", " (\"cat_pipeline_protocol_type\", cat_pipeline, 'protocol_type'), # 3, pass a Series to cat_pipeline\n", " (\"cat_pipeline_service\", cat_pipeline_ten, 'service'), # 11, pass a Series to cat_pipeline_ten \n", " (\"cat_pipeline_flag\", cat_pipeline_five, 'flag'), # 6, pass a Series to cat_pipeline_ten\n", " (\"do_nothing\", do_nothing_pipeline, ['land', 'wrong_fragment', 'urgent', 'num_failed_logins', 'logged_in', 'root_shell', 'su_attempted', 'num_shells', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']) # 1, pass a DataFrame to num_pipeline\n", " ])" ] }, { "cell_type": "code", "execution_count": 17, "id": "fourth-texture", "metadata": {}, "outputs": [], "source": [ "# num_pipeline_gaussian, 12\n", "# cat_pipeline_protocol_type, 3\n", "# cat_pipeline_service, 11\n", "# cat_pipeline_flag, 6\n", "# do_nothing, 26\n", "train_x = preprocess_pipeline.fit_transform(training_data)\n", "column1 = ['duration', 'src_bytes', 'dst_bytes', 'hot', 'num_compromised', 'num_root', 'num_file_creations', 'num_access_files', 'count', 'srv_count', 'dst_host_count', 'dst_host_srv_count']\n", "column2 = ['p0', 'p1', 'p2']\n", "column3 = ['s0', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10']\n", "column4 = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5']\n", "column5 = ['land', 'wrong_fragment', 'urgent', 'num_failed_logins', 'logged_in', 'root_shell', 'su_attempted', 'num_shells', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']\n", "columns = column1+column2+column3+column4+column5\n", "train_x = pd.DataFrame(train_x, columns=columns)" ] }, { "cell_type": "code", "execution_count": 18, "id": "ef46b640", "metadata": {}, "outputs": [], "source": [ "test_x = preprocess_pipeline.transform(test_data)" ] }, { "cell_type": "markdown", "id": "6fa1a184", "metadata": {}, "source": [ "#### Local Outlier Factor" ] }, { "cell_type": "code", "execution_count": 76, "id": "732195c1", "metadata": {}, "outputs": [], "source": [ "from sklearn.neighbors import LocalOutlierFactor\n", "lof = LocalOutlierFactor(novelty=True, n_neighbors = 200, algorithm = 'auto', metric = 'manhattan')\n", "lof.fit(train_x[train_y==1])\n", "predict = lof.predict(train_x)" ] }, { "cell_type": "code", "execution_count": 77, "id": "9d7a768e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Malicious_trueNormal_true
Malicious_true561472483
Normal_true732860015
\n", "
" ], "text/plain": [ " Malicious_true Normal_true\n", "Malicious_true 56147 2483\n", "Normal_true 7328 60015" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import confusion_matrix\n", "cf_matrix = confusion_matrix(train_y, predict)\n", "pd.DataFrame(cf_matrix, index = ['Malicious_true', 'Normal_true'], columns = ['Malicious_true', 'Normal_true'])" ] }, { "cell_type": "code", "execution_count": 78, "id": "bdedc3f9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " normal 0.88 0.96 0.92 58630\n", " malicious 0.96 0.89 0.92 67343\n", "\n", " accuracy 0.92 125973\n", " macro avg 0.92 0.92 0.92 125973\n", "weighted avg 0.93 0.92 0.92 125973\n", "\n" ] } ], "source": [ "from sklearn.metrics import classification_report\n", "print(classification_report(train_y, predict, target_names=['normal', 'malicious']))" ] }, { "cell_type": "code", "execution_count": 79, "id": "2e4cdf74", "metadata": {}, "outputs": [], "source": [ "test_pred = lof.predict(test_x)" ] }, { "cell_type": "code", "execution_count": 80, "id": "4c8b024f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " normal 0.88 0.88 0.88 12833\n", " malicious 0.84 0.84 0.84 9711\n", "\n", " accuracy 0.86 22544\n", " macro avg 0.86 0.86 0.86 22544\n", "weighted avg 0.86 0.86 0.86 22544\n", "\n" ] } ], "source": [ "print(classification_report(test_y, test_pred, target_names=['normal', 'malicious']))" ] }, { "cell_type": "markdown", "id": "outstanding-validity", "metadata": {}, "source": [ "#### Reference\n", "* Network Anomaly Detection" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.4" } }, "nbformat": 4, "nbformat_minor": 5 }