import pandas as pd
# Column names for the 43 fields in train.txt
columns = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land",
           "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
           "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
           "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
           "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate",
           "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
           "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
           "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
           "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
           "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "attack", "last_flag"]
training_data = pd.read_csv('train.txt', names=columns)
training_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype
---  ------                       --------------   -----
 0   duration                     125973 non-null  int64
 1   protocol_type                125973 non-null  object
 2   service                      125973 non-null  object
 3   flag                         125973 non-null  object
 4   src_bytes                    125973 non-null  int64
 5   dst_bytes                    125973 non-null  int64
 6   land                         125973 non-null  int64
 7   wrong_fragment               125973 non-null  int64
 8   urgent                       125973 non-null  int64
 9   hot                          125973 non-null  int64
 10  num_failed_logins            125973 non-null  int64
 11  logged_in                    125973 non-null  int64
 12  num_compromised              125973 non-null  int64
 13  root_shell                   125973 non-null  int64
 14  su_attempted                 125973 non-null  int64
 15  num_root                     125973 non-null  int64
 16  num_file_creations           125973 non-null  int64
 17  num_shells                   125973 non-null  int64
 18  num_access_files             125973 non-null  int64
 19  num_outbound_cmds            125973 non-null  int64
 20  is_host_login                125973 non-null  int64
 21  is_guest_login               125973 non-null  int64
 22  count                        125973 non-null  int64
 23  srv_count                    125973 non-null  int64
 24  serror_rate                  125973 non-null  float64
 25  srv_serror_rate              125973 non-null  float64
 26  rerror_rate                  125973 non-null  float64
 27  srv_rerror_rate              125973 non-null  float64
 28  same_srv_rate                125973 non-null  float64
 29  diff_srv_rate                125973 non-null  float64
 30  srv_diff_host_rate           125973 non-null  float64
 31  dst_host_count               125973 non-null  int64
 32  dst_host_srv_count           125973 non-null  int64
 33  dst_host_same_srv_rate       125973 non-null  float64
 34  dst_host_diff_srv_rate       125973 non-null  float64
 35  dst_host_same_src_port_rate  125973 non-null  float64
 36  dst_host_srv_diff_host_rate  125973 non-null  float64
 37  dst_host_serror_rate         125973 non-null  float64
 38  dst_host_srv_serror_rate     125973 non-null  float64
 39  dst_host_rerror_rate         125973 non-null  float64
 40  dst_host_srv_rerror_rate     125973 non-null  float64
 41  attack                       125973 non-null  object
 42  last_flag                    125973 non-null  int64
dtypes: float64(15), int64(24), object(4)
memory usage: 41.3+ MB
training_data['protocol_type'].value_counts()
tcp     102689
udp      14993
icmp      8291
Name: protocol_type, dtype: int64
training_data['service'].value_counts()
http         40338
private      21853
domain_u      9043
smtp          7313
ftp_data      6860
             ...
tftp_u           3
aol              2
http_8001        2
harvest          2
http_2784        1
Name: service, Length: 70, dtype: int64
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit_transform(training_data[['service']])
import category_encoders as ce
encoder = ce.ordinal.OrdinalEncoder()
encoder.fit_transform(training_data[['service']])
# LabelEncoder is intended for target labels and expects a 1-D array, so pass the column as a Series
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit_transform(training_data['service'])
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit_transform(training_data[['protocol_type']])
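# The scikit-learn OneHotEncoder returns a sparse matrix by default; the fitted column names
# can be inspected with get_feature_names_out()
encoder.get_feature_names_out()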
import category_encoders as ce
encoder = ce.one_hot.OneHotEncoder()
encoder.fit_transform(training_data[['protocol_type']])
import category_encoders as ce
encoder = ce.HashingEncoder(n_components=8) # use 8 bins
encoder.fit_transform(training_data[['service']])
import category_encoders as ce
encoder = ce.count.CountEncoder()
encoder.fit_transform(training_data[['service']])
from feature_engine.encoding import CountFrequencyEncoder
encoder = CountFrequencyEncoder(encoding_method='count')
encoder.fit_transform(training_data[['service']])
# Frequency Encoder
from feature_engine.encoding import CountFrequencyEncoder
encoder = CountFrequencyEncoder(encoding_method='frequency')
encoder.fit_transform(training_data[['service']])
# Sum (deviation) Encoder, demonstrated on a small example DataFrame
data = pd.DataFrame({'City': ['Delhi', 'Mumbai', 'Hyderabad', 'Chennai', 'Bangalore', 'Delhi', 'Hyderabad']})
display(data)
encoder = ce.sum_coding.SumEncoder(cols=['City'], verbose=0)
encoder.fit_transform(data)
import category_encoders as ce
encoder = ce.helmert.HelmertEncoder()
encoder.fit_transform(training_data[['service']])
import category_encoders as ce
encoder = ce.binary.BinaryEncoder()
encoder.fit_transform(training_data[['service']])
import category_encoders as ce
encoder = ce.basen.BaseNEncoder(base=5)
encoder.fit_transform(training_data[['service']])
from feature_engine.encoding import RareLabelEncoder
# tol, the minimum frequency a label should have to be considered frequent
# n_categories, the minimum number of categories a variable should have for the encoder to find frequent labels
encoder = RareLabelEncoder(n_categories = 10, tol=0.05)
encoder.fit_transform(training_data[['service']])
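# encoder_dict_ holds the fitted mapping: the 'service' labels considered frequent are kept as-is,
# while everything else is grouped under the default 'Rare' label
encoder.encoder_dict_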
from feature_engine.encoding import StringSimilarityEncoder
encoder = StringSimilarityEncoder()
encoder.fit_transform(training_data[['protocol_type']])
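# The target-based encoders below expect a target vector train_y, which is not defined above.
# A minimal sketch, assuming a binary target derived from the 'attack' column
# ('normal' -> 0, any attack type -> 1):
train_y = (training_data['attack'] != 'normal').astype(int)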
import category_encoders as ce
encoder = ce.target_encoder.TargetEncoder()
encoder.fit_transform(training_data[['service']], train_y)
from feature_engine.encoding import MeanEncoder
encoder = MeanEncoder()
encoder.fit_transform(training_data[['service']], train_y)
import category_encoders as ce
encoder = ce.QuantileEncoder()
encoder.fit_transform(training_data[['service']], train_y)
import category_encoders as ce
encoder = ce.m_estimate.MEstimateEncoder()
encoder.fit_transform(training_data[['service']], train_y)
from feature_engine.encoding import WoEEncoder
encoder = WoEEncoder()
encoder.fit_transform(training_data[['service']], train_y)
from feature_engine.encoding import DecisionTreeEncoder
encoder = DecisionTreeEncoder(cv=3)
encoder.fit_transform(training_data[['service']], train_y)
import category_encoders as ce
encoder = ce.leave_one_out.LeaveOneOutEncoder()
encoder.fit_transform(training_data[['service']], train_y)
import category_encoders as ce
encoder = ce.cat_boost.CatBoostEncoder()
encoder.fit_transform(training_data[['service']], train_y)
import category_encoders as ce
encoder = ce.james_stein.JamesSteinEncoder()
encoder.fit_transform(training_data[['service']], train_y)
# 1. Convert the IP to a binary (32-bit) number
# Convert each octet of the IP to an 8-bit integer, then merge the four octets into a single 32-bit integer
# Maintains the relationship between IP addresses, so the system can learn that addresses that are
# numerically close to each other may belong to the same network
ip = '192.168.0.1'
parts = ip.split('.')
ip_1 = int(parts[0])<<24
ip_2 = int(parts[1])<<16
ip_3 = int(parts[2])<<8
ip_4 = int(parts[3])
ip_converted = ip_1+ip_2+ip_3+ip_4
ip_converted
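# The same 32-bit conversion can also be done with Python's built-in ipaddress module
# (an equivalent alternative, not part of the original approach)
import ipaddress
int(ipaddress.IPv4Address(ip))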
# 2. Convert the IP to an integer by removing the dots
# '192.168.0.1' becomes 192168000001 (each octet zero-padded to three digits)
# Maintains the relationship between IP addresses and helps with recognizing adjacent nodes and networks
ip = '192.168.0.1'
parts = ip.split('.')
ip_converted = '{:03d}'.format(int(parts[0]))+'{:03d}'.format(int(parts[1]))+\
'{:03d}'.format(int(parts[2]))+'{:03d}'.format(int(parts[3]))
ip_converted = int(ip_converted)
ip_converted
# 3. One-hot encoding
# Does not maintain the relationship between IP addresses
# Can be helpful for recognizing data flows and for statistical analysis of traffic originating
# from or landing at a specific node
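# A minimal sketch, assuming a hypothetical DataFrame column 'src_ip' holding IP strings
# (the training data loaded above does not contain raw IP addresses)
ips = pd.DataFrame({'src_ip': ['192.168.0.1', '10.0.0.5', '192.168.0.1', '172.16.3.7']})
pd.get_dummies(ips, columns=['src_ip'])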
# 4. One-hot encoding only the most frequent IPs
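# A sketch of one-hot encoding only the most frequent IPs, reusing the hypothetical 'src_ip'
# column from above; all remaining addresses are grouped into a single 'other' bucket
top_ips = ips['src_ip'].value_counts().nlargest(2).index
ips['src_ip_top'] = ips['src_ip'].where(ips['src_ip'].isin(top_ips), 'other')
pd.get_dummies(ips[['src_ip_top']], columns=['src_ip_top'])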