# Global knitr chunk option: echo = TRUE includes each chunk's source code in the rendered report.
knitr::opts_chunk$set(echo = TRUE)
Background
The dataset to be audited consists of a wide variety of intrusions simulated in a military network environment. An environment was created to acquire raw TCP/IP dump data for a network by simulating a typical US Air Force LAN. The LAN was operated like a real environment and blasted with multiple attacks. A connection is a sequence of TCP packets starting and ending at well-defined times, during which data flows between a source IP address and a target IP address under some well-defined protocol. Each connection is labeled either as normal or as an attack, with exactly one specific attack type. Each connection record consists of about 100 bytes.
For each TCP/IP connection, 41 quantitative and qualitative features are obtained from normal and attack data (3 qualitative and 38 quantitative features).
The class variable has two categories:
Normal
Anomalous
The following three methods are useful when binary logistic regression and ANNs fail to classify a large data set: both of those methods use the sigmoid function, which here generates probabilities of exactly 0 or 1 rather than values between 0 and 1.
# Load the intrusion-detection data set. The path is machine-specific; adjust
# it when running on another machine.
# stringsAsFactors = TRUE keeps the text columns (protocol_type, service, flag,
# class) as factors on R >= 4.0.0, where read.csv() would otherwise return
# character vectors; the classifiers and confusionMatrix() used later need
# `class` to be a factor.
IDS_CaseStudy <- read.csv(
  "D:/PGDA Files/R/Case Studies/IDS Case Study/Intrusion Detection System.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

# Total number of missing values across all columns.
sum(is.na(IDS_CaseStudy))
## [1] 0
# Preview the first rows of the raw data.
head(IDS_CaseStudy)
## duration protocol_type service flag src_bytes dst_bytes land
## 1 0 tcp ftp_data SF 491 0 0
## 2 0 udp other SF 146 0 0
## 3 0 tcp private S0 0 0 0
## 4 0 tcp http SF 232 8153 0
## 5 0 tcp http SF 199 420 0
## 6 0 tcp private REJ 0 0 0
## wrong_fragment urgent hot num_failed_logins logged_in num_compromised
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 1 0
## 5 0 0 0 0 1 0
## 6 0 0 0 0 0 0
## root_shell su_attempted num_root num_file_creations num_shells
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## num_access_files num_outbound_cmds is_host_login is_guest_login count
## 1 0 0 0 0 2
## 2 0 0 0 0 13
## 3 0 0 0 0 123
## 4 0 0 0 0 5
## 5 0 0 0 0 30
## 6 0 0 0 0 121
## srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate
## 1 2 0.0 0.0 0 0
## 2 1 0.0 0.0 0 0
## 3 6 1.0 1.0 0 0
## 4 5 0.2 0.2 0 0
## 5 32 0.0 0.0 0 0
## 6 19 0.0 0.0 1 1
## same_srv_rate diff_srv_rate srv_diff_host_rate dst_host_count
## 1 1.00 0.00 0.00 150
## 2 0.08 0.15 0.00 255
## 3 0.05 0.07 0.00 255
## 4 1.00 0.00 0.00 30
## 5 1.00 0.00 0.09 255
## 6 0.16 0.06 0.00 255
## dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate
## 1 25 0.17 0.03
## 2 1 0.00 0.60
## 3 26 0.10 0.05
## 4 255 1.00 0.00
## 5 255 1.00 0.00
## 6 19 0.07 0.07
## dst_host_same_src_port_rate dst_host_srv_diff_host_rate
## 1 0.17 0.00
## 2 0.88 0.00
## 3 0.00 0.00
## 4 0.03 0.04
## 5 0.00 0.00
## 6 0.00 0.00
## dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate
## 1 0.00 0.00 0.05
## 2 0.00 0.00 0.00
## 3 1.00 1.00 0.00
## 4 0.03 0.01 0.00
## 5 0.00 0.00 0.00
## 6 0.00 0.00 1.00
## dst_host_srv_rerror_rate class
## 1 0.00 normal
## 2 0.00 normal
## 3 0.00 anomaly
## 4 0.01 normal
## 5 0.00 normal
## 6 1.00 anomaly
# Show each column's type together with its first few values.
str(IDS_CaseStudy)
## 'data.frame': 25192 obs. of 42 variables:
## $ duration : int 0 0 0 0 0 0 0 0 0 0 ...
## $ protocol_type : Factor w/ 3 levels "icmp","tcp","udp": 2 3 2 2 2 2 2 2 2 2 ...
## $ service : Factor w/ 66 levels "auth","bgp","courier",..: 17 40 45 20 20 45 45 45 47 45 ...
## $ flag : Factor w/ 11 levels "OTH","REJ","RSTO",..: 10 10 6 10 10 2 6 6 6 6 ...
## $ src_bytes : int 491 146 0 232 199 0 0 0 0 0 ...
## $ dst_bytes : int 0 0 0 8153 420 0 0 0 0 0 ...
## $ land : int 0 0 0 0 0 0 0 0 0 0 ...
## $ wrong_fragment : int 0 0 0 0 0 0 0 0 0 0 ...
## $ urgent : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hot : int 0 0 0 0 0 0 0 0 0 0 ...
## $ num_failed_logins : int 0 0 0 0 0 0 0 0 0 0 ...
## $ logged_in : int 0 0 0 1 1 0 0 0 0 0 ...
## $ num_compromised : int 0 0 0 0 0 0 0 0 0 0 ...
## $ root_shell : int 0 0 0 0 0 0 0 0 0 0 ...
## $ su_attempted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ num_root : int 0 0 0 0 0 0 0 0 0 0 ...
## $ num_file_creations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ num_shells : int 0 0 0 0 0 0 0 0 0 0 ...
## $ num_access_files : int 0 0 0 0 0 0 0 0 0 0 ...
## $ num_outbound_cmds : int 0 0 0 0 0 0 0 0 0 0 ...
## $ is_host_login : int 0 0 0 0 0 0 0 0 0 0 ...
## $ is_guest_login : int 0 0 0 0 0 0 0 0 0 0 ...
## $ count : int 2 13 123 5 30 121 166 117 270 133 ...
## $ srv_count : int 2 1 6 5 32 19 9 16 23 8 ...
## $ serror_rate : num 0 0 1 0.2 0 0 1 1 1 1 ...
## $ srv_serror_rate : num 0 0 1 0.2 0 0 1 1 1 1 ...
## $ rerror_rate : num 0 0 0 0 0 1 0 0 0 0 ...
## $ srv_rerror_rate : num 0 0 0 0 0 1 0 0 0 0 ...
## $ same_srv_rate : num 1 0.08 0.05 1 1 0.16 0.05 0.14 0.09 0.06 ...
## $ diff_srv_rate : num 0 0.15 0.07 0 0 0.06 0.06 0.06 0.05 0.06 ...
## $ srv_diff_host_rate : num 0 0 0 0 0.09 0 0 0 0 0 ...
## $ dst_host_count : int 150 255 255 30 255 255 255 255 255 255 ...
## $ dst_host_srv_count : int 25 1 26 255 255 19 9 15 23 13 ...
## $ dst_host_same_srv_rate : num 0.17 0 0.1 1 1 0.07 0.04 0.06 0.09 0.05 ...
## $ dst_host_diff_srv_rate : num 0.03 0.6 0.05 0 0 0.07 0.05 0.07 0.05 0.06 ...
## $ dst_host_same_src_port_rate: num 0.17 0.88 0 0.03 0 0 0 0 0 0 ...
## $ dst_host_srv_diff_host_rate: num 0 0 0 0.04 0 0 0 0 0 0 ...
## $ dst_host_serror_rate : num 0 0 1 0.03 0 0 1 1 1 1 ...
## $ dst_host_srv_serror_rate : num 0 0 1 0.01 0 0 1 1 1 1 ...
## $ dst_host_rerror_rate : num 0.05 0 0 0 0 1 0 0 0 0 ...
## $ dst_host_srv_rerror_rate : num 0 0 0 0.01 0 1 0 0 0 0 ...
## $ class : Factor w/ 2 levels "anomaly","normal": 2 2 1 2 2 1 1 1 1 1 ...
# Per-column summary: quantiles and mean for numeric columns, level counts for factors.
summary(IDS_CaseStudy)
## duration protocol_type service flag
## Min. : 0.0 icmp: 1655 http :8003 SF :14973
## 1st Qu.: 0.0 tcp :20526 private :4351 S0 : 7009
## Median : 0.0 udp : 3011 domain_u:1820 REJ : 2216
## Mean : 305.1 smtp :1449 RSTR : 497
## 3rd Qu.: 0.0 ftp_data:1396 RSTO : 304
## Max. :42862.0 eco_i : 909 S1 : 88
## (Other) :7264 (Other): 105
## src_bytes dst_bytes land
## Min. : 0 Min. : 0 Min. :0.00e+00
## 1st Qu.: 0 1st Qu.: 0 1st Qu.:0.00e+00
## Median : 44 Median : 0 Median :0.00e+00
## Mean : 24331 Mean : 3492 Mean :7.94e-05
## 3rd Qu.: 279 3rd Qu.: 530 3rd Qu.:0.00e+00
## Max. :381709090 Max. :5151385 Max. :1.00e+00
##
## wrong_fragment urgent hot num_failed_logins
## Min. :0.00000 Min. :0.00e+00 Min. : 0.000 Min. :0.000000
## 1st Qu.:0.00000 1st Qu.:0.00e+00 1st Qu.: 0.000 1st Qu.:0.000000
## Median :0.00000 Median :0.00e+00 Median : 0.000 Median :0.000000
## Mean :0.02374 Mean :3.97e-05 Mean : 0.198 Mean :0.001191
## 3rd Qu.:0.00000 3rd Qu.:0.00e+00 3rd Qu.: 0.000 3rd Qu.:0.000000
## Max. :3.00000 Max. :1.00e+00 Max. :77.000 Max. :4.000000
##
## logged_in num_compromised root_shell su_attempted
## Min. :0.0000 Min. : 0.0000 Min. :0.000000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.000000 1st Qu.:0.00000
## Median :0.0000 Median : 0.0000 Median :0.000000 Median :0.00000
## Mean :0.3948 Mean : 0.2279 Mean :0.001548 Mean :0.00135
## 3rd Qu.:1.0000 3rd Qu.: 0.0000 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :1.0000 Max. :884.0000 Max. :1.000000 Max. :2.00000
##
## num_root num_file_creations num_shells
## Min. : 0.0000 Min. : 0.00000 Min. :0.0000000
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.:0.0000000
## Median : 0.0000 Median : 0.00000 Median :0.0000000
## Mean : 0.2498 Mean : 0.01473 Mean :0.0003573
## 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.:0.0000000
## Max. :975.0000 Max. :40.00000 Max. :1.0000000
##
## num_access_files num_outbound_cmds is_host_login is_guest_login
## Min. :0.000000 Min. :0 Min. :0 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0 1st Qu.:0 1st Qu.:0.00000
## Median :0.000000 Median :0 Median :0 Median :0.00000
## Mean :0.004327 Mean :0 Mean :0 Mean :0.00913
## 3rd Qu.:0.000000 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0.00000
## Max. :8.000000 Max. :0 Max. :0 Max. :1.00000
##
## count srv_count serror_rate srv_serror_rate
## Min. : 1.00 Min. : 1.0 Min. :0.0000 Min. :0.0000
## 1st Qu.: 2.00 1st Qu.: 2.0 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 14.00 Median : 8.0 Median :0.0000 Median :0.0000
## Mean : 84.59 Mean : 27.7 Mean :0.2863 Mean :0.2838
## 3rd Qu.:144.00 3rd Qu.: 18.0 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :511.00 Max. :511.0 Max. :1.0000 Max. :1.0000
##
## rerror_rate srv_rerror_rate same_srv_rate diff_srv_rate
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0900 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :1.0000 Median :0.00000
## Mean :0.1186 Mean :0.1203 Mean :0.6606 Mean :0.06236
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.06000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## srv_diff_host_rate dst_host_count dst_host_srv_count
## Min. :0.00000 Min. : 0.0 Min. : 0.0
## 1st Qu.:0.00000 1st Qu.: 84.0 1st Qu.: 10.0
## Median :0.00000 Median :255.0 Median : 61.0
## Mean :0.09593 Mean :182.5 Mean :115.1
## 3rd Qu.:0.00000 3rd Qu.:255.0 3rd Qu.:255.0
## Max. :1.00000 Max. :255.0 Max. :255.0
##
## dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0500 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.5100 Median :0.03000 Median :0.0000
## Mean :0.5198 Mean :0.08254 Mean :0.1475
## 3rd Qu.:1.0000 3rd Qu.:0.07000 3rd Qu.:0.0600
## Max. :1.0000 Max. :1.00000 Max. :1.0000
##
## dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate
## Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.03184 Mean :0.2858 Mean :0.2798
## 3rd Qu.:0.02000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## dst_host_rerror_rate dst_host_srv_rerror_rate class
## Min. :0.0000 Min. :0.0000 anomaly:11743
## 1st Qu.:0.0000 1st Qu.:0.0000 normal :13449
## Median :0.0000 Median :0.0000
## Mean :0.1178 Mean :0.1188
## 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000
##
# Recode the categorical and binary-indicator columns as factors, and remove
# is_host_login and num_outbound_cmds, which are 0 for every record in the
# summary above and therefore carry no information.
IDS_CaseStudy <- within(IDS_CaseStudy, {
  protocol_type <- factor(protocol_type)
  service <- factor(service)
  flag <- factor(flag)
  land <- factor(land)
  logged_in <- factor(logged_in)
  rm(is_host_login, num_outbound_cmds)
})

# Confirm the remaining dimensions (rows, columns).
dim(IDS_CaseStudy)
## [1] 25192 40
#' Min-max rescale values to the interval [0, 1].
#'
#' For a numeric vector the result is (x - min) / (max - min), with missing
#' values ignored when computing the range and kept as NA in the result.
#' For a data frame every numeric column is rescaled independently with its
#' own minimum and maximum; non-numeric columns are returned unchanged.
#' (Taking min()/max() of a whole data frame yields a single global range, so
#' one large-valued column would flatten all the others to ~0.)
#'
#' @param x A numeric vector, or a data frame containing numeric columns.
#' @return An object of the same shape as `x` with numeric values in [0, 1].
#'   A constant vector maps to all zeros instead of NaN.
normalize <- function(x) {
  if (is.data.frame(x)) {
    numeric_cols <- vapply(x, is.numeric, logical(1))
    x[numeric_cols] <- lapply(x[numeric_cols], normalize)
    return(x)
  }
  # Empty or all-missing input has no range to scale by.
  if (all(is.na(x))) {
    return(as.numeric(x))
  }
  bounds <- range(x, na.rm = TRUE)
  span <- bounds[2L] - bounds[1L]
  # Constant input: divide by 1 so the result is 0 rather than 0 / 0 = NaN.
  if (span == 0) {
    span <- 1
  }
  (x - bounds[1L]) / span
}
# Rescale every numeric predictor to [0, 1] on its own range. normalize() is
# applied column by column: calling it on the whole data frame would use one
# global min/max, letting the largest column (src_bytes) squash all the other
# features to ~0.
numeric_cols <- vapply(IDS_CaseStudy, is.numeric, logical(1))
Normalized_data <- as.data.frame(lapply(IDS_CaseStudy[numeric_cols], normalize))
Normalized_data$class <- IDS_CaseStudy$class

# 75% / 25% train-test split; the seed makes the split reproducible.
set.seed(123)
Case_study_Index <- sample(
  nrow(Normalized_data),
  floor(0.75 * nrow(Normalized_data))
)
train <- Normalized_data[Case_study_Index, ]
test <- Normalized_data[-Case_study_Index, ]
It will generate probabilities equal to 0 or 1 and will be unable to assign weights for obtaining the activation function.
Similar results will be generated for logistic regression as well, since it will be unable to calculate probability values within the range of 0 to 1: one of the significant variables makes the model assign a probability equal to either 0 or 1.
Thus, we use the following three models: random forest, naive Bayes, and a support vector machine (SVM).
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# Fit a small random forest (10 trees) on the training split.
RF_model_IDS <- randomForest(class ~ ., data = train, ntree = 10)
# Plot the error rates against the number of trees grown.
plot(RF_model_IDS)

# Predictions on the same rows the model was trained on (resubstitution), so
# this accuracy is optimistic.
pred_train <- predict(RF_model_IDS, train)
# confusionMatrix(data, reference): predictions first, true labels second.
# With the arguments the other way round the table is transposed and
# sensitivity/specificity are swapped with the predictive values.
# NOTE: the recorded output below was produced with the old (swapped) order.
confusionMatrix(pred_train, train$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction anomaly normal
## anomaly 8794 3
## normal 11 10086
##
## Accuracy : 0.9993
## 95% CI : (0.9988, 0.9996)
## No Information Rate : 0.534
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9985
##
## Mcnemar's Test P-Value : 0.06137
##
## Sensitivity : 0.9988
## Specificity : 0.9997
## Pos Pred Value : 0.9997
## Neg Pred Value : 0.9989
## Prevalence : 0.4660
## Detection Rate : 0.4654
## Detection Prevalence : 0.4656
## Balanced Accuracy : 0.9992
##
## 'Positive' Class : anomaly
##
# Evaluate the random forest on the held-out test split.
pred_test <- predict(RF_model_IDS, test)
# confusionMatrix(data, reference): predictions first, true labels second.
# NOTE: the recorded output below was produced with the old (swapped) order.
confusionMatrix(pred_test, test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction anomaly normal
## anomaly 2926 20
## normal 12 3340
##
## Accuracy : 0.9949
## 95% CI : (0.9928, 0.9965)
## No Information Rate : 0.5335
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9898
##
## Mcnemar's Test P-Value : 0.2159
##
## Sensitivity : 0.9959
## Specificity : 0.9940
## Pos Pred Value : 0.9932
## Neg Pred Value : 0.9964
## Prevalence : 0.4665
## Detection Rate : 0.4646
## Detection Prevalence : 0.4678
## Balanced Accuracy : 0.9950
##
## 'Positive' Class : anomaly
##
# Naive Bayes
# Fit the classifier on the training split and predict both splits.
naive_model_IDS <- naiveBayes(class ~ ., data = train)
predi_train_naive <- predict(naive_model_IDS, train)
predi_test_naive <- predict(naive_model_IDS, test)
# confusionMatrix(data, reference): predictions first, true labels second.
# NOTE: the recorded output below was produced with the old (swapped) order.
confusionMatrix(predi_train_naive, train$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction anomaly normal
## anomaly 8338 459
## normal 3106 6991
##
## Accuracy : 0.8113
## 95% CI : (0.8057, 0.8169)
## No Information Rate : 0.6057
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.628
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7286
## Specificity : 0.9384
## Pos Pred Value : 0.9478
## Neg Pred Value : 0.6924
## Prevalence : 0.6057
## Detection Rate : 0.4413
## Detection Prevalence : 0.4656
## Balanced Accuracy : 0.8335
##
## 'Positive' Class : anomaly
##
# Naive Bayes on the held-out test split.
# confusionMatrix(data, reference): predictions first, true labels second.
# NOTE: the recorded output below was produced with the old (swapped) order.
confusionMatrix(predi_test_naive, test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction anomaly normal
## anomaly 2824 122
## normal 1046 2306
##
## Accuracy : 0.8145
## 95% CI : (0.8047, 0.8241)
## No Information Rate : 0.6145
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6345
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7297
## Specificity : 0.9498
## Pos Pred Value : 0.9586
## Neg Pred Value : 0.6879
## Prevalence : 0.6145
## Detection Rate : 0.4484
## Detection Prevalence : 0.4678
## Balanced Accuracy : 0.8397
##
## 'Positive' Class : anomaly
##
# SVM
# Linear-kernel SVM. scale = FALSE turns off svm()'s internal scaling, so the
# model depends entirely on the features having been rescaled beforehand.
# NOTE(review): the recorded output below, where every record is predicted
# "normal", is presumably a symptom of the features having been squashed to ~0
# by normalisation over a single global range -- confirm after re-running.
svm_casestudy <- svm(class ~ ., data = train, kernel = "linear", scale = FALSE)
predict_svm_casestudy <- predict(svm_casestudy, train)
# confusionMatrix(data, reference): predictions first, true labels second.
# NOTE: the recorded output below was produced with the old (swapped) order.
confusionMatrix(predict_svm_casestudy, train$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction anomaly normal
## anomaly 0 8797
## normal 0 10097
##
## Accuracy : 0.5344
## 95% CI : (0.5273, 0.5415)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : NA
## Specificity : 0.5344
## Pos Pred Value : NA
## Neg Pred Value : NA
## Prevalence : 0.0000
## Detection Rate : 0.0000
## Detection Prevalence : 0.4656
## Balanced Accuracy : NA
##
## 'Positive' Class : anomaly
##
# SVM on the held-out test split. Despite the "_train" suffix, this variable
# holds predictions for `test`; the name is kept so existing references still work.
predict_svm_casestudy_train <- predict(svm_casestudy, test)
# confusionMatrix(data, reference): predictions first, true labels second.
# NOTE: the recorded output below was produced with the old (swapped) order.
confusionMatrix(predict_svm_casestudy_train, test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction anomaly normal
## anomaly 0 2946
## normal 0 3352
##
## Accuracy : 0.5322
## 95% CI : (0.5198, 0.5446)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : NA
## Specificity : 0.5322
## Pos Pred Value : NA
## Neg Pred Value : NA
## Prevalence : 0.0000
## Detection Rate : 0.0000
## Detection Prevalence : 0.4678
## Balanced Accuracy : NA
##
## 'Positive' Class : anomaly
##
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.