library(FSelector)
## Warning: package 'FSelector' was built under R version 4.1.2
# Read the two UNSW-NB15 CSV partitions.
# Note that the object names are the reverse of the file names: the file named
# "..._testing-set.csv" becomes Train_Data and "..._training-set.csv" becomes
# Test_Data.
# NOTE(review): presumably intentional (the summary below reports 175341 rows
# for Train_Data) -- confirm.
Train_Data <- read.csv("C:/SalmanDocuments/ValparaisoFallSemester2021/MachineLearning/Project/UNSW_NB15_testing-set.csv")
Test_Data <- read.csv("C:/SalmanDocuments/ValparaisoFallSemester2021/MachineLearning/Project/UNSW_NB15_training-set.csv")
# Per-column summary of the data used for training.
summary(Train_Data)
## ï..ID dur proto service
## Min. : 1 Min. : 0.00000 Length:175341 Length:175341
## 1st Qu.: 43836 1st Qu.: 0.00001 Class :character Class :character
## Median : 87671 Median : 0.00158 Mode :character Mode :character
## Mean : 87671 Mean : 1.35939
## 3rd Qu.:131506 3rd Qu.: 0.66807
## Max. :175341 Max. :59.99999
## state spkts dpkts sbytes
## Length:175341 Min. : 1.0 Min. : 0.00 Min. : 28
## Class :character 1st Qu.: 2.0 1st Qu.: 0.00 1st Qu.: 114
## Mode :character Median : 2.0 Median : 2.00 Median : 430
## Mean : 20.3 Mean : 18.97 Mean : 8845
## 3rd Qu.: 12.0 3rd Qu.: 10.00 3rd Qu.: 1418
## Max. :9616.0 Max. :10974.00 Max. :12965233
## dbytes rate sttl dttl
## Min. : 0 Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.: 0 1st Qu.: 32.8 1st Qu.: 62.0 1st Qu.: 0.00
## Median : 164 Median : 3225.8 Median :254.0 Median : 29.00
## Mean : 14929 Mean : 95406.2 Mean :179.5 Mean : 79.61
## 3rd Qu.: 1102 3rd Qu.: 125000.0 3rd Qu.:254.0 3rd Qu.:252.00
## Max. :14655550 Max. :1000000.0 Max. :255.0 Max. :254.00
## sload dload sloss dloss
## Min. :0.000e+00 Min. : 0 Min. : 0.000 Min. : 0.000
## 1st Qu.:1.305e+04 1st Qu.: 0 1st Qu.: 0.000 1st Qu.: 0.000
## Median :8.797e+05 Median : 1447 Median : 0.000 Median : 0.000
## Mean :7.345e+07 Mean : 671206 Mean : 4.953 Mean : 6.948
## 3rd Qu.:8.889e+07 3rd Qu.: 27845 3rd Qu.: 3.000 3rd Qu.: 2.000
## Max. :5.988e+09 Max. :22422730 Max. :4803.000 Max. :5484.000
## sinpkt dinpkt sjit djit
## Min. : 0.00 Min. : 0.00 Min. : 0 Min. : 0.0
## 1st Qu.: 0.01 1st Qu.: 0.00 1st Qu.: 0 1st Qu.: 0.0
## Median : 0.28 Median : 0.01 Median : 0 Median : 0.0
## Mean : 985.98 Mean : 88.22 Mean : 4976 Mean : 604.4
## 3rd Qu.: 55.16 3rd Qu.: 51.05 3rd Qu.: 2513 3rd Qu.: 115.0
## Max. :84371.50 Max. :56716.82 Max. :1460480 Max. :289388.3
## swin stcpb dtcpb dwin
## Min. : 0.0 Min. :0.000e+00 Min. :0.000e+00 Min. : 0
## 1st Qu.: 0.0 1st Qu.:0.000e+00 1st Qu.:0.000e+00 1st Qu.: 0
## Median : 0.0 Median :0.000e+00 Median :0.000e+00 Median : 0
## Mean :116.3 Mean :9.693e+08 Mean :9.689e+08 Mean :115
## 3rd Qu.:255.0 3rd Qu.:1.917e+09 3rd Qu.:1.914e+09 3rd Qu.:255
## Max. :255.0 Max. :4.295e+09 Max. :4.295e+09 Max. :255
## tcprtt synack ackdat smean
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. : 28.0
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.: 57.0
## Median :0.00000 Median :0.00000 Median :0.00000 Median : 73.0
## Mean :0.04140 Mean :0.02102 Mean :0.02038 Mean : 136.8
## 3rd Qu.:0.06548 3rd Qu.:0.02327 3rd Qu.:0.03891 3rd Qu.: 100.0
## Max. :2.51889 Max. :2.10035 Max. :1.52088 Max. :1504.0
## dmean trans_depth response_body_len ct_srv_src
## Min. : 0.0 Min. : 0.000 Min. : 0 Min. : 1.000
## 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.: 0 1st Qu.: 2.000
## Median : 44.0 Median : 0.000 Median : 0 Median : 5.000
## Mean : 124.2 Mean : 0.106 Mean : 2144 Mean : 9.306
## 3rd Qu.: 89.0 3rd Qu.: 0.000 3rd Qu.: 0 3rd Qu.:12.000
## Max. :1458.0 Max. :172.000 Max. :6558056 Max. :63.000
## ct_state_ttl ct_dst_ltm ct_src_dport_ltm ct_dst_sport_ltm
## Min. :0.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.:1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median :1.000 Median : 2.000 Median : 1.000 Median : 1.000
## Mean :1.304 Mean : 6.194 Mean : 5.384 Mean : 4.206
## 3rd Qu.:2.000 3rd Qu.: 7.000 3rd Qu.: 5.000 3rd Qu.: 3.000
## Max. :6.000 Max. :51.000 Max. :51.000 Max. :46.000
## ct_dst_src_ltm is_ftp_login ct_ftp_cmd ct_flw_http_mthd
## Min. : 1.00 Min. :0.00000 Min. :0.00000 Min. : 0.0000
## 1st Qu.: 1.00 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median : 3.00 Median :0.00000 Median :0.00000 Median : 0.0000
## Mean : 8.73 Mean :0.01495 Mean :0.01495 Mean : 0.1331
## 3rd Qu.:12.00 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.: 0.0000
## Max. :65.00 Max. :4.00000 Max. :4.00000 Max. :30.0000
## ct_src_ltm ct_srv_dst is_sm_ips_ports attack_cat
## Min. : 1.000 Min. : 1.000 Min. :0.00000 Length:175341
## 1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.:0.00000 Class :character
## Median : 3.000 Median : 4.000 Median :0.00000 Mode :character
## Mean : 6.956 Mean : 9.101 Mean :0.01575
## 3rd Qu.: 9.000 3rd Qu.:12.000 3rd Qu.:0.00000
## Max. :60.000 Max. :62.000 Max. :1.00000
## label
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.6806
## 3rd Qu.:1.0000
## Max. :1.0000
# Class balance of `label` in the training data (counts kept in `cm`,
# proportions printed).
cm <- table(Train_Data$label)
prop.table(cm)
##
## 0 1
## 0.3193777 0.6806223
# Same for the test data; `cm` is overwritten with the test counts.
cm <- table(Test_Data$label)
prop.table(cm)
##
## 0 1
## 0.4494 0.5506
# Working copies of the raw data; the originals are left untouched.
df_norm_test <- Test_Data
df_norm_train <- Train_Data

# Encode the categorical predictors and the binary label as factors.
factor_cols <- c("proto", "service", "state", "label",
                 "is_sm_ips_ports", "is_ftp_login")
df_norm_train[factor_cols] <- lapply(df_norm_train[factor_cols], factor)

# Drop the attack category and the row-ID column. The ID header is read with
# a byte-order-mark prefix (e.g. "ï..ID"), whose exact spelling depends on the
# file and the platform encoding, so it is matched by pattern rather than by a
# hard-coded name.
is_id_col <- grepl("(^|\\.)id$", names(df_norm_train),
                   ignore.case = TRUE, useBytes = TRUE)
drop <- c("attack_cat", names(df_norm_train)[is_id_col])
df_norm_train <- df_norm_train[!(names(df_norm_train) %in% drop)]

# Remove rows with proto 'icmp' or 'rtp' and rows with is_ftp_login == 4,
# then discard the factor levels that are no longer used.
keep_rows <- !(df_norm_train$proto %in% c("icmp", "rtp")) &
  !(df_norm_train$is_ftp_login %in% "4")
df_norm_train <- df_norm_train[keep_rows, ]
df_norm_train$proto <- droplevels(df_norm_train$proto)
df_norm_train$is_ftp_login <- droplevels(df_norm_train$is_ftp_login)

# Restrict `state` to four levels; any other state becomes NA and the
# corresponding rows are removed by na.omit() below.
df_norm_train$state <- factor(df_norm_train$state,
                              levels = c("CON", "FIN", "INT", "REQ"))
df_norm_train <- na.omit(df_norm_train)
#' Rescale a numeric vector to the range [0, 1] (min-max normalisation).
#'
#' @param x A numeric vector.
#' @param na.rm If TRUE, missing values are ignored when computing the range
#'   and stay NA in the result. The default FALSE keeps the original
#'   behaviour: any NA in `x` makes the whole result NA.
#' @return A double vector the same length as `x`. A constant vector maps to
#'   all zeros (instead of NaN from 0/0); an empty vector returns `numeric(0)`
#'   without warnings.
min_max_norm <- function(x, na.rm = FALSE) {
  observed <- if (na.rm) x[!is.na(x)] else x
  if (length(observed) == 0L) {
    # Nothing to compute a range from: empty input, or all NA with na.rm.
    return(as.numeric(x))
  }
  lo <- min(observed)
  span <- max(observed) - lo
  if (!is.na(span) && span == 0) {
    # Constant column: avoid 0/0; multiplying by 0 keeps any NA in place.
    return(0 * (x - lo))
  }
  (x - lo) / span
}
# Normalisation of the training data: every numeric (integer or double)
# column is rescaled with min_max_norm(); factor columns are left unchanged.
for (colName in names(df_norm_train)) {
  # is.numeric() covers both integer and double columns and always yields a
  # single TRUE/FALSE, unlike comparing class() with `==`.
  if (is.numeric(df_norm_train[[colName]])) {
    df_norm_train[[colName]] <- min_max_norm(df_norm_train[[colName]])
  }
}
# Encode the categorical predictors and the binary label of the test data as
# factors.
factor_cols <- c("proto", "service", "state", "label",
                 "is_sm_ips_ports", "is_ftp_login")
df_norm_test[factor_cols] <- lapply(df_norm_test[factor_cols], factor)

# Drop the attack category and the row-ID column. The ID header is read with
# a byte-order-mark prefix (e.g. "ï..id"), whose exact spelling depends on the
# file and the platform encoding, so it is matched by pattern rather than by a
# hard-coded name.
is_id_col <- grepl("(^|\\.)id$", names(df_norm_test),
                   ignore.case = TRUE, useBytes = TRUE)
drop <- c("attack_cat", names(df_norm_test)[is_id_col])
df_norm_test <- df_norm_test[!(names(df_norm_test) %in% drop)]

# Restrict `state` to four levels; any other state becomes NA and the
# corresponding rows are removed by na.omit().
df_norm_test$state <- factor(df_norm_test$state,
                             levels = c("CON", "FIN", "INT", "REQ"))
df_norm_test <- na.omit(df_norm_test)
# Normalisation of the test data: every numeric (integer or double) column is
# rescaled with min_max_norm(); factor columns are left unchanged.
# NOTE(review): each test column is scaled with its own min/max rather than
# with the training ranges, so train and test values are not on a common
# scale -- confirm this is intended.
for (colName in names(df_norm_test)) {
  # is.numeric() covers both integer and double columns and always yields a
  # single TRUE/FALSE, unlike comparing class() with `==`.
  if (is.numeric(df_norm_test[[colName]])) {
    df_norm_test[[colName]] <- min_max_norm(df_norm_test[[colName]])
  }
}
The first model is a logistic regression with the predictors dpkts, sbytes, dbytes, rate, synack, dur (entered as sqrt(dur)), state, and service. These predictors are taken directly from the project description.
To avoid repeating the normalization on every run, the pre-processed training and test data were saved to an .Rdata file, which is loaded below.
# Load the saved workspace objects; the models below read df_norm_train and
# df_norm_test after this call.
# NOTE(review): presumably the file holds the pre-processed versions of both
# data frames -- confirm.
load(file = "C:\\SalmanDocuments\\ValparaisoFallSemester2021\\MachineLearning\\Project\\data.Rdata")

# Logistic regression on the predictors given in the project description;
# `dur` enters the model through the formula term sqrt(dur).
glm.fit <- glm(
  label ~ dpkts + sbytes + dbytes + rate + synack + sqrt(dur) + state + service,
  data = df_norm_train, family = binomial
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
test_subset <- subset(df_norm_test, select = c(
  dpkts, sbytes, dbytes, rate, synack, dur, state, service))
# `dur` is passed untransformed: predict() re-evaluates the formula term
# sqrt(dur) on the new data, so taking the square root here as well would
# apply it twice to the test set.
glm.probs <- predict(glm.fit, test_subset, type = "response", na.action = na.pass)

# Classify as attack (1) when the predicted probability exceeds 0.5.
glm.pred <- rep(0, nrow(test_subset))
glm.pred[glm.probs > 0.5] <- 1

# Confusion matrix: rows = predicted class, columns = actual label.
cm <- table(glm.pred, df_norm_test$label)
print(cm)
# NOTE(review): the matrix recorded below was produced while `dur` was
# square-rooted twice for the test set; re-run to refresh it.
##
## glm.pred 0 1
## 0 16738 406
## 1 20259 44923
# Evaluation metrics (in percent) from the confusion matrix `cm`, whose rows
# are the predicted class and whose columns are the actual label
# (cm was built as table(glm.pred, df_norm_test$label)).
tn <- cm[1, 1]  # predicted 0, actual 0
fn <- cm[1, 2]  # predicted 0, actual 1
fp <- cm[2, 1]  # predicted 1, actual 0
tp <- cm[2, 2]  # predicted 1, actual 1

correct <- (tn + tp) / sum(cm) * 100
print(paste0("Accuracy: ", correct))
## [1] "Accuracy: 74.8985739620533"
# Precision = TP / (TP + FP): share of predicted attacks that are real.
precision <- tp / (tp + fp) * 100
print(paste0("Precision: ", precision))
## [1] "Precision: 68.9193335583443"
# Recall = TP / (TP + FN): share of real attacks that are detected.
Recall <- tp / (fn + tp) * 100
print(paste0("Recall: ", Recall))
## [1] "Recall: 99.104326148823"
F1_Score <- (2 * precision * Recall) / (precision + Recall)
print(paste0("F1 Score: ", F1_Score))
## [1] "F1 Score: 81.300504022224"
# False alarm rate = mean of the false-positive rate FP / (FP + TN) and the
# false-negative rate FN / (FN + TP); both are taken over the actual classes
# (columns of cm).
fpr_fnr <- fp / (fp + tn) + fn / (fn + tp)
False_Alarm_Rate <- 100 * fpr_fnr / 2
print(paste0("False Alarm Rate: ", False_Alarm_Rate))
# For the confusion matrix recorded above this is about 27.83; the previously
# recorded 16.7244209483127 averaged row-wise (per predicted class) rates.
The predictors are chosen by feature subset selection with entropy-based filter methods, which help find the best set of attributes from a large pool of features. The two entropy-based algorithms considered are Information Gain and Gain Ratio.
Recursive feature elimination with a random forest was also tried. However, because the training set is large, it took too long to run, so it is not used in this report. Reference: https://towardsdatascience.com/effective-feature-selection-recursive-feature-elimination-using-r-148ff998e4f7
# Copy of the training data used for feature selection, with every numeric
# column multiplied by 1000.
# NOTE(review): the reason for the factor of 1000 is not stated here -- confirm.
train <- df_norm_train
# Logical mask of numeric columns (not needed if the data is entirely numeric).
numcols <- vapply(train, is.numeric, logical(1))
train[numcols] <- train[numcols] * 1000
# Rank all predictors of `label` by information gain (base-2 logarithm), keep
# the five highest-ranked attributes and print them as a model formula.
weights.ig <- information.gain(formula = label ~ ., data = train, unit = "log2")
subset.ig <- cutoff.k(attrs = weights.ig, k = 5)
results.ig <- as.simple.formula(attributes = subset.ig, class = "label")
print(results.ig)
## label ~ sbytes + sttl + dbytes + dttl + ct_state_ttl
## <environment: 0x0000000061a14318>
# Rank all predictors of `label` by gain ratio, keep the five highest-ranked
# attributes and print them as a model formula.
# unit = "log2" is set explicitly; the default is the natural logarithm.
weights.gr <- gain.ratio(formula = label ~ ., data = train, unit = "log2")
subset.gr <- cutoff.k(attrs = weights.gr, k = 5)
# subset.gr
results.gr <- as.simple.formula(attributes = subset.gr, class = "label")
print(results.gr)
## label ~ sttl + dttl + ct_state_ttl + is_sm_ips_ports + state
## <environment: 0x0000000019b65130>
# Logistic regression on the five predictors listed in the information-gain
# formula printed above.
glm.fit <- glm(
  label ~ sbytes + sttl + dbytes + dttl + ct_state_ttl,
  data = df_norm_train,
  family = binomial
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Test-set columns used by the model.
test_subset <- df_norm_test[, c("sbytes", "sttl", "dbytes", "dttl", "ct_state_ttl")]
glm.probs <- predict(glm.fit, test_subset, type = "response", na.action = na.pass)
# Start from all zeros and flag as 1 every row whose probability exceeds 0.5.
glm.pred <- numeric(nrow(test_subset))
glm.pred[which(glm.probs > 0.5)] <- 1
# Confusion matrix: rows = predicted class, columns = actual label.
cm <- table(glm.pred, df_norm_test$label)
print(cm)
##
## glm.pred 0 1
## 0 17820 79
## 1 19177 45250
# Evaluation metrics (in percent) from the confusion matrix `cm`, whose rows
# are the predicted class and whose columns are the actual label
# (cm was built as table(glm.pred, df_norm_test$label)).
tn <- cm[1, 1]  # predicted 0, actual 0
fn <- cm[1, 2]  # predicted 0, actual 1
fp <- cm[2, 1]  # predicted 1, actual 0
tp <- cm[2, 2]  # predicted 1, actual 1

correct <- (tn + tp) / sum(cm) * 100
print(paste0("Accuracy: ", correct))
## [1] "Accuracy: 76.6100624347108"
# Precision = TP / (TP + FP): share of predicted attacks that are real.
precision <- tp / (tp + fp) * 100
print(paste0("Precision: ", precision))
## [1] "Precision: 70.234529001816"
# Recall = TP / (TP + FN): share of real attacks that are detected.
Recall <- tp / (fn + tp) * 100
print(paste0("Recall: ", Recall))
## [1] "Recall: 99.8257186348695"
F1_Score <- (2 * precision * Recall) / (precision + Recall)
print(paste0("F1 Score: ", F1_Score))
## [1] "F1 Score: 82.4556288494478"
# False alarm rate = mean of the false-positive rate FP / (FP + TN) and the
# false-negative rate FN / (FN + TP); both are taken over the actual classes
# (columns of cm).
fpr_fnr <- fp / (fp + tn) + fn / (fn + tp)
False_Alarm_Rate <- 100 * fpr_fnr / 2
print(paste0("False Alarm Rate: ", False_Alarm_Rate))
# For the confusion matrix recorded above this is about 26.00; the previously
# recorded 15.103418218797 averaged row-wise (per predicted class) rates.
# Logistic regression on the five predictors listed in the gain-ratio formula
# printed above.
glm.fit <- glm(
  label ~ sttl + dttl + ct_state_ttl + is_sm_ips_ports + state,
  data = df_norm_train,
  family = binomial
)
# Test-set columns used by the model.
test_subset <- df_norm_test[, c("sttl", "dttl", "ct_state_ttl", "is_sm_ips_ports", "state")]
glm.probs <- predict(glm.fit, test_subset, type = "response", na.action = na.pass)
# Start from all zeros and flag as 1 every row whose probability exceeds 0.5.
glm.pred <- numeric(nrow(test_subset))
glm.pred[which(glm.probs > 0.5)] <- 1
# Confusion matrix: rows = predicted class, columns = actual label.
cm <- table(glm.pred, df_norm_test$label)
print(cm)
##
## glm.pred 0 1
## 0 19654 332
## 1 17343 44997
# Evaluation metrics (in percent) from the confusion matrix `cm`, whose rows
# are the predicted class and whose columns are the actual label
# (cm was built as table(glm.pred, df_norm_test$label)).
tn <- cm[1, 1]  # predicted 0, actual 0
fn <- cm[1, 2]  # predicted 0, actual 1
fp <- cm[2, 1]  # predicted 1, actual 0
tp <- cm[2, 2]  # predicted 1, actual 1

correct <- (tn + tp) / sum(cm) * 100
print(paste0("Accuracy: ", correct))
## [1] "Accuracy: 78.5304763987076"
# Precision = TP / (TP + FP): share of predicted attacks that are real.
precision <- tp / (tp + fp) * 100
print(paste0("Precision: ", precision))
## [1] "Precision: 72.1799807507219"
# Recall = TP / (TP + FN): share of real attacks that are detected.
Recall <- tp / (fn + tp) * 100
print(paste0("Recall: ", Recall))
## [1] "Recall: 99.267577047806"
F1_Score <- (2 * precision * Recall) / (precision + Recall)
print(paste0("F1 Score: ", F1_Score))
## [1] "F1 Score: 83.5839470971217"
# False alarm rate = mean of the false-positive rate FP / (FP + TN) and the
# false-negative rate FN / (FN + TP); both are taken over the actual classes
# (columns of cm).
fpr_fnr <- fp / (fp + tn) + fn / (fn + tp)
False_Alarm_Rate <- 100 * fpr_fnr / 2
print(paste0("False Alarm Rate: ", False_Alarm_Rate))
# For the confusion matrix recorded above this is about 23.80; the previously
# recorded 14.740591031624 averaged row-wise (per predicted class) rates.
# Logistic regression on an eleven-predictor subset.
glm.fit <- glm(
  label ~ state + dttl + synack + swin + dwin + ct_state_ttl + ct_src_ltm +
    ct_srv_dst + sttl + ct_dst_sport_ltm + djit,
  data = df_norm_train,
  family = binomial
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Test-set columns used by the model.
test_subset <- df_norm_test[, c(
  "state", "dttl", "synack", "swin", "dwin", "ct_state_ttl", "ct_src_ltm",
  "ct_srv_dst", "sttl", "ct_dst_sport_ltm", "djit"
)]
glm.probs <- predict(glm.fit, test_subset, type = "response", na.action = na.pass)
# Start from all zeros and flag as 1 every row whose probability exceeds 0.5.
glm.pred <- numeric(nrow(test_subset))
glm.pred[which(glm.probs > 0.5)] <- 1
# Confusion matrix: rows = predicted class, columns = actual label.
cm <- table(glm.pred, df_norm_test$label)
print(cm)
##
## glm.pred 0 1
## 0 20793 225
## 1 16204 45104
# Evaluation metrics (in percent) from the confusion matrix `cm`, whose rows
# are the predicted class and whose columns are the actual label
# (cm was built as table(glm.pred, df_norm_test$label)).
tn <- cm[1, 1]  # predicted 0, actual 0
fn <- cm[1, 2]  # predicted 0, actual 1
fp <- cm[2, 1]  # predicted 1, actual 0
tp <- cm[2, 2]  # predicted 1, actual 1

correct <- (tn + tp) / sum(cm) * 100
print(paste0("Accuracy: ", correct))
## [1] "Accuracy: 80.0439715278284"
# Precision = TP / (TP + FP): share of predicted attacks that are real.
precision <- tp / (tp + fp) * 100
print(paste0("Precision: ", precision))
## [1] "Precision: 73.569517844327"
# Recall = TP / (TP + FN): share of real attacks that are detected.
Recall <- tp / (fn + tp) * 100
print(paste0("Recall: ", Recall))
## [1] "Recall: 99.5036290233625"
F1_Score <- (2 * precision * Recall) / (precision + Recall)
print(paste0("F1 Score: ", F1_Score))
## [1] "F1 Score: 84.5935275748568"
# False alarm rate = mean of the false-positive rate FP / (FP + TN) and the
# false-negative rate FN / (FN + TP); both are taken over the actual classes
# (columns of cm).
fpr_fnr <- fp / (fp + tn) + fn / (fn + tp)
False_Alarm_Rate <- 100 * fpr_fnr / 2
print(paste0("False Alarm Rate: ", False_Alarm_Rate))
# For the confusion matrix recorded above this is about 22.15; the previously
# recorded 13.7504965731263 averaged row-wise (per predicted class) rates.
# Logistic regression on a nineteen-predictor subset.
glm.fit <- glm(
  label ~ sttl + ct_srv_dst + sbytes + smean + proto + ct_state_ttl + sloss +
    synack + ct_dst_src_ltm + dmean + ct_srv_src + service +
    ct_dst_sport_ltm + dbytes + dloss + state + tcprtt + ct_src_dport_ltm +
    rate,
  data = df_norm_train,
  family = binomial
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Test-set columns used by the model.
test_subset <- df_norm_test[, c(
  "sttl", "ct_srv_dst", "sbytes", "smean", "proto", "ct_state_ttl", "sloss",
  "synack", "ct_dst_src_ltm", "dmean", "ct_srv_src", "service",
  "ct_dst_sport_ltm", "dbytes", "dloss", "state", "tcprtt",
  "ct_src_dport_ltm", "rate"
)]
glm.probs <- predict(glm.fit, test_subset, type = "response", na.action = na.pass)
# Start from all zeros and flag as 1 every row whose probability exceeds 0.5.
glm.pred <- numeric(nrow(test_subset))
glm.pred[which(glm.probs > 0.5)] <- 1
# Confusion matrix: rows = predicted class, columns = actual label.
cm <- table(glm.pred, df_norm_test$label)
print(cm)
##
## glm.pred 0 1
## 0 22204 850
## 1 14793 44479
# Evaluation metrics (in percent) from the confusion matrix `cm`, whose rows
# are the predicted class and whose columns are the actual label
# (cm was built as table(glm.pred, df_norm_test$label)).
tn <- cm[1, 1]  # predicted 0, actual 0
fn <- cm[1, 2]  # predicted 0, actual 1
fp <- cm[2, 1]  # predicted 1, actual 0
tp <- cm[2, 2]  # predicted 1, actual 1

correct <- (tn + tp) / sum(cm) * 100
print(paste0("Accuracy: ", correct))
## [1] "Accuracy: 80.9987124359255"
# Precision = TP / (TP + FP): share of predicted attacks that are real.
precision <- tp / (tp + fp) * 100
print(paste0("Precision: ", precision))
## [1] "Precision: 75.0421784316372"
# Recall = TP / (TP + FN): share of real attacks that are detected.
Recall <- tp / (fn + tp) * 100
print(paste0("Recall: ", Recall))
## [1] "Recall: 98.1248207549251"
F1_Score <- (2 * precision * Recall) / (precision + Recall)
print(paste0("F1 Score: ", F1_Score))
## [1] "F1 Score: 85.0450760508982"
# False alarm rate = mean of the false-positive rate FP / (FP + TN) and the
# false-negative rate FN / (FN + TP); both are taken over the actual classes
# (columns of cm).
fpr_fnr <- fp / (fp + tn) + fn / (fn + tp)
False_Alarm_Rate <- 100 * fpr_fnr / 2
print(paste0("False Alarm Rate: ", False_Alarm_Rate))
# For the confusion matrix recorded above this is about 20.93; the previously
# recorded 14.3224086587368 averaged row-wise (per predicted class) rates.