# Write the current workspace to final.RData, then read that same file back.
# NOTE(review): run top-to-bottom, save.image() overwrites any existing
# final.RData before load() reads it, so objects cached only in that file
# would be lost. Presumably only one of these two lines is meant to be run at
# a time -- confirm.
save.image("final.RData")
load("final.RData")
流程圖
# Embed the flow-chart image in the rendered report.
knitr::include_graphics(path = "asset/flow_chart.png")

載入train跟test資料集
# Read the two comma-separated NSL-KDD files. They are read without a header
# row, so the columns get the default names V1, V2, ...
train <- read.delim(
  file = "NSL-KDD_R2/NSL-KDDTrain+.txt",
  header = FALSE,
  sep = ","
)
test <- read.delim(
  file = "NSL-KDD_R2/NSL-KDDTest+.txt",
  header = FALSE,
  sep = ","
)
split training / testing
# Fix the RNG seed so the partition is reproducible, then keep the rows picked
# by createDataPartition() (p = 0.8) as data_train and the rest as data_test.
set.seed(2020)
trainIndex <- createDataPartition(train$V42, p = 0.8)[["Resample1"]]
data_train <- train[trainIndex, ]
data_test <- train[-trainIndex, ]
# Check the class balance of the label column in the training part.
print(table(data_train$V42))
mlp model
# Fit a multilayer perceptron with one hidden layer of 10 units on columns
# 1-41, using column 42 as the target. data_test is passed as the test set so
# the iterative test error is tracked alongside the training error.
mlp_model <- mlp(
  data_train[, 1:41],
  data_train[, 42],
  size = 10,
  maxit = 200,
  initFunc = "Randomize_Weights",
  initFuncParams = c(-0.3, 0.3),
  learnFunc = "Std_Backpropagation",
  learnFuncParams = c(0.3, 0),
  updateFunc = "Topological_Order",
  updateFuncParams = c(0),
  inputsTest = data_test[, 1:41],
  targetsTest = data_test[, 42]
)
# Raw (continuous) network outputs for both partitions.
mlp_train_predict <- predict(mlp_model, data_train[, 1:41])
mlp_test_predict <- predict(mlp_model, data_test[, 1:41])
# Switch the graphics device to a 2 x 2 panel layout and plot the error curve.
par(mfrow = c(2, 2))
plotIterativeError(mlp_model)

threshold
# Choose the score cutoff used to turn MLP outputs into 0/1 labels.
# NOTE(review): the cutoff is tuned on data_test, the same partition that the
# test accuracy is later reported on.
optCutOff <- optimalCutoff(
  actuals = data_test[, 42],
  predictedScores = mlp_test_predict
)[1]
optCutOff
## [1] 0.35
confusion matrix / accuracy
training set
# Train confusion matrix: binarise the MLP scores at optCutOff, then
# cross-tabulate actual labels (rows) against predicted labels (columns).
mlp_train_predict <- ifelse(mlp_train_predict > optCutOff, 1, 0)
trainTable <- table(data_train$V42, mlp_train_predict)
trainTable
## mlp_train_predict
## 0 1
## 0 52334 1561
## 1 5098 41785
# Train accuracy: share of observations on the diagonal.
sum(diag(trainTable)) / sum(trainTable)
## [1] 0.9339241
testing set
# Test confusion matrix: binarise the MLP scores at optCutOff, then
# cross-tabulate actual labels (rows) against predicted labels (columns).
mlp_test_predict <- ifelse(mlp_test_predict > optCutOff, 1, 0)
testTable <- table(data_test$V42, mlp_test_predict)
testTable
## mlp_test_predict
## 0 1
## 0 13069 379
## 1 1294 10453
# Test accuracy: share of observations on the diagonal.
sum(diag(testTable)) / sum(testTable)
## [1] 0.9335979
預測結果
# Score the `test` data frame with the MLP, binarise the first output column
# at optCutOff, and write the labels as one undelimited string of 0/1 digits.
predictions <- predict(mlp_model, test, type = "class")
mlp_predict <- ifelse(predictions[, 1] > optCutOff, 1, 0)
mlp_predict <- as.numeric(as.character(mlp_predict))
mlp_predict <- paste(mlp_predict, collapse = "")
cat(mlp_predict, file = "mlp_predict.txt")
knitr::include_graphics("asset/mlp.png")

Naive Bayes
library(e1071)
建立模型
# Fit a naive Bayes classifier for V42 (as a factor) on all other columns,
# then predict class labels for both partitions.
nb_model <- naiveBayes(
  as.factor(V42) ~ .,
  data = data_train
)
nb_train_predict <- predict(nb_model, newdata = data_train, type = "class")
nb_test_predict <- predict(nb_model, newdata = data_test, type = "class")
confusion matrix / accuracy
training set
# Train confusion matrix: actual labels (rows) vs naive Bayes predictions
# (columns).
trainTable <- table(data_train$V42, nb_train_predict)
trainTable
## nb_train_predict
## 0 1
## 0 47507 6388
## 1 5217 41666
# Train accuracy: share of observations on the diagonal.
sum(diag(trainTable)) / sum(trainTable)
## [1] 0.8848459
testing set
# Test confusion matrix: actual labels (rows) vs naive Bayes predictions
# (columns).
testTable <- table(data_test$V42, nb_test_predict)
testTable
## nb_test_predict
## 0 1
## 0 11872 1576
## 1 1261 10486
# Test accuracy: share of observations on the diagonal.
sum(diag(testTable)) / sum(testTable)
## [1] 0.8873983
預測結果
# Predict classes for the `test` data frame, convert the factor labels to
# numbers via their character form, and write them as one undelimited string.
nb_predict <- predict(nb_model, newdata = test, type = "class")
nb_predict <- as.numeric(as.character(nb_predict))
nb_predict <- paste(nb_predict, collapse = "")
cat(nb_predict, file = "nb_predict.txt")
knitr::include_graphics("asset/nb.png")

SVM
library(e1071)
建立模型
# Radial-kernel SVM for V42 (as a factor) on all other columns.
# The original code fitted the model and then unconditionally overwrote it
# with `svm_model_radial`, whose only definition was commented out, so a
# fresh session stopped with "object not found" after a wasted fit. Reuse the
# cached model when it is present in the workspace; otherwise fit it and
# cache it under that name.
if (exists("svm_model_radial")) {
  svm_model <- svm_model_radial
} else {
  svm_model <- svm(
    as.factor(V42) ~ .,
    data = data_train,
    kernel = "radial",
    cost = 9,
    scale = FALSE
  )
  svm_model_radial <- svm_model
}
# Predicted class labels for both partitions.
svm_train_predict <- predict(svm_model, data_train, type = "class")
svm_test_predict <- predict(svm_model, data_test, type = "class")
confusion matrix / accuracy
training set
# Train confusion matrix: actual labels (rows) vs SVM predictions (columns).
trainTable <- table(data_train$V42, svm_train_predict)
trainTable
## svm_train_predict
## 0 1
## 0 53886 9
## 1 6 46877
# Train accuracy: share of observations on the diagonal.
sum(diag(trainTable)) / sum(trainTable)
## [1] 0.9998512
testing set
# Test confusion matrix: actual labels (rows) vs SVM predictions (columns).
testTable <- table(data_test$V42, svm_test_predict)
testTable
## svm_test_predict
## 0 1
## 0 13439 9
## 1 363 11384
# Test accuracy: share of observations on the diagonal.
sum(diag(testTable)) / sum(testTable)
## [1] 0.9852352
預測結果
# Predict classes for the `test` data frame, convert the factor labels to
# numbers via their character form, and write them as one undelimited string.
svm_predict <- predict(svm_model, test, type = "class")
svm_predict <- as.numeric(as.character(svm_predict))
svm_predict <- paste(svm_predict, collapse = "")
cat(svm_predict, file = "svm_predict.txt")
knitr::include_graphics("asset/svm.png")

decision tree
require(tree)
## Loading required package: tree
建立模型
# Fit a classification tree for V42 (as a factor) on all other columns,
# predict class labels for both partitions, and draw the tree.
dt_model <- tree(
  as.factor(V42) ~ .,
  data = data_train
)
dt_train_predict <- predict(dt_model, newdata = data_train, type = "class")
dt_test_predict <- predict(dt_model, newdata = data_test, type = "class")
plot(dt_model)

confusion matrix / accuracy
training set
# Train confusion matrix: actual labels (rows) vs decision-tree predictions
# (columns).
trainTable <- table(data_train[, 42], dt_train_predict)
trainTable
## dt_train_predict
## 0 1
## 0 52687 1208
## 1 692 46191
# Train accuracy. This previously divided the diagonal of `testTable`, which
# at this point still held the SVM test table, so the SVM test accuracy was
# reported here. The value below is recomputed from the table above:
# (52687 + 46191) / 100778.
sum(diag(trainTable)) / sum(trainTable)
## [1] 0.9811467
testing set
# Test confusion matrix: actual labels (rows) vs decision-tree predictions
# (columns).
testTable <- table(data_test$V42, dt_test_predict)
testTable
## dt_test_predict
## 0 1
## 0 13136 312
## 1 173 11574
# Test accuracy: share of observations on the diagonal.
sum(diag(testTable)) / sum(testTable)
## [1] 0.9807501
預測結果
# Predict classes for the `test` data frame, convert the factor labels to
# numbers via their character form, and write them as one undelimited string.
dt_predict <- predict(dt_model, newdata = test, type = "class")
dt_predict <- as.numeric(as.character(dt_predict))
dt_predict <- paste(dt_predict, collapse = "")
cat(dt_predict, file = "dt_predict.txt")
knitr::include_graphics("asset/dt.png")

random forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
建立模型
# Fit a 100-tree random forest for V42 (as a factor) on all other columns,
# trying 2 variables per split and recording variable importance. Print the
# fit summary, then predict class labels for both partitions.
rf_model <- randomForest(
  as.factor(V42) ~ .,
  data = data_train,
  ntree = 100,
  mtry = 2,
  importance = TRUE
)
rf_model
rf_train_predict <- predict(rf_model, newdata = data_train, type = "class")
rf_test_predict <- predict(rf_model, newdata = data_test, type = "class")
confusion matrix / accuracy
training set
# Train confusion matrix: actual labels (rows) vs random-forest predictions
# (columns).
trainTable <- table(data_train[, 42], rf_train_predict)
trainTable
## rf_train_predict
## 0 1
## 0 53729 166
## 1 506 46377
# Train accuracy. This previously divided the diagonal of `testTable`, which
# at this point still held the decision-tree test table, so the wrong model's
# test accuracy was reported here. The value below is recomputed from the
# table above: (53729 + 46377) / 100778.
sum(diag(trainTable)) / sum(trainTable)
## [1] 0.9933319
testing set
# Test confusion matrix: actual labels (rows) vs random-forest predictions
# (columns).
testTable <- table(data_test$V42, rf_test_predict)
testTable
## rf_test_predict
## 0 1
## 0 13411 37
## 1 135 11612
# Test accuracy: share of observations on the diagonal.
sum(diag(testTable)) / sum(testTable)
## [1] 0.9931732
預測結果
# Predict classes for the `test` data frame, convert the factor labels to
# numbers via their character form, and write them as one undelimited string.
rf_predict <- predict(rf_model, newdata = test, type = "class")
rf_predict <- as.numeric(as.character(rf_predict))
rf_predict <- paste(rf_predict, collapse = "")
cat(rf_predict, file = "rf_predict.txt")
knitr::include_graphics("asset/rf.png")

boosting tree
require(gbm)
## Loading required package: gbm
## Loaded gbm 2.1.5
建立模型
# Fit a gradient-boosted model of V42 on all other columns with squared-error
# ("gaussian") loss; the numeric predictions are thresholded into 0/1 labels
# further down. summary() reports the relative influence of each variable.
# NOTE(review): V42 is tabulated as 0/1 elsewhere in this file, so gbm's
# "bernoulli" distribution may be the more natural choice -- confirm.
boost_model <- gbm(
  V42 ~ .,
  data = data_train,
  distribution = "gaussian",
  n.trees = 10000,
  shrinkage = 0.01,
  interaction.depth = 4
)
summary(boost_model)

## var rel.inf
## V5 V5 6.507191e+01
## V6 V6 1.331176e+01
## V2 V2 4.663422e+00
## V23 V23 3.925766e+00
## V36 V36 3.156789e+00
## V33 V33 2.798507e+00
## V10 V10 1.399043e+00
## V37 V37 9.081747e-01
## V3 V3 7.842363e-01
## V34 V34 7.162909e-01
## V24 V24 4.667621e-01
## V1 V1 4.440366e-01
## V30 V30 3.249130e-01
## V4 V4 2.952838e-01
## V8 V8 2.561435e-01
## V32 V32 2.445494e-01
## V35 V35 2.081042e-01
## V40 V40 1.627644e-01
## V13 V13 1.481586e-01
## V38 V38 1.338889e-01
## V41 V41 8.654154e-02
## V29 V29 7.283400e-02
## V11 V11 6.700040e-02
## V12 V12 5.431903e-02
## V7 V7 4.908417e-02
## V27 V27 4.156999e-02
## V39 V39 3.656923e-02
## V22 V22 3.263884e-02
## V25 V25 3.189762e-02
## V26 V26 2.935084e-02
## V17 V17 2.733492e-02
## V16 V16 2.507761e-02
## V14 V14 7.859482e-03
## V28 V28 5.994308e-03
## V31 V31 4.634516e-03
## V18 V18 3.732513e-03
## V15 V15 2.133563e-03
## V19 V19 9.199135e-04
## V9 V9 0.000000e+00
## V20 V20 0.000000e+00
## V21 V21 0.000000e+00
# Partial-dependence plots for the five variables with the highest relative
# influence in the summary above. The argument is spelled out as `i.var`
# rather than relying on partial matching of `i`.
plot(boost_model, i.var = "V5")

plot(boost_model, i.var = "V6")

plot(boost_model, i.var = "V2")

plot(boost_model, i.var = "V23")

plot(boost_model, i.var = "V36")

# Test-set mean squared error as a function of the number of trees used for
# prediction (100, 200, ..., 10000): one column of predmat per tree count.
n.trees <- seq(from = 100, to = 10000, by = 100)
predmat <- predict(boost_model, newdata = data_test, n.trees = n.trees)
boost.err <- with(data_test, apply((predmat - V42)^2, 2, mean))
# Draw the error curve, then mark its minimum with a red horizontal line.
# These were previously joined with `+`, which added the two NULL return
# values and printed a stray `integer(0)`; base graphics calls are separate
# statements.
plot(
  n.trees, boost.err,
  pch = 23,
  ylab = "Mean Squared Error",
  xlab = "# Trees",
  main = "Boosting Test Error"
)
abline(h = min(boost.err), col = "red")

# Numeric predictions from the full ensemble for both partitions.
boost_train_predict <- predict(boost_model, newdata = data_train, n.trees = max(n.trees), type = "response")
boost_test_predict <- predict(boost_model, newdata = data_test, n.trees = max(n.trees), type = "response")
# Cutoff used to turn the boosted scores into 0/1 labels.
# NOTE(review): this overwrites the MLP cutoff stored in optCutOff and is
# tuned on data_test, the partition the test accuracy is reported on.
optCutOff <- optimalCutoff(actuals = data_test[, 42], predictedScores = boost_test_predict)[1]
optCutOff
confusion matrix / accuracy
training set
# Binarise the boosted training scores at optCutOff.
boost_train_predict <- ifelse(boost_train_predict > optCutOff, 1, 0)
# Train confusion matrix: actual labels (rows) vs boosted predictions
# (columns).
trainTable <- table(data_train[, 42], boost_train_predict)
trainTable
## boost_train_predict
## 0 1
## 0 53833 62
## 1 58 46825
# Train accuracy. This previously divided the diagonal of `testTable`, which
# at this point still held the random-forest test table, so the wrong model's
# test accuracy was reported here. The value below is recomputed from the
# table above: (53833 + 46825) / 100778.
sum(diag(trainTable)) / sum(trainTable)
## [1] 0.9988093
testing set
# Binarise the boosted test scores at optCutOff.
boost_test_predict <- ifelse(boost_test_predict > optCutOff, 1, 0)
# Test confusion matrix: actual labels (rows) vs boosted predictions
# (columns).
testTable <- table(data_test$V42, boost_test_predict)
testTable
## boost_test_predict
## 0 1
## 0 13424 24
## 1 15 11732
# Test accuracy: share of observations on the diagonal.
sum(diag(testTable)) / sum(testTable)
## [1] 0.9984521
預測結果
# Score the `test` data frame with the full ensemble, binarise at optCutOff,
# and write the labels as one undelimited string of 0/1 digits.
boost_predict <- predict(
  boost_model,
  newdata = test,
  n.trees = max(n.trees),
  type = "response"
)
boost_predict <- ifelse(boost_predict > optCutOff, 1, 0)
boost_predict <- as.numeric(as.character(boost_predict))
boost_predict <- paste(boost_predict, collapse = "")
cat(boost_predict, file = "boost_predict.txt")
knitr::include_graphics("asset/boost.png")

KNN
library(class)
library(dbscan)
建立模型
# k-nearest-neighbour classification of data_test with k = 13.
# Only the feature columns 1-41 are passed as predictors. The original call
# passed the whole data frames, so the label column V42 itself took part in
# the distance computation (label leakage), and it did not match the final
# prediction step, which uses columns 1-41 only.
knn_model <- knn(
  train = data_train[, 1:41],
  test = data_test[, 1:41],
  cl = data_train$V42,
  k = 13
)
# Accuracy in percent, followed by the confusion matrix (actual labels in
# rows, predictions in columns).
ACC_knn <- 100 * sum(data_test[, 42] == knn_model) / NROW(data_test[, 42])
ACC_knn
table(data_test[, 42], knn_model)
confusion matrix / accuracy
training set
# Train confusion matrix for k = 9: actual labels (rows) vs KNN predictions
# (columns). Only the feature columns 1-41 are used as predictors; the
# original call passed the whole data frame, so the label column V42 leaked
# into the distance computation.
knn_train_predict <- knn(
  train = data_train[, 1:41],
  test = data_train[, 1:41],
  cl = data_train[, 42],
  k = 9
)
trainTable <- table(data_train[, 42], knn_train_predict)
trainTable
# NOTE: the output below was recorded while the label column was still among
# the predictors; re-run to refresh it.
## knn_train_predict
## 0 1
## 0 53636 259
## 1 196 46687
sum(diag(trainTable)) / sum(trainTable)
## [1] 0.9954851
testing set
# Test confusion matrix for k = 9: actual labels (rows) vs KNN predictions
# (columns). Only the feature columns 1-41 are used as predictors; the
# original call passed the whole data frames, so the label column V42 leaked
# into the distance computation and the accuracy did not reflect the model
# used for the final predictions (which uses columns 1-41 only).
knn_test_predict <- knn(
  train = data_train[, 1:41],
  test = data_test[, 1:41],
  cl = data_train[, 42],
  k = 9
)
testTable <- table(data_test[, 42], knn_test_predict)
testTable
# NOTE: the output below was recorded while the label column was still among
# the predictors; re-run to refresh it.
## knn_test_predict
## 0 1
## 0 13373 75
## 1 46 11701
sum(diag(testTable)) / sum(testTable)
## [1] 0.9951975
#i=1
#k.optm=1
#for (i in 1:28){
# knn.mod <- knn(train = data_train, test = data_test, cl = data_train[,42], k=i)
# k.optm[i] <- 100 * sum(data_test[,42] == knn.mod)/NROW(data_test[,42])
# k=i
# cat(k,'=',k.optm[i],'')
#}
預測結果
# Classify the `test` data frame with k = 9 using feature columns 1-41 of
# data_train, convert the factor labels to numbers via their character form,
# and write them as one undelimited string.
knn_predict <- knn(
  data_train[, 1:41],
  test,
  cl = data_train[, 42],
  k = 9
)
knn_predict <- as.numeric(as.character(knn_predict))
knn_predict <- paste(knn_predict, collapse = "")
cat(knn_predict, file = "knn_predict_9.txt")
knitr::include_graphics("asset/knn9.png")
