Xiangzhu Long (xiangzhl)
During the last decade, attacks on networks have attracted the attention of many researchers. To defend against these unfriendly visitors, we need to build a network intrusion detector that can distinguish "bad" connections (intrusions or attacks) from "good" (normal) connections.
This database (KDD Cup 1999 Data) contains a standard set of data to be audited, which includes a wide variety of intrusions simulated in a military network environment.
Attacks fall into four main categories:
| Category | Description | Example |
|----------|-------------|---------|
| DOS | denial-of-service | SYN flood |
| R2L | unauthorized access from a remote machine | guessing passwords |
| U2R | unauthorized access to local superuser privileges | buffer overflow attacks |
| probing | surveillance and other probing | port scanning |
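As a hedged illustration (only a few labels are shown, and the spellings with trailing dots are assumed from the raw KDD files), the fine-grained labels in the data could be grouped into these categories with a simple lookup vector:

# Hypothetical partial mapping from raw connection labels to categories.
attack_category <- c("smurf." = "DOS", "neptune." = "DOS",
                     "guess_passwd." = "R2L", "buffer_overflow." = "U2R",
                     "portsweep." = "probing", "normal." = "normal")
attack_category["smurf."]   # "DOS"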
# Clear the workspace
rm( list = ls() )
# Load the data
train_raw <- read.csv("data/traindata_10percent.csv", stringsAsFactors = FALSE)
# Process the data
colnames <- read.table("data/names", skip = 1, sep = ":")
names(train_raw) <- colnames$V1
d <- dim(train_raw)
names(train_raw)[d[2]] <- "label"
# Observe the data
names(train_raw)
##  [1] "duration"                    "protocol_type"
##  [3] "service"                     "flag"
##  [5] "src_bytes"                   "dst_bytes"
##  [7] "land"                        "wrong_fragment"
##  [9] "urgent"                      "hot"
## [11] "num_failed_logins"           "logged_in"
## [13] "num_compromised"             "root_shell"
## [15] "su_attempted"                "num_root"
## [17] "num_file_creations"          "num_shells"
## [19] "num_access_files"            "num_outbound_cmds"
## [21] "is_host_login"               "is_guest_login"
## [23] "count"                       "srv_count"
## [25] "serror_rate"                 "srv_serror_rate"
## [27] "rerror_rate"                 "srv_rerror_rate"
## [29] "same_srv_rate"               "diff_srv_rate"
## [31] "srv_diff_host_rate"          "dst_host_count"
## [33] "dst_host_srv_count"          "dst_host_same_srv_rate"
## [35] "dst_host_diff_srv_rate"      "dst_host_same_src_port_rate"
## [37] "dst_host_srv_diff_host_rate" "dst_host_serror_rate"
## [39] "dst_host_srv_serror_rate"    "dst_host_rerror_rate"
## [41] "dst_host_srv_rerror_rate"    "label"
We can see that the 10% kddcup training data has 494020 observations; each observation has 41 features, and the last column is its label.
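Before modeling, it is useful to know how many columns are symbolic (character) versus numeric, since the character ones will need special handling later; a minimal sketch:

# Count columns by storage class; protocol_type, service, flag and the
# label are read as character, the remaining features as integer/numeric.
table(sapply(train_raw, class))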
# Observe the distribution of labels.
sum_label <- aggregate(rep(1, d[1]),
by = list(train_raw$label),
FUN = sum)
names(sum_label) <- c("label", "count")
barplot(beside = TRUE, log10(sum_label$count),
names.arg = sum_label$label, ylim = c(0,6),
xlab = "Label", ylab = "log10(Count)",
col = "Blue", main = "The distribution of labels")
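To complement the barplot, a short sketch that prints the most frequent labels with their raw counts (using the same sum_label data frame built above):

# Show the ten most common labels, sorted by count.
head(sum_label[order(sum_label$count, decreasing = TRUE), ], 10)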
# Select the features
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.1.3
# Check the label for NA values.
l <- train_raw$label
sum(is.na(l))
## [1] 0
There is no NA value, which makes our work simpler.
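Beyond the label column, a quick check over the whole data frame can confirm whether any column contains missing values; a minimal sketch:

# Total number of NA cells across the entire training data frame.
sum(is.na(train_raw))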
# Clean up near zero variance features
nzvcol <- nearZeroVar(train_raw)
train_raw <- train_raw[, -nzvcol]
# Convert the label into a factor
training <- train_raw
training$label <- factor(training$label)
d <- dim(training)
After preprocessing, the 10% training data has 494020 observations and 19 columns (18 features plus the label).
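For reference, a hedged sketch of how one could inspect why those columns were flagged; this would be run on train_raw before the near-zero-variance columns are removed:

# Frequency ratio and percent-unique for every column; rows with
# nzv == TRUE are the near-zero-variance features dropped above.
nzv_metrics <- nearZeroVar(train_raw, saveMetrics = TRUE)
nzv_metrics[nzv_metrics$nzv, ]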
# Load the data
test_raw <- read.csv("data/testdata_10percent.csv", stringsAsFactors = FALSE)
# Process the data
names(test_raw) <- colnames$V1
names(test_raw)[dim(test_raw)[2]] <- "label"
# Extract the same features as training data
colnames_train <- names(training)
test_raw <- test_raw[ , colnames_train]
testing <- test_raw
testing$label <- as.factor(testing$label)
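One property worth checking at this point is whether the test set contains attack labels that never appear in the training data (a known characteristic of the KDD Cup 1999 test set); a minimal sketch:

# Labels present in the test set but absent from the training set;
# no classifier trained on the training data can predict these correctly.
setdiff(levels(testing$label), levels(training$label))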
library(e1071)
# Build the model
label_result = training[ ,d[2]]
training_data = training[ ,1:(d[2]-1)]
navie_bayes_tree_model = naiveBayes(as.factor(label_result)~.,
training_data)
# Predict the testing
testing_data = testing[ , 1: (d[2]-1)]
navie_bayes_pred = predict(navie_bayes_tree_model, testing_data)
## Warning in data.matrix(newdata): NAs introduced by coercion
## Warning in data.matrix(newdata): NAs introduced by coercion
## Warning in data.matrix(newdata): NAs introduced by coercion
golden_answer = testing[ , d[2]]
navie_bayes_pred = factor(navie_bayes_pred, levels =levels(golden_answer))
# Get the accuracy
NB_accuracy <- mean(golden_answer == navie_bayes_pred,na.rm = TRUE)
The accuracy of Naive Bayes is 0.6156518.
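Accuracy alone does not show which classes are confused. A minimal sketch (using the objects defined above) cross-tabulates predictions against the true labels:

# Cross-tabulate predicted vs. actual labels; table() silently drops the
# NA predictions introduced when aligning the factor levels.
table(predicted = navie_bayes_pred, actual = golden_answer)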
# Random Forest
library(randomForest)
library(doParallel)
# Set the random seed for reproducibility
set.seed(3433)
# Register a parallel backend so caret can use all available cores
registerDoParallel(makeCluster(detectCores()))
model_rf <- train(label ~ ., method = "rf", data = training)
# Decision Tree
library(rpart)
decision_tree_model <- rpart(label ~ ., data = training, method = "class")
# Predicting:
decision_tree_pred <- predict(decision_tree_model, testing_data, type = "class")
# Plot of the Decision Tree
library(rpart.plot)
rpart.plot(decision_tree_model, main = "Classification Tree",
           extra = 102, under = TRUE, faclen = 0)
# Test results on our testing data set:
confusionMatrix(decision_tree_pred, testing$label)
In practice, the model training kept running without finishing, so no successful model was obtained here; a possible workaround is sketched below.
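One possible way to get a random forest to finish in reasonable time (a sketch under my own assumptions, not the configuration attempted above) is to train on a random subsample with fewer trees and lighter resampling:

# Sketch: subsample 50,000 rows, drop now-empty label levels, and limit
# caret to 3-fold cross-validation with 100 trees per forest.
set.seed(3433)
idx <- sample(nrow(training), 50000)
ctrl <- trainControl(method = "cv", number = 3)
model_rf_small <- train(label ~ ., method = "rf",
                        data = droplevels(training[idx, ]),
                        trControl = ctrl, ntree = 100)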
After a deeper look at the data and a reading of the article A Detailed Analysis of the KDD CUP 99 Data Set, two issues with the "KDD CUP 99 Data Set" stand out:
1. There is a huge number of redundant records.
2. The reported prediction accuracy is unbelievably high.
# Redundant records in training set
d_train <- dim(train_raw)
d_uniqe_train <- dim(unique(train_raw))
d_train_percent <- (d_train[1] - d_uniqe_train[1]) / d_train[1]
In the 10 percent training dataset, there are 494020 observations, but only 110192 of them are distinct, which means a fraction of 0.7769483 of the records are redundant.
# Redundant records in testing set
d_test <- dim(test_raw)
d_uniqe_test <- dim(unique(test_raw))
d_test_percent <- (d_test[1] - d_uniqe_test[1]) / d_test[1]
In the 10 percent testing dataset, there are 311028 observations, but only 64873 of them are distinct, which means a fraction of 0.7914239 of the records are redundant.
# Clear the workspace
rm( list = ls() )
# Load the data
train_raw <- read.csv("data/traindata_full.csv", stringsAsFactors = FALSE)
# Process the data
colnames <- read.table("data/names", skip = 1, sep = ":")
names(train_raw) <- colnames$V1
d <- dim(train_raw)
names(train_raw)[d[2]] <- "label"
We can see the full kddcup training data has 4898430 observations.
# Observe the distribution of labels.
sum_label <- aggregate(rep(1, d[1]),
by = list(train_raw$label),
FUN = sum)
names(sum_label) <- c("label", "count")
barplot(beside = TRUE, log10(sum_label$count),
names.arg = sum_label$label, ylim = c(0,6),
xlab = "Label", ylab = "log10(Count)",
col = "Blue", main = "The distribution of labels")
library(caret)
# Clean up near zero variance features
nzvcol <- nearZeroVar(train_raw)
train_raw <- train_raw[, -nzvcol]
# Delete the duplicate data in the training set.
training_engineer <- unique(train_raw)
d_unique <- dim(training_engineer)
d_percent <- (d[1] - d_unique[1]) / d[1]
After removing the duplicate data, the new training dataset contains 587351 observations, which means a fraction of 0.880094 of the original training records were duplicates.
d <- d_unique
# Numeralization: map each categorical value to an integer code.
numeralize <- function(col) {
  # Find the distinct categories in this column.
  char <- unique(col)
  # Replace each value with the index of its category.
  for (i in 1:length(char)) {
    col <- replace(col, col == char[i], i)
  }
  col
}
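A quick usage example on a toy vector shows what numeralize() returns; note that the result is still of character class, because replace() keeps the class of the input vector:

# Each distinct value is coded by the order of its first appearance.
numeralize(c("tcp", "udp", "tcp", "icmp"))   # "1" "2" "1" "3" (still character)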
# Process the columns of character class.
training_engineer <- within(training_engineer, {
protocol_type <- numeralize(protocol_type)
service <- numeralize(service)
flag <- numeralize(flag)
label <- numeralize(label)
} )
summary(training_engineer)
## protocol_type service flag
## Length:587351 Length:587351 Length:587351
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## src_bytes logged_in count srv_count
## Min. :0.00e+00 Min. :0.0000 Min. : 0.0 Min. : 0.00
## 1st Qu.:0.00e+00 1st Qu.:0.0000 1st Qu.: 2.0 1st Qu.: 2.00
## Median :2.25e+02 Median :1.0000 Median : 9.0 Median : 9.00
## Mean :1.01e+04 Mean :0.6342 Mean : 56.1 Mean : 15.41
## 3rd Qu.:3.28e+02 3rd Qu.:1.0000 3rd Qu.: 73.0 3rd Qu.: 18.00
## Max. :1.38e+09 Max. :1.0000 Max. :511.0 Max. :511.00
## serror_rate srv_serror_rate rerror_rate srv_rerror_rate
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.00000 Median :0.0000
## Mean :0.1924 Mean :0.1936 Mean :0.08551 Mean :0.0851
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## diff_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.02000 Median :0.01000
## Mean :0.04111 Mean :0.05219 Mean :0.07992
## 3rd Qu.:0.05000 3rd Qu.:0.07000 3rd Qu.:0.04000
## Max. :1.00000 Max. :1.00000 Max. :1.00000
## dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.1933 Mean :0.1924 Mean :0.08902
## 3rd Qu.:0.0100 3rd Qu.:0.0100 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000
## dst_host_srv_rerror_rate label
## Min. :0.00000 Length:587351
## 1st Qu.:0.00000 Class :character
## Median :0.00000 Mode :character
## Mean :0.08919
## 3rd Qu.:0.00000
## Max. :1.00000
The new training dataset contains 587351 observations, and the symbolic columns (protocol_type, service, flag and label) have been mapped to integer codes, although, as the summary shows, they are still stored as character strings.
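If genuinely numeric columns were needed (for example, for distance-based methods), a hedged follow-up step could convert the coded columns, since the codes are plain digit strings; training_numeric below is only an illustration and is not used later:

# Convert every column to numeric; as.numeric() is lossless here because
# the numeralize() codes are simple digit strings.
training_numeric <- as.data.frame(lapply(training_engineer, as.numeric))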
# Load the data
test_raw <- read.csv("data/testdata_10percent.csv", stringsAsFactors = FALSE)
# Process the data
names(test_raw) <- colnames$V1
names(test_raw)[dim(test_raw)[2]] <- "label"
# Extract the same features as training data
colnames_train <- names(training_engineer)
test_raw <- test_raw[ , colnames_train]
# Apply the same numeralization feature engineering to the testing dataset.
# Note: we do not delete the duplicate records in the testing set.
# (numeralize assigns codes by order of appearance, so the test-set codes
# are derived independently of the training-set codes.)
testing_engineer <- within(test_raw, {
protocol_type <- numeralize(protocol_type)
service <- numeralize(service)
flag <- numeralize(flag)
label <- numeralize(label)
} )
# Build the model
label_result = training_engineer[ ,d[2]]
training_data = training_engineer[ ,1:(d[2]-1)]
navie_bayes_tree_model = naiveBayes(label_result~.,
training_data)
# Predict the testing
testing_data = testing_engineer[, 1: (d[2]-1)]
navie_bayes_pred = predict(navie_bayes_tree_model, testing_data)
golden_answer = factor(testing_engineer[ , d[2]])
navie_bayes_pred = factor(navie_bayes_pred, levels = levels(golden_answer))
# Get the accuracy
NB_accuracy <- mean(golden_answer == navie_bayes_pred,na.rm = FALSE)
The accuracy is 91.6%, which is a substantial improvement over the earlier model.
There is still a lot more to work on:
1. Implement cross validation to make better predictions.
2. Take precision and recall into consideration to better evaluate the model (see the sketch below).
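As a hedged sketch of the second point (using the prediction objects from the last model above), per-class precision and recall can be computed from a cross-tabulation with aligned factor levels:

# Align the predicted and actual labels on a common set of levels.
lv  <- union(unique(golden_answer), unique(navie_bayes_pred))
tab <- table(predicted = factor(navie_bayes_pred, levels = lv),
             actual    = factor(golden_answer, levels = lv))
# Recall: correct predictions of a class divided by the actual class count.
recall    <- diag(tab) / colSums(tab)
# Precision: correct predictions of a class divided by the predicted count.
precision <- diag(tab) / rowSums(tab)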