library(dplyr)
library(readr)
library(caret)
library(doMC)
# Load every labeled capture file whose name matches the pattern and stack them
# into a single data frame. The files are read into a list and bound once,
# instead of growing the data frame with rbind() on every loop iteration.
capture_files <- list.files(
  "data/labeled/",
  pattern = "capture2011081[012345689](-[123])?.binetflow.labels",
  full.names = TRUE
)
if (length(capture_files) == 0) {
  # Fail early with a clear message rather than erroring later in mutate().
  stop("No labeled capture files found in data/labeled/", call. = FALSE)
}
features_data <- do.call(rbind, lapply(capture_files, readr::read_csv))

# Collapse the raw labels into two classes: "infected" becomes "botnet", every
# other value becomes "normal". The result is stored as a factor.
features_data <- features_data %>%
  mutate(label = as.factor(ifelse(label == "infected", "botnet", "normal")))
features_data
# Keep only the rows whose cluster differs from `begnin_cluster`.
# NOTE(review): `features_cluster` and `begnin_cluster` are not defined in this
# section; presumably they come from an earlier clustering step -- confirm.
features_data_reduced <- features_cluster %>%
  filter(cluster != begnin_cluster)

# Persist the reduced feature table.
readr::write_csv(
  features_data_reduced,
  file = "/home/harpo/Dropbox/graph-reatures-reduced.csv"
)

# Split on the label: p = 0.8 of the rows go to the training set, the rest to
# the test set. list = FALSE returns the indices in a form usable for row
# subsetting.
# NOTE(review): no set.seed() call is visible in this section, so the partition
# may differ between runs -- confirm whether a seed is set elsewhere.
train_index <- caret::createDataPartition(
  features_data_reduced$label,
  p = 0.8,
  list = FALSE
)
trainset <- features_data_reduced[train_index, ]
testset <- features_data_reduced[-train_index, ]

# Class counts in each split.
trainset %>% group_by(label) %>% count()
testset %>% group_by(label) %>% count()
We are going to use 10-fold cross-validation.
# Register the doMC parallel backend with 8 cores.
registerDoMC(cores = 8)

# Resampling setup: 10-fold cross-validation with up-sampling applied during
# resampling. Class probabilities are computed and twoClassSummary is used as
# the summary function. Only the final model's resamples and predictions are
# kept.
ctrl_fast <- trainControl(
  method = "cv",
  number = 10,
  returnResamp = "final",
  savePredictions = "final",
  verboseIter = FALSE,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  allowParallel = TRUE,
  sampling = "up" # UPSAMPLING
)

# Fixed single-point tuning grid. It is not used while `tuneGrid` stays
# commented out in the train() call below.
tgrid <- expand.grid(
  mtry = 6,
  splitrule = "gini",
  min.node.size = 10
)

# Features used for training.
bot_formula <- formula(label ~ ID + OD + IDW + ODW + BC + LCC + AC)

# Fit a random forest (method "ranger") on the training set, selecting the
# tuning parameters by the "Sens" metric. The arguments after `trControl` are
# extra arguments beyond train()'s own; presumably forwarded to ranger.
rfFit <- caret::train(
  bot_formula,
  data = trainset,
  method = "ranger",
  # tuneGrid = tgrid,
  # metric = "Kappa",
  metric = "Sens",
  tuneLength = 10,
  verbose = 2,
  trControl = ctrl_fast,
  num.trees = 100,
  max.depth = 6
  # class.weights = c(746.75, 1)
)
note: only 6 unique complexity parameters in default grid. Truncating the grid to 6 .
# Print the fitted model: resampling results per tuning combination and the
# tuning parameters selected for the final model.
rfFit
Random Forest
14955 samples
7 predictor
2 classes: 'botnet', 'normal'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 13459, 13460, 13460, 13459, 13459, 13460, ...
Addtional sampling using up-sampling
Resampling results across tuning parameters:
mtry splitrule ROC Sens Spec
2 gini 0.9755126 0.55 0.9758952
2 extratrees 0.9746867 0.70 0.9794430
3 gini 0.9748760 0.60 0.9716770
3 extratrees 0.9814846 0.75 0.9797122
4 gini 0.9514487 0.65 0.9696682
4 extratrees 0.9768130 0.70 0.9809175
5 gini 0.9047122 0.65 0.9691327
5 extratrees 0.9797253 0.80 0.9800472
6 gini 0.9312846 0.65 0.9685299
6 extratrees 0.9761082 0.80 0.9793760
7 gini 0.8846495 0.65 0.9686638
7 extratrees 0.9808463 0.70 0.9841308
Tuning parameter 'min.node.size' was held constant at a value of 1
Sens was used to select the optimal model using the largest value.
The final values used for the model were mtry = 5, splitrule = extratrees and min.node.size = 1.
# Balanced accuracy per cross-validation fold, computed from the predictions
# stored on the fitted model, then summarised as mean and standard deviation
# across folds.
summarise(
  yardstick::bal_accuracy(group_by(rfFit$pred, Resample), obs, pred),
  bal_acc = mean(.estimate),
  bal_acc_sd = sd(.estimate)
)

# Predict on the held-out test set and compare against the true labels.
preds <- predict(rfFit, testset)
confusionMatrix(
  data = as.factor(preds),
  reference = testset$label,
  mode = "everything"
)
Confusion Matrix and Statistics
Reference
Prediction botnet normal
botnet 4 93
normal 1 3640
Accuracy : 0.9749
95% CI : (0.9693, 0.9796)
No Information Rate : 0.9987
P-Value [Acc > NIR] : 1
Kappa : 0.0761
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.800000
Specificity : 0.975087
Pos Pred Value : 0.041237
Neg Pred Value : 0.999725
Precision : 0.041237
Recall : 0.800000
F1 : 0.078431
Prevalence : 0.001338
Detection Rate : 0.001070
Detection Prevalence : 0.025950
Balanced Accuracy : 0.887544
'Positive' Class : botnet