library(tidyverse)
library(caret)
library(doMC)
library(pROC)
library(plotROC)
# Register a multicore foreach backend with 6 worker processes.
registerDoMC(cores = 6)

# Per-column summary statistics of `clients_profiles` (created outside this
# section).
summary(clients_profiles)
client tot_requests tot_detected ratio_detected tot_nx ratio_nx tot_mx ratio_mx tot_fail ratio_fail
Length:317029 Min. : 1.0 Min. : 0.000 Min. :0.00000 Min. : 0.000 Min. :0.00000 Min. : 0.000 Min. :0.0000000 Min. : 0.00 Min. :0.000000
Class :character 1st Qu.: 2.0 1st Qu.: 0.000 1st Qu.:0.00000 1st Qu.: 0.000 1st Qu.:0.00000 1st Qu.: 0.000 1st Qu.:0.0000000 1st Qu.: 0.00 1st Qu.:0.000000
Mode :character Median : 7.0 Median : 0.000 Median :0.00000 Median : 0.000 Median :0.00000 Median : 0.000 Median :0.0000000 Median : 0.00 Median :0.000000
Mean : 59.5 Mean : 2.287 Mean :0.02771 Mean : 2.607 Mean :0.01785 Mean : 0.555 Mean :0.0003607 Mean : 3.31 Mean :0.008496
3rd Qu.: 21.0 3rd Qu.: 0.000 3rd Qu.:0.00000 3rd Qu.: 0.000 3rd Qu.:0.00000 3rd Qu.: 0.000 3rd Qu.:0.0000000 3rd Qu.: 0.00 3rd Qu.:0.000000
Max. :69478.0 Max. :3778.000 Max. :1.00000 Max. :6610.000 Max. :1.00000 Max. :13025.000 Max. :1.0000000 Max. :37697.00 Max. :1.000000
tot_reverse ratio_reverse max_sameip tot_sameip avg_sameip sd_sameip tot_samedomain max_samedomain avg_samedomain sd_samedomain
Min. : 0.000 Min. :0.000000 Min. : 0.000 Min. : 0.00 Min. : 0.0000 Min. : 0.0000 Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.0000
1st Qu.: 0.000 1st Qu.:0.000000 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.0000
Median : 0.000 Median :0.000000 Median : 0.000 Median : 0.00 Median : 0.0000 Median : 0.0000 Median : 2.00 Median : 2.000 Median : 2.000 Median : 0.0000
Mean : 1.479 Mean :0.007915 Mean : 1.821 Mean : 12.27 Mean : 0.7488 Mean : 0.3274 Mean : 23.09 Mean : 3.837 Mean : 2.511 Mean : 0.8280
3rd Qu.: 0.000 3rd Qu.:0.000000 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 9.00 3rd Qu.: 6.000 3rd Qu.: 4.000 3rd Qu.: 0.7071
Max. :7208.000 Max. :1.000000 Max. :7717.000 Max. :26646.00 Max. :823.0000 Max. :1208.1432 Max. :35662.00 Max. :736.000 Max. :290.000 Max. :241.1234
ratio_tot_samedomain ratio_max_samedomain ratio_avg_samedomain ratio_sd_samedomain ratio_tot_sameip ratio_max_sameip ratio_avg_sameip ratio_sd_sameip profilenum
Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. : 0.00
1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.: 43.00
Median :0.2308 Median :0.08511 Median :0.0513 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.000000 Median : 95.00
Mean :0.3378 Mean :0.23709 Mean :0.2114 Mean :0.02446 Mean :0.05056 Mean :0.02769 Mean :0.02517 Mean :0.002077 Mean : 93.57
3rd Qu.:0.6667 3rd Qu.:0.38462 3rd Qu.:0.3030 3rd Qu.:0.01003 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:144.00
Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :0.66451 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :0.681006 Max. :192.00
currenttm
Min. :2017-07-31 20:18:07
1st Qu.:2017-08-02 20:18:07
Median :2017-08-04 21:18:07
Mean :2017-08-04 22:43:53
3rd Qu.:2017-08-07 01:18:07
Max. :2017-08-08 20:18:07
Reduce the number of false positives generated by the NN classifier by using client profiles.
A profile is considered a false positive if it contains more than one detection from the NN classifier and is labeled as normal.
# Keep only labeled profiles and derive two flags:
#   FP     - "yes" when a profile labeled exactly "Normal" has more than one
#            NN detection, i.e. it is a classifier false positive. The `> 1`
#            threshold agrees with the `tot_detected > 1` filters applied to
#            the summary and to the training set further down.
#   normal - "yes" for every label that contains "Normal" (substring match,
#            so it is broader than the exact match used for FP).
profiles_labeled <- profiles_fixed %>%
  filter(Label != "") %>%
  mutate(
    FP = ifelse(Label == "Normal" & tot_detected > 1, "yes", "no"),
    normal = ifelse(grepl("Normal", Label), "yes", "no")
  )

# Number of labeled profiles for each value of the false-positive flag.
profiles_labeled %>%
  count(FP)

# Label breakdown of the profiles that have more than one detection.
profiles_labeled %>%
  filter(tot_detected > 1) %>%
  count(normal, Label)
# Resampling configuration used for tuning: 5-fold cross-validation scored
# with twoClassSummary (ROC, Sens, Spec), which requires class probabilities.
# The former `repeats = 2` argument was removed: it only applies to
# method = "repeatedcv" and was ignored by plain "cv" (the fit reports
# "Cross-Validated (5 fold)"), so the resampling scheme is unchanged.
ctrl_fast <- trainControl(
  method = "cv",
  number = 5,
  summaryFunction = twoClassSummary,
  verboseIter = TRUE,
  classProbs = TRUE,
  allowParallel = TRUE
)
# Fix the RNG seed so the train/test partition below is reproducible.
set.seed(100121)

# Model only the profiles with more than one detection; FP (as a factor) is
# the outcome to predict.
trainset <- profiles_labeled %>%
  filter(tot_detected > 1) %>%
  mutate(FP = as.factor(FP))

# 70/30 split on the outcome; the held-out rows become the test set.
trainIndex <- createDataPartition(trainset$FP, p = 0.70, list = FALSE)
data_train <- trainset[trainIndex, ]
data_test <- trainset[-trainIndex, ]

# FP modelled from the request volume and the ratio-based profile features.
train_formula <- formula(
  FP ~ tot_requests + ratio_nx + ratio_mx + ratio_reverse + ratio_fail +
    ratio_tot_samedomain + ratio_tot_sameip + ratio_avg_samedomain +
    ratio_avg_sameip
)

# Class balance of the training partition.
data_train %>%
  group_by(FP) %>%
  summarise(total = n())

# Class balance of the test partition.
data_test %>%
  group_by(FP) %>%
  summarise(total = n())
# Tune a random forest on the training partition, selecting the model by
# cross-validated ROC under the `ctrl_fast` resampling setup.
rfFit <- train(
  train_formula,
  data = data_train,
  method = "rf", # alternative tried: "svmRadialWeights" (radial kernel)
  tuneLength = 3, # three candidate mtry values
  metric = "ROC",
  trControl = ctrl_fast
  # Disabled options kept for reference:
  #   tuneGrid = svmGrid
  #   preProcess = c("scale", "center")
  #   weights = model_weights
)
+ Fold1: mtry=2
- Fold1: mtry=2
+ Fold1: mtry=5
- Fold1: mtry=5
+ Fold1: mtry=9
- Fold1: mtry=9
+ Fold2: mtry=2
- Fold2: mtry=2
+ Fold2: mtry=5
- Fold2: mtry=5
+ Fold2: mtry=9
- Fold2: mtry=9
+ Fold3: mtry=2
- Fold3: mtry=2
+ Fold3: mtry=5
- Fold3: mtry=5
+ Fold3: mtry=9
- Fold3: mtry=9
+ Fold4: mtry=2
- Fold4: mtry=2
+ Fold4: mtry=5
- Fold4: mtry=5
+ Fold4: mtry=9
- Fold4: mtry=9
+ Fold5: mtry=2
- Fold5: mtry=2
+ Fold5: mtry=5
- Fold5: mtry=5
+ Fold5: mtry=9
- Fold5: mtry=9
Aggregating results
Selecting tuning parameters
Fitting mtry = 2 on full training set
# Show the resampling results and the selected tuning parameter.
print(rfFit)
Random Forest
158 samples
9 predictors
2 classes: 'no', 'yes'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 126, 126, 127, 127, 126
Resampling results across tuning parameters:
mtry ROC Sens Spec
2 0.9370915 0.8285714 0.9535948
5 0.9351074 0.8285714 0.9313725
9 0.9315593 0.8285714 0.8967320
ROC was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
# Class probabilities for the held-out test set (one column per class:
# "no" and "yes").
predsrfprobsamp <- predict(rfFit, data_test, type = "prob")

# Threshold P(yes) at 0.5. The result is converted to a factor with the same
# levels as the reference because confusionMatrix() requires both inputs to
# be factors with identical levels; a plain character vector is rejected by
# current caret versions.
predsrfsamp <- factor(
  ifelse(predsrfprobsamp$yes >= 0.5, "yes", "no"),
  levels = levels(data_test$FP)
)

# Test-set performance, treating "yes" (false positive) as the positive class.
confusionMatrix(predsrfsamp, data_test$FP, positive = "yes")
Confusion Matrix and Statistics
Reference
Prediction no yes
no 26 1
yes 4 36
Accuracy : 0.9254
95% CI : (0.8344, 0.9753)
No Information Rate : 0.5522
P-Value [Acc > NIR] : 1.976e-11
Kappa : 0.8477
Mcnemar's Test P-Value : 0.3711
Sensitivity : 0.9730
Specificity : 0.8667
Pos Pred Value : 0.9000
Neg Pred Value : 0.9630
Prevalence : 0.5522
Detection Rate : 0.5373
Detection Prevalence : 0.5970
Balanced Accuracy : 0.9198
'Positive' Class : yes
#plot(roc(data_test$FP,predsrfprobsamp$yes))
# ROC curve of P(yes) on the test set. The outcome is coded explicitly as
# 1 = "yes", 0 = "no": plotROC expects a 0/1 `d` aesthetic and otherwise
# guesses the coding from the sorted labels and emits a warning.
ggplot(
  cbind(predsrfprobsamp, class = data_test$FP),
  aes(m = yes, d = as.integer(class == "yes"))
) +
  geom_roc(hjust = -0.4, vjust = 1.5, colour = "orange") +
  theme_bw()

# Unscaled variable importance of the tuned random forest.
varImp(rfFit, scale = FALSE)
rf variable importance
Overall
tot_requests 15.536
ratio_avg_samedomain 15.058
ratio_nx 9.774
ratio_fail 9.633
ratio_reverse 8.435
ratio_tot_samedomain 5.089
ratio_tot_sameip 4.853
ratio_avg_sameip 3.921
ratio_mx 2.098
# Attach the thresholded predictions to the test rows for error analysis.
data_test_predic <- cbind(data_test, FPpreds = predsrfsamp)

# True false positives the model missed vs. the ones it caught.
fp_incorrect <- data_test_predic %>% filter(FP == "yes" & FPpreds == "no")
fp_correct <- data_test_predic %>% filter(FP == "yes" & FPpreds == "yes")

# Compare the two most important features between caught and missed cases.
histogram(~ratio_avg_samedomain, data = fp_correct, main = "Distribution of Avg_samedomain for False positives correctly detected")
histogram(~ratio_avg_samedomain, data = fp_incorrect, main = "Distribution of Avg_samedomain for False positives incorrectly detected")
histogram(~tot_requests, data = fp_correct, main = "Distribution of Total Requests for False positives correctly detected")
histogram(~tot_requests, data = fp_incorrect, main = "Distribution of Total Requests for False positives incorrectly detected")

# Profiles that are not false positives but were predicted as such.
data_test_predic %>% filter(FP == "no" & FPpreds == "yes")

# Persist the test partition. NOTE(review): the destination is a
# machine-specific absolute path.
write_csv(data_test, "/home/harpo/Dropbox/profiles_testset.csv")

# Final model: refit on the full labeled set with the mtry chosen by tuning.
# The call is namespace-qualified because randomForest is never attached
# with library() in this file.
profilemodel <- randomForest::randomForest(train_formula, data = trainset, mtry = 2)
profilepredictions <- predict(profilemodel, trainset)

# Predictions go first (`data`) and the true labels second (`reference`);
# the previous call had them swapped, which exchanges sensitivity/PPV and
# specificity/NPV. This evaluates the model on the same rows it was trained
# on, so the figures are optimistic compared with the held-out results.
confusionMatrix(profilepredictions, trainset$FP, positive = "yes")
Confusion Matrix and Statistics
Reference
Prediction no yes
no 98 2
yes 0 125
Accuracy : 0.9911
95% CI : (0.9683, 0.9989)
No Information Rate : 0.5644
P-Value [Acc > NIR] : <2e-16
Kappa : 0.982
Mcnemar's Test P-Value : 0.4795
Sensitivity : 0.9843
Specificity : 1.0000
Pos Pred Value : 1.0000
Neg Pred Value : 0.9800
Prevalence : 0.5644
Detection Rate : 0.5556
Detection Prevalence : 0.5556
Balanced Accuracy : 0.9921
'Positive' Class : yes
# Serialize the final random forest so it can be loaded elsewhere.
save(
  profilemodel,
  file = "~/Dropbox/ongoing-work/git-repos/dga-wb/profile-classificator/model.rda"
)

# Show the current working directory.
getwd()
[1] "/home/harpo/Dropbox/ongoing-work/git-repos/dga-algos/reports"