The datasets are part of the StratosphereIPS project (http://www.stratosphereips.org). The format of the datasets is explained at https://stratosphereips.org/stratosphere-ips-generation-of-the-behavioral-models.html.
Three datasets are used (available on GitHub):
# Load the characteristic-vector files for datasets 22, 45 and 87 into a single
# data frame (`datasetfull`), then read the raw file for dataset 22.

# NOTE(review): hard-coded absolute path. It is kept so the relative "./data"
# paths below resolve exactly as before; prefer running the script from the
# repository root instead of calling setwd().
setwd("/home/harpo/Dropbox/ongoing-work/git-repos/labeling-datasets/")

# Column names applied to every characteristic-vector file once the dataset id
# has been prepended as the first column.
feature_names <- c("id", "sp", "wp", "wnp", "snp", "ds", "dm", "dl",
                   "ss", "sm", "sl", "length", "class", "subclass")

# Read the characteristic-vector file of one dataset.
#
# id: numeric dataset identifier; it selects the file
#     "./data/characteristic_vector_<id>_all.txt" and is stored in column `id`.
# Returns a data frame whose columns are named after `feature_names`.
load_characteristic_vectors <- function(id) {
  path <- paste0("./data/characteristic_vector_", id, "_all.txt")
  vectors <- read.csv(file = path, header = FALSE)
  vectors <- cbind(id = id, vectors)
  names(vectors) <- feature_names
  vectors
}

# Read each file once and stack the results in dataset order.
capture_ids <- c(22, 45, 87)
captures <- lapply(capture_ids, load_characteristic_vectors)
datasetfull <- do.call(rbind, captures)

# `dataset` used to end up holding the last file read (dataset 87); keep that
# binding in case code outside this section still refers to it.
dataset <- captures[[length(captures)]]

# Labels and the dataset id are categorical.
datasetfull$class <- factor(datasetfull$class)
datasetfull$subclass <- factor(datasetfull$subclass)
datasetfull$id <- factor(datasetfull$id)

# Raw, pipe-separated file for dataset 22; strings are kept as character.
dataset_raw <- read.csv(file = "./data/dataset22_raw", header = FALSE,
                        sep = "|", stringsAsFactors = FALSE)
For a given Stratosphere connection \(N\), we create a feature vector \(V=(X_1,X_2,\dots,X_{10})\), where each \(X_i\) holds the value computed for the corresponding feature in \(F=(SP,WP,WNP,SNP,DS,DM,DL,SS,SL,SM)\). Each component is computed as \(X_i=\frac{\sum(\text{letters}\in F_i) \cdot 100}{|N|}/100\), i.e. the number of letters of the connection that belong to \(F_i\) divided by the length of the connection. The following Python code is used to generate the features.
In summary, a Stratosphere connection like:
** 66.f.W.Z.Z.Z.Z.W.W*W.W.W.w.Z.z.W., From-Normal-TCP-HTTP-WebPage-13 **
is represented as:
22, 0.300885, 0.1415929, 0.0353982, 0.0088496, 0, 0, 0.5044248, 0, 0.2477876, 0.2566372, 113, Normal, TCP-HTTP-WebPage
# Scatterplot matrix of columns 2 to 12 of `datasetfull`, coloured by class.
# Each diagonal cell overlays a density curve of its variable.
splom(
  ~ datasetfull[, 2:12],
  data = datasetfull,
  groups = datasetfull$class,
  diag.panel = function(x, ...) {
    # Rescale the density so its peak reaches 95% of the panel's y-range,
    # starting from the lower y-limit.
    y_limits <- current.panel.limits()$ylim
    dens <- density(x, na.rm = TRUE)
    dens$y <- y_limits[1] + 0.95 * diff(y_limits) * dens$y / max(dens$y)
    panel.lines(dens)
    diag.panel.splom(x, ...)
  },
  cex = 0.5,
  alpha = 0.5,
  xlab = "",
  ylab = "",
  auto.key = TRUE,
  cex.labels = 0.1,
  pscales = 0,
  varname.cex = 0.8
)
# Class distribution, one panel per dataset id.
histogram(~ class | id, data = datasetfull)

# Subclass distribution per dataset id, one page per dataset.
histogram(
  ~ subclass | id,
  data = datasetfull,
  groups = class,
  col = c("skyblue", "pink"),
  scales = list(x = list(rot = 90), cex = 0.7),
  layout = c(1, 1),
  auto.key = FALSE
)

# Density of `length` per dataset id, grouped by class, on a log10 x-axis.
densityplot(
  ~ length | id,
  data = datasetfull,
  groups = class,
  auto.key = TRUE,
  scales = list(
    x = list(rot = 45, relation = "free", log = 10),
    y = list(relation = "free")
  ),
  layout = c(1, 1),
  breaks = 5
)

# `length` by subclass per dataset id, grouped by class.
barchart(
  length ~ factor(subclass) | id,
  groups = class,
  data = datasetfull,
  auto.key = TRUE,
  scales = list(
    x = list(rot = 90),
    y = list(relation = "free"),
    cex = 0.70
  ),
  layout = c(1, 1)
)
# Build the training set from dataset 22, keeping only rows with length > 5.
# Disabled alternative that applied the length filter to every dataset:
# datasetfull <- datasetfull %>% filter(length > 5)
train <- datasetfull %>%
  filter(id == 22, length > 5)

# Up-sample using the subclass labels as the sampling outcome; upSample appends
# that outcome as an extra last column (requested name: "class").
train <- upSample(x = train, y = train$subclass, yname = "class")

# Drop by position: dataset id (1), subclass (14) and the column appended by
# upSample (15). The original `class` column (13) stays as the model outcome.
train <- train[, -c(1, 14, 15)]
# Fast resampling setup: 2-fold cross-validation scored with two-class summary
# statistics, which requires class probabilities.
# `repeats` is intentionally not set: it only applies to repeated
# cross-validation, and for method = "cv" the default already equals 1.
ctrl_fast <- trainControl(
  method = "cv",
  number = 2,
  summaryFunction = twoClassSummary,
  verboseIter = TRUE,
  classProbs = TRUE,
  allowParallel = TRUE
)
# Random forest fitted through caret: `class` is modelled on all remaining
# columns of `train`, and tuning is selected on the ROC metric using the
# `ctrl_fast` resampling settings.
rfFit <- train(
  class ~ .,
  data = train,
  method = "rf",
  metric = "ROC",
  trControl = ctrl_fast
)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
# Print the fitted caret model (resampling results per tuning value).
rfFit
## Random Forest
##
## 27897 samples
## 11 predictors
## 2 classes: 'Botnet', 'Normal'
##
## No pre-processing
## Resampling: Cross-Validated (2 fold)
## Summary of sample sizes: 13949, 13948
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.9998221 0.9999374 0.9969834
## 6 0.9995034 0.9995615 0.9976538
## 11 0.9993167 0.9991230 0.9983241
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Print the final model stored inside the caret fit.
rfFit$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0.09%
## Confusion matrix:
## Botnet Normal class.error
## Botnet 15962 1 6.264487e-05
## Normal 23 11911 1.927267e-03
# Evaluate the random forest on dataset 45.
test45 <- datasetfull %>% filter(id == 45)
# Drop by position: dataset id (1) and subclass (14); `class` stays as the
# reference label.
test45 <- test45[, -c(1, 14)]

# Class probabilities, then a hard label using a 0.5 threshold on P(Botnet).
predsrfprobs <- predict(rfFit, test45, type = "prob")
predsrf <- ifelse(predsrfprobs$Botnet > 0.5, "Botnet", "Normal")

# confusionMatrix() expects factors sharing the same levels, so the character
# predictions are coerced with the reference levels. `predsrf` itself is left
# as a character vector for the code that reuses it afterwards.
confusionMatrix(
  factor(predsrf, levels = levels(test45$class)),
  test45$class
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Botnet Normal
## Botnet 1589 18
## Normal 96 110
##
## Accuracy : 0.9371
## 95% CI : (0.9249, 0.9479)
## No Information Rate : 0.9294
## P-Value [Acc > NIR] : 0.1065
##
## Kappa : 0.6261
## Mcnemar's Test P-Value : 5.525e-13
##
## Sensitivity : 0.9430
## Specificity : 0.8594
## Pos Pred Value : 0.9888
## Neg Pred Value : 0.5340
## Prevalence : 0.9294
## Detection Rate : 0.8764
## Detection Prevalence : 0.8864
## Balanced Accuracy : 0.9012
##
## 'Positive' Class : Botnet
##
# Attach the predicted labels and probabilities to the true subclass/class of
# every dataset-45 row.
subclasses <- datasetfull %>%
  filter(id == 45) %>%
  select(subclass, class) %>%
  cbind(predsrf, predsrfprobs)

# False positives: Normal rows predicted as Botnet.
# (optional narrowing, disabled: %>% select(subclass, Botnet, Normal))
fp <- subclasses %>%
  filter(class == "Normal", predsrf == "Botnet")
histogram(
  ~ subclass | predsrf,
  data = fp,
  type = "count",
  scales = list(x = list(rot = 45))
)

# False negatives: Botnet rows predicted as Normal.
# (optional narrowing, disabled: %>% select(subclass, Botnet, Normal))
fn <- subclasses %>%
  filter(class == "Botnet", predsrf == "Normal")
histogram(
  ~ subclass | predsrf,
  data = fn,
  type = "count",
  col = "orange",
  scales = list(x = list(rot = 45))
)
# Evaluate the random forest on dataset 87.
test87 <- datasetfull %>% filter(id == 87)
# Drop by position: dataset id (1) and subclass (14); `class` stays as the
# reference label.
test87 <- test87[, -c(1, 14)]

# Class probabilities and their distribution for each class.
predsrfprobs <- predict(rfFit, test87, type = "prob")
bwplot(~ Botnet, data = predsrfprobs)
bwplot(~ Normal, data = predsrfprobs)

# Hard label using a 0.5 threshold on P(Botnet).
predsrf <- ifelse(predsrfprobs$Botnet > 0.5, "Botnet", "Normal")

# confusionMatrix() expects factors sharing the same levels, so the character
# predictions are coerced with the reference levels. `predsrf` itself is left
# as a character vector for the code that reuses it afterwards.
print(caret::confusionMatrix(
  factor(predsrf, levels = levels(test87$class)),
  test87$class
))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Botnet Normal
## Botnet 175 57
## Normal 18 258
##
## Accuracy : 0.8524
## 95% CI : (0.8185, 0.8821)
## No Information Rate : 0.6201
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6985
## Mcnemar's Test P-Value : 1.145e-05
##
## Sensitivity : 0.9067
## Specificity : 0.8190
## Pos Pred Value : 0.7543
## Neg Pred Value : 0.9348
## Prevalence : 0.3799
## Detection Rate : 0.3445
## Detection Prevalence : 0.4567
## Balanced Accuracy : 0.8629
##
## 'Positive' Class : Botnet
##
# Attach the predicted labels and probabilities to the true subclass/class of
# every dataset-87 row.
subclasses <- datasetfull %>%
  filter(id == 87) %>%
  select(subclass, class) %>%
  cbind(predsrf, predsrfprobs)

# False positives: Normal rows predicted as Botnet.
# (optional narrowing, disabled: %>% select(subclass, Botnet, Normal))
fp <- subclasses %>%
  filter(class == "Normal", predsrf == "Botnet")
histogram(
  ~ subclass | predsrf,
  data = fp,
  type = "count",
  scales = list(x = list(rot = 45))
)

# False negatives: Botnet rows predicted as Normal.
# (optional narrowing, disabled: %>% select(subclass, Botnet, Normal))
fn <- subclasses %>%
  filter(class == "Botnet", predsrf == "Normal")
histogram(
  ~ subclass | predsrf,
  data = fn,
  type = "count",
  col = "orange",
  scales = list(x = list(rot = 45))
)