CTU13 is a labeled dataset provided by the MCFP (Malware Capture Facility Project).
The pcap files were processed using the Stratosphere behavioral models.
library(dplyr)
library(ggplot2)
library(lattice)
library(plotly)
# Download the labeled CTU-13 file from Dropbox and parse it as
# comma-separated values.
ctu13 <- readr::read_csv(
  file = "https://www.dropbox.com/s/4eabsw82z9d4u5q/ctu-13.labeled?dl=1"
)
A new field is added to simplify filtering by the class [normal, botnet].
# Derive a two-level `class` column: labels containing "Normal" become
# "normal", every other label becomes "botnet".
ctu13 <- ctu13 %>%
  mutate(class = ifelse(grepl("Normal", label), "normal", "botnet"))

# Row count per class (auto-printed at top level).
ctu13 %>%
  group_by(class) %>%
  summarise(total = n())
## # A tibble: 2 × 2
## class total
## <chr> <int>
## 1 botnet 65987
## 2 normal 5412
Some cleaning in the labels is performed for a better aggregation
# Normalise the labels so that variants of the same traffic type aggregate
# together, then record the length of each State string as `modelsize`.
# The assignments inside mutate() are evaluated in order, so every gsub()
# sees the label produced by the previous step.
ctu13 <- ctu13 %>%
  mutate(
    # drop the direction prefixes
    label = gsub("From-|To-", "", label),
    # drop a trailing numeric suffix
    label = gsub("-[0-9]+$", "", label),
    # drop version tags such as -V42 or -V42-1
    label = gsub("-V[0-9]+(-[0-9]+)?", "", label),
    # drop a trailing numeric suffix again, now that version tags are gone
    label = gsub("-[0-9]+$", "", label),
    # fold bare TCP attempts into the SPAM attempt label
    label = gsub("Botnet-TCP-Attempt$", "Botnet-TCP-Attempt-SPAM", label),
    # collapse numbered CC identifiers into a single CC tag
    label = gsub("CC[0-9]+-", "CC-", label),
    # number of characters in the State string
    modelsize = nchar(State)
  )
library(scales)
# Lattice histogram of `modelsize`, one panel per class, with the x axis
# limited to 0-100.
histogram(
  ~ modelsize | class,
  data = ctu13,
  breaks = 10000,
  groups = class,
  xlim = c(0, 100),
  main = "Model Size Freq. Distribution (modelsize <100)",
  as.table = TRUE
)
As observed, models with very few states make up a significant part of the dataset. Since the Stratosphere behavioral model needs at least 4 flows to calculate the letter value correctly, all the models with fewer than 5 letters should be removed, since they won't provide any useful information in terms of behavioral analysis.
If we replot after removing those models, the frequency distribution is:
# Keep only the rows whose State string is longer than 4 characters.
ctu13 <- ctu13 %>%
  filter(modelsize > 4)

# Re-plot the per-class model size distribution on the filtered data.
# lattice takes the bar fill colour through `col`; an argument named
# `color` is not recognised by panel.histogram and has no effect.
histogram(
  ~ modelsize | class,
  data = ctu13,
  breaks = 10000,
  xlim = c(0, 100),
  main = "Model Size Freq. Distribution (modelsize <100) for",
  col = "red"
)
As can be seen, the number of connections with few states remains larger in the case of Botnet connections. This situation should be analyzed since it could be a strong bias for some algorithms.
After removing those episodes with less than 5 flows:
# Print the number of remaining rows per class.
ctu13 %>%
  group_by(class) %>%
  summarise(total = n()) %>%
  print()
## # A tibble: 2 × 2
## class total
## <chr> <int>
## 1 botnet 6274
## 2 normal 2790
After removing those episodes with fewer than 5 flows, the number of episodes and the average model size per label are:
# Per-label summary: row count and mean model size, tagged with the
# normal/botnet class and sorted from most to least frequent label.
ctu13_labels <- ctu13 %>%
  group_by(label) %>%
  summarise(
    total = n(),
    avg_modelsize = mean(modelsize)
  ) %>%
  mutate(class = ifelse(grepl("Normal", label), "normal", "botnet")) %>%
  arrange(desc(total))

print(ctu13_labels)
## # A tibble: 41 × 4
## label total avg_modelsize class
## <chr> <int> <dbl> <chr>
## 1 Botnet-TCP-Attempt-SPAM 4338 40.39949 botnet
## 2 Normal-Stribrek 1009 403.97027 normal
## 3 Normal-Jist 913 65.80832 normal
## 4 Normal-Grill 768 71.85807 normal
## 5 Botnet-TCP-Established-HTTP-Ad 487 38.42710 botnet
## 6 Botnet-TCP-WEB-Established 378 16.71429 botnet
## 7 Botnet-UDP-DNS 312 148.11859 botnet
## 8 Botnet-TCP-WEB-Established-SSL 210 18.54286 botnet
## 9 Botnet-UDP-Attempt-DNS 74 507.24324 botnet
## 10 Botnet-TCP-CC-HTTP-Not-Encrypted 71 84.46479 botnet
## # ... with 31 more rows
#ctu13_labels=ctu13_labels %>% filter(label!="Botnet-TCP-Attempt" & label!="Botnet-TCP-Attempt-SPAM")
# Bubble chart of the number of episodes per label: point size follows the
# episode count and colour follows the class. Rendered interactively with
# plotly.
size <- ggplot(ctu13_labels, aes(x = label, y = total)) +
  # alpha is a fixed setting, not data, so it is set outside aes(); this
  # gives exactly 0.5 opacity and creates no alpha legend to suppress.
  geom_jitter(aes(size = total, colour = class), alpha = 0.5) +
  # "none" is the supported way to hide a legend (FALSE is deprecated).
  guides(colour = "none", size = "none") +
  scale_size_continuous(range = c(2.5, 20)) +
  ylab("Total Number of Episodes") +
  xlab("Label") +
  ggtitle("Number of Episodes per Label Type") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(size)
# Bubble chart of the number of episodes against the average model size
# per label: point size follows the episode count and colour follows the
# class. The `text` aesthetic carries the label so plotly can show it in
# the hover tooltip.
ep <- ggplot(ctu13_labels, aes(x = avg_modelsize, y = total)) +
  # alpha is a fixed setting, not data, so it is set outside aes(); this
  # gives exactly 0.5 opacity and creates no alpha legend to suppress.
  geom_jitter(aes(size = total, colour = class, text = label), alpha = 0.5) +
  # "none" is the supported way to hide a legend (FALSE is deprecated).
  guides(colour = "none", size = "none") +
  scale_size_continuous(range = c(2.5, 20)) +
  ylab("Total Number of Episodes") +
  xlab("Model Size [AVG]") +
  theme_classic() +
  ggtitle("Number of Episodes per Avg Model size ") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(ep)
# Write the cleaned dataset to disk as CSV.
ctu13 %>%
  readr::write_csv(
    "/home/harpo/Dropbox/ongoing-work/git-repos/stratosphere-deep/datasets/ctu-13/ctu-13.labeled.cleaned"
  )
The final cleaned version of the dataset can be found here