CTU-13 is a labeled dataset provided by the MCFP (Malware Capture Facility Project).

The pcap files were processed using the Stratosphere behavioral models.

library(dplyr)
library(ggplot2)
library(lattice)
library(plotly)
# Load the labeled CTU-13 table directly from the shared Dropbox link.
ctu13 <- readr::read_csv(
  file = "https://www.dropbox.com/s/4eabsw82z9d4u5q/ctu-13.labeled?dl=1"
)

A new field is added to simplify filtering by the class [normal, botnet]

# Tag each row "normal" when its label contains "Normal", otherwise "botnet".
ctu13 <- mutate(
  ctu13,
  class = ifelse(grepl("Normal", label), "normal", "botnet")
)

Original total number of episodes per class (normal, botnet)

# Number of rows per class; the result is auto-printed at top level.
summarise(group_by(ctu13, class), total = n())
## # A tibble: 2 × 2
##    class total
##    <chr> <int>
## 1 botnet 65987
## 2 normal  5412

Some cleaning in the labels is performed for a better aggregation

# Normalise the labels so that variants of the same traffic type aggregate
# together, then record the length of each behavioral model string.
# The substitutions run in order inside one mutate(); each step sees the
# result of the previous one.
ctu13 <- ctu13 %>%
  mutate(
    # drop the direction prefixes
    label = gsub("From-|To-", "", label),
    # drop a trailing numeric suffix
    label = gsub("-[0-9]+$", "", label),
    # drop version tags such as -V42 or -V42-1
    label = gsub("-V[0-9]+(-[0-9]+)?", "", label),
    # a numeric suffix may be exposed again once the version tag is gone
    label = gsub("-[0-9]+$", "", label),
    # fold the bare TCP attempt label into the SPAM attempt label
    label = gsub("Botnet-TCP-Attempt$", "Botnet-TCP-Attempt-SPAM", label),
    # collapse numbered C&C channels into a single CC tag
    label = gsub("CC[0-9]+-", "CC-", label),
    # model size = number of characters in the State string
    modelsize = nchar(State)
  )
library(scales)
# One histogram panel per class of the model sizes, with the x axis
# restricted to the 0-100 range.
histogram(
  ~ modelsize | class,
  data = ctu13,
  breaks = 10000,
  groups = class,
  xlim = c(0, 100),
  main = "Model Size Freq. Distribution (modelsize <100)",
  as.table = TRUE
)

As observed, models with very few states make up a significant part of the dataset. Since the Stratosphere behavioral model needs at least 4 flows to calculate the letter value correctly, all the models with fewer than 5 letters should be removed, since they won't provide any useful information in terms of behavioral analysis.

If we replot after removing those models, the frequency distribution is:

# Keep only the rows whose model string is longer than 4 characters.
ctu13 <- ctu13 %>% filter(modelsize > 4)

# One histogram panel per class of the remaining model sizes, x axis
# restricted to 0-100.
# The bar fill is passed as `col`: lattice's panel.histogram() has no `color`
# argument, so the previous `color='red'` was silently ignored.
# The title no longer ends in a dangling "for" and now states the range
# actually shown after the filter.
histogram(
  ~ modelsize | class,
  data = ctu13,
  breaks = 10000,
  xlim = c(0, 100),
  main = "Model Size Freq. Distribution (4 < modelsize < 100)",
  col = "red"
)

As can be seen, the number of connections with few states remains larger in the case of Botnet connections. This situation should be analyzed, since it could introduce a strong bias for some algorithms.

Total number of episodes per class (normal, botnet)

After removing those episodes with less than 5 flows:

# Print the number of rows per class for the current contents of ctu13.
print(summarise(group_by(ctu13, class), total = n()))
## # A tibble: 2 × 2
##    class total
##    <chr> <int>
## 1 botnet  6274
## 2 normal  2790

Total Number of episodes per label type

After removing those episodes with less than 5 flows:

# Per-label summary: episode count and mean model size, tagged with the
# normal/botnet class derived from the label text and sorted so the most
# frequent labels come first.
ctu13_labels <- ctu13 %>%
  group_by(label) %>%
  summarise(total = n(), avg_modelsize = mean(modelsize)) %>%
  mutate(class = ifelse(grepl("Normal", label), "normal", "botnet")) %>%
  arrange(desc(total))
print(ctu13_labels)
## # A tibble: 41 × 4
##                               label total avg_modelsize  class
##                               <chr> <int>         <dbl>  <chr>
## 1           Botnet-TCP-Attempt-SPAM  4338      40.39949 botnet
## 2                   Normal-Stribrek  1009     403.97027 normal
## 3                       Normal-Jist   913      65.80832 normal
## 4                      Normal-Grill   768      71.85807 normal
## 5    Botnet-TCP-Established-HTTP-Ad   487      38.42710 botnet
## 6        Botnet-TCP-WEB-Established   378      16.71429 botnet
## 7                    Botnet-UDP-DNS   312     148.11859 botnet
## 8    Botnet-TCP-WEB-Established-SSL   210      18.54286 botnet
## 9            Botnet-UDP-Attempt-DNS    74     507.24324 botnet
## 10 Botnet-TCP-CC-HTTP-Not-Encrypted    71      84.46479 botnet
## # ... with 31 more rows
#ctu13_labels=ctu13_labels %>% filter(label!="Botnet-TCP-Attempt"  & label!="Botnet-TCP-Attempt-SPAM")
# Bubble chart of the number of episodes per label: point size follows the
# episode count and point colour follows the class. Rendered interactively
# through plotly.
size <- ggplot(ctu13_labels, aes(x = label, y = total)) +
  # alpha is a fixed value, so it is set outside aes(); inside aes() it was
  # treated as a mapped variable and run through the alpha scale.
  geom_jitter(aes(size = total, colour = class), alpha = 0.5) +
  # "none" is the supported way to hide a legend; FALSE is deprecated.
  guides(colour = "none", size = "none") +
  scale_size_continuous(range = c(2.5, 20)) +
  ylab("Total Number of Episodes") +
  xlab("Label") +
  ggtitle("Number of Episodes per Label Type") +
  theme_classic() +
  # tilt the label names so they do not overlap on the x axis
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(size)
# Bubble chart of the number of episodes against the average model size of
# each label: point size follows the episode count and point colour follows
# the class. Rendered interactively through plotly.
ep <- ggplot(ctu13_labels, aes(x = avg_modelsize, y = total)) +
  # `text = label` is kept as an extra aesthetic alongside size and colour.
  # alpha is a fixed value, so it is set outside aes(); inside aes() it was
  # treated as a mapped variable and run through the alpha scale.
  geom_jitter(aes(size = total, colour = class, text = label), alpha = 0.5) +
  # "none" is the supported way to hide a legend; FALSE is deprecated.
  guides(colour = "none", size = "none") +
  scale_size_continuous(range = c(2.5, 20)) +
  ylab("Total Number of Episodes") +
  xlab("Model Size [AVG]") +
  theme_classic() +
  ggtitle("Number of Episodes per Avg Model size ") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(ep)
# Save the current contents of ctu13 as CSV.
# NOTE(review): the destination is an absolute path under one user's home
# directory, so this line only works on that machine - confirm before reuse.
readr::write_csv(
  ctu13,
  "/home/harpo/Dropbox/ongoing-work/git-repos/stratosphere-deep/datasets/ctu-13/ctu-13.labeled.cleaned"
)

The final cleaned version of the dataset can be found here