CTU13 is a labeled dataset provided by the Malware Capture Facility Project (MCFP).

The pcap files were processed using Stratosphere behavioral models.

library(dplyr)
library(ggplot2)
library(lattice)
library(plotly)
# Read the pipe-delimited, labeled episode file into `ctu13`.
# Alternative (remote) source, kept for reference:
# ctu13 <- readr::read_delim(delim = "|", file = "https://www.dropbox.com/s/i34ngk4lm273g33/ctu13-labeled?dl=1")
ctu13 <- readr::read_delim(
  file = "/home/harpo/Dropbox/ongoing-work/git-repos/stratosphere-deep/datasets/ctu-13bis/ctu13-labeled",
  delim = "|"
)
ctu13
## # A tibble: 81,617 x 6
##              src            dst  port proto
##            <chr>          <chr> <int> <chr>
##  1   147.32.80.9  147.32.86.111 54230   udp
##  2   147.32.80.9  147.32.86.111 58314   udp
##  3   147.32.80.9  147.32.86.111 54823   udp
##  4 147.32.84.134    147.32.80.9    53   udp
##  5 147.32.84.134  88.86.100.176    80   tcp
##  6 147.32.84.164 74.125.232.216   443   tcp
##  7 147.32.84.170 74.125.232.206    80   tcp
##  8 147.32.84.170 209.85.149.101    80   tcp
##  9  147.32.84.15   82.208.56.89   123   udp
## 10 147.32.84.170    147.32.80.9    53   udp
## # ... with 81,607 more rows, and 2 more variables: State <chr>,
## #   label <chr>

A new field is added to simplify filtering by the class [normal, botnet].

# Add a `class` column: labels containing "Normal" become "normal",
# every other label becomes "botnet".
ctu13 <- ctu13 %>%
  mutate(
    class = ifelse(grepl("Normal", label), "normal", "botnet")
  )

Original total number of episodes per class (normal vs. botnet)

# Number of episodes in each class.
ctu13 %>%
  group_by(class) %>%
  summarise(total = n())
## # A tibble: 2 x 2
##    class total
##    <chr> <int>
## 1 botnet 69199
## 2 normal 12418

Some cleaning in the labels is performed for a better aggregation

# some text cleaning for aggregating labels
# Every step rewrites `label` in place, so the steps are order-dependent:
# later patterns are matched against the output of the earlier ones.
# Step 1: remove every "From-" / "To-" occurrence.
ctu13=ctu13 %>% mutate(label=gsub("From-|To-","",label)) %>% 
  # Remove a trailing "-<digits>" suffix.
  mutate(label=gsub("-[0-9]+$","",label)) %>% 
  # Store the "V<digits>" token of the label in a new `capture` column.
  # gsub() returns its input unchanged when the pattern does not match, so a
  # label without such a token ends up as its own `capture` value.
  mutate(capture=gsub(".*-(V[0-9]+)(-[0-9]+)?.*","\\1",label) ) %>% 
  # Now that it is saved in `capture`, strip the "-V<digits>[-<digits>]" token
  # from the label, then strip any trailing "-<digits>" that this exposes.
  mutate(label=gsub("-V[0-9]+(-[0-9]+)?","",label)) %>% 
  mutate(label=gsub("-[0-9]+$","",label)) %>% 
  # Labels ending in "Botnet-TCP-Attempt" get a "-SPAM" suffix; a later step
  # rewrites the resulting "Botnet-TCP-Attempt-SPAM" to include "SMTP".
  mutate(label=gsub("Botnet-TCP-Attempt$","Botnet-TCP-Attempt-SPAM",label)) %>%
  # Reorder/rename HTTP-related tokens so that "HTTP" comes first.
  mutate(label=gsub("Established-HTTP","HTTP-Established",label)) %>%
  mutate(label=gsub("WEB","HTTP",label)) %>%
  mutate(label=gsub("Botnet-TCP-Established-SSL-Microsoft","Botnet-TCP-HTTPS-Established-Microsoft",label)) %>%
  # Collapse numbered "CC<digits>-" tokens into "CC-"; the two steps that
  # follow rely on this normalised "CC" spelling.
  mutate(label=gsub("CC[0-9]+-","CC-",label)) %>% 
  mutate(label=gsub("Botnet-TCP-CC-HTTP","Botnet-TCP-HTTP-CC",label)) %>%
  mutate(label=gsub("TCP-CC-IRC","TCP-IRC-CC",label)) %>%
  mutate(label=gsub("Normal-CVUT-WebServer","Normal-TCP-HTTP-CVUT-WebServer",label)) %>%
  # Insert "SMTP" into the SPAM labels (including those produced above).
  mutate(label=gsub("Botnet-TCP-Established-SPAM","Botnet-TCP-SMTP-Established-SPAM",label)) %>%
  mutate(label=gsub("Botnet-TCP-Attempt-SPAM","Botnet-TCP-SMTP-Attempt-SPAM",label)) %>%
  mutate(label=gsub("Normal-UDP-CVUT-DNS-Server","Normal-UDP-DNS-CVUT-DNSServer",label)) %>%
  mutate(label=gsub("Attempt-DNS","DNS-Attempt",label)) %>%
  mutate(label=gsub("Botnet-TCP-CC-Plain-HTTP-Encrypted-Data","Botnet-TCP-HTTP-CC-Plain-Encrypted-Data",label)) %>%
  # Disabled template for adding further substitutions.
  #mutate(label=gsub("","",label)) %>%
  mutate(label=gsub("Botnet-TCP-Not-Encrypted-SMTP-Private-Proxy","Botnet-TCP-SMTP-Not-Encrypted-Private-Proxy",label)) %>%
  mutate(label=gsub("Normal-MatLab-Server","Normal-UDP-Matlab-server",label)) %>%
  
  
  # `modelsize` is the number of characters in the `State` string.
  mutate(modelsize=nchar(State))
# Show the first rows of the cleaned table.
head(ctu13)
## # A tibble: 6 x 9
##             src            dst  port proto
##           <chr>          <chr> <int> <chr>
## 1   147.32.80.9  147.32.86.111 54230   udp
## 2   147.32.80.9  147.32.86.111 58314   udp
## 3   147.32.80.9  147.32.86.111 54823   udp
## 4 147.32.84.134    147.32.80.9    53   udp
## 5 147.32.84.134  88.86.100.176    80   tcp
## 6 147.32.84.164 74.125.232.216   443   tcp
## # ... with 5 more variables: State <chr>, label <chr>, class <chr>,
## #   capture <chr>, modelsize <int>
# Port/protocol breakdown of the episodes labeled exactly "Normal-Jist".
ctu13 %>%
  filter(label == "Normal-Jist") %>%
  group_by(port, proto) %>%
  summarise(n = n())
## # A tibble: 21 x 3
## # Groups:   port [?]
##     port proto     n
##    <int> <chr> <int>
##  1    22   tcp     1
##  2    53   udp    12
##  3    80   tcp  1029
##  4   123   udp     5
##  5   443   tcp   507
##  6   465   tcp     3
##  7   843   tcp     1
##  8   993   tcp    22
##  9  1863   tcp     8
## 10  1935   tcp     1
## # ... with 11 more rows
# Relabel the generic "Normal-Stribrek" / "Normal-Grill" / "Normal-Jist"
# episodes according to their port. The rules are mutually exclusive (each one
# tests the original generic label and a distinct port), so a single
# case_when() yields the same result as applying them one after another.
# Rows matching no rule keep their current label.
ctu13 <- ctu13 %>%
  mutate(
    label = case_when(
      # Rules shared by all three generic labels.
      label %in% c("Normal-Stribrek", "Normal-Grill", "Normal-Jist") &
        port %in% c(80, 443) ~ "Normal-TCP-HTTP",
      label %in% c("Normal-Stribrek", "Normal-Grill", "Normal-Jist") &
        port == 53 ~ "Normal-UDP-DNS",
      label %in% c("Normal-Stribrek", "Normal-Grill", "Normal-Jist") &
        port == 5222 ~ "Normal-TCP-Jabber",
      # Rule specific to "Normal-Grill".
      label == "Normal-Grill" & port == 27031 ~ "Normal-TCP-Game",
      # Rules specific to "Normal-Jist".
      label == "Normal-Jist" & port == 993 ~ "Normal-TCP-IMAP",
      label == "Normal-Jist" & port == 1863 ~ "Normal-TCP-MSN",
      TRUE ~ label
    )
  )

# Number of episodes per capture.
ctu13 %>%
  group_by(capture) %>%
  summarise(n = n())
## # A tibble: 13 x 2
##    capture     n
##      <chr> <int>
##  1     V42  5130
##  2     V43  1868
##  3     V44 27398
##  4     V45  1513
##  5     V46   510
##  6     V47  1908
##  7     V48   141
##  8     V49  6842
##  9     V50 27778
## 10     V51  3646
## 11     V52   555
## 12     V53  2143
## 13     V54  2185
# Bar chart of class counts, one facet per capture with independent y axes.
ggplot(ctu13) +
  geom_bar(aes(x = class, fill = class)) +
  facet_wrap(~capture, scales = "free_y") +
  theme_bw()

# Number of episodes per port within capture V49.
ctu13 %>%
  filter(capture == "V49") %>%
  group_by(port) %>%
  summarise(n = n())
## # A tibble: 586 x 2
##     port     n
##    <int> <int>
##  1    22     1
##  2    53     6
##  3    80   793
##  4    81     4
##  5    88     2
##  6   123    18
##  7   135   434
##  8   137     1
##  9   140     1
## 10   386     1
## # ... with 576 more rows
# Episode counts per port, restricted to ports up to and including 80,
# with bars filled by class.
ctu13 %>%
  filter(port <= 80) %>%
  ggplot() +
  geom_bar(aes(x = port, fill = class)) +
  theme_bw()

# Optional per-capture faceting (disabled):
# + facet_wrap(~capture, scales = "free_y")
library(scales)

# Lattice histogram of model size, one panel per class, x axis limited to
# the 0-100 range.
histogram(
  ~ modelsize | class,
  data = ctu13,
  groups = capture,
  breaks = 10000,
  xlim = c(0, 100),
  main = "Model Size Freq. Distribution (modelsize <100)",
  as.table = TRUE
)

As observed, models with very few states make up a significant part of the dataset. Since the Stratosphere behavioral model needs at least 4 flows to calculate the letter value correctly, all the models with fewer than 5 letters should be removed, since they won't provide any useful information in terms of behavioral analysis.

If we replot after removing those models, the frequency distribution is:

# Keep only the episodes whose model has at least 5 letters.
ctu13 <- ctu13 %>%
  filter(modelsize > 4)

# Re-plot the model-size distribution per class on the filtered data.
# The bar colour is passed as `col`: lattice panel functions take `col`,
# and the previous `color` argument was silently ignored.
histogram(
  ~ modelsize | class,
  data = ctu13,
  breaks = 10000,
  xlim = c(0, 100),
  main = "Model Size Freq. Distribution (modelsize <100) for",
  col = "red"
)

As can be seen, the number of connections with few states remains larger in the case of Botnet connections. This situation should be analyzed, since it could introduce a strong bias for some algorithms.

Total number of episodes per class (normal vs. botnet)

After removing those episodes with less than 5 flows:

# Number of episodes in each class, printed explicitly.
ctu13 %>%
  group_by(class) %>%
  summarise(total = n()) %>%
  print()
## # A tibble: 2 x 2
##    class total
##    <chr> <int>
## 1 botnet  6169
## 2 normal  2819

Total Number of episodes per label type

After removing those episodes with less than 5 flows:

# Per-label summary: episode count and mean model size, the class re-derived
# from the label text, rows sorted by descending count, and a `baselabel`
# column computed from the label.
# NOTE(review): because `(.*)` is greedy, the `baselabel` pattern only drops the
# last "-"-separated component of labels that have more than three components;
# confirm this is the intended grouping.
ctu13_labels <- ctu13 %>%
  group_by(label) %>%
  summarise(total = n(), avg_modelsize = mean(modelsize)) %>%
  mutate(class = ifelse(grepl("Normal", label), "normal", "botnet")) %>%
  arrange(desc(total)) %>%
  mutate(
    baselabel = gsub("(Normal|Botnet)-(TCP|UDP)-(.*)-(.*)?", "\\1-\\2-\\3", label)
  )

print(ctu13_labels)
## # A tibble: 47 x 5
##                                     label total avg_modelsize  class
##                                     <chr> <int>         <dbl>  <chr>
##  1           Botnet-TCP-SMTP-Attempt-SPAM  4297      36.94415 botnet
##  2                        Normal-TCP-HTTP  2602      72.59493 normal
##  3         Botnet-TCP-HTTP-Established-Ad   487      38.42710 botnet
##  4            Botnet-TCP-HTTP-Established   377      16.74005 botnet
##  5                         Botnet-UDP-DNS   301     153.26578 botnet
##  6        Botnet-TCP-HTTP-Established-SSL   210      18.54286 botnet
##  7       Botnet-TCP-HTTP-CC-Not-Encrypted    68      86.35294 botnet
##  8                  Normal-UDP-NTP-server    67      29.35821 normal
##  9 Botnet-TCP-HTTPS-Established-Microsoft    58      44.18966 botnet
## 10 Botnet-TCP-HTTP-Google-Net-Established    50      10.40000 botnet
## # ... with 37 more rows, and 1 more variables: baselabel <chr>
# Optional filter (disabled):
# ctu13_labels <- ctu13_labels %>% filter(label != "Botnet-TCP-Attempt" & label != "Botnet-TCP-Attempt-SPAM")

# Jittered bubble plot of the number of episodes per base label.
# Point size follows `total` and colour follows `class`.
# - `alpha` is a constant, so it is set outside aes(); inside aes() it was
#   treated as a data mapping and produced a spurious alpha scale/legend.
# - Legends are suppressed with "none"; passing FALSE/F to guides() is
#   deprecated in current ggplot2.
size <- ggplot(ctu13_labels, aes(x = baselabel, y = total)) +
  geom_jitter(aes(size = total, colour = class), alpha = 0.5) +
  guides(colour = "none", size = "none") +
  scale_size_continuous(range = c(2.5, 20)) +
  ylab("Total Number of Episodes") +
  xlab("Label") +
  ggtitle("Number of Episodes per Label Type") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(size)
# Jittered bubble plot of the number of episodes against the average model
# size of each label. Point size follows `total`, colour follows `class`, and
# `text = label` is mapped so the label is available to the plotly tooltip.
# - `alpha` is a constant, so it is set outside aes(); inside aes() it was
#   treated as a data mapping and produced a spurious alpha scale/legend.
# - Legends are suppressed with "none"; passing FALSE/F to guides() is
#   deprecated in current ggplot2.
ep <- ggplot(ctu13_labels, aes(x = avg_modelsize, y = total)) +
  geom_jitter(aes(size = total, colour = class, text = label), alpha = 0.5) +
  guides(colour = "none", size = "none") +
  scale_size_continuous(range = c(2.5, 20)) +
  ylab("Total Number of Episodes") +
  xlab("Model Size [AVG]") +
  theme_classic() +
  ggtitle("Number of Episodes per Avg Model size ") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(ep)
# Write the cleaned dataset as a pipe-delimited file.
# The destination is passed positionally: the second argument of
# write_delim() was renamed from `path` to `file` in newer readr releases, so
# the positional form works with both old and new versions.
readr::write_delim(
  ctu13,
  "/home/harpo/Dropbox/ongoing-work/git-repos/stratosphere-deep/datasets/ctu-13bis/ctu-13.labeled.cleaned",
  delim = "|"
)

# Display the final table. The previous `%>% arrange` had no sort keys, so it
# returned the data unchanged.
ctu13
## # A tibble: 8,988 x 9
##              src            dst  port proto
##            <chr>          <chr> <int> <chr>
##  1 147.32.84.134    147.32.80.9    53   udp
##  2 147.32.84.134  88.86.100.176    80   tcp
##  3 147.32.84.164 74.125.232.216   443   tcp
##  4 147.32.84.170 74.125.232.206    80   tcp
##  5 147.32.84.170 209.85.149.101    80   tcp
##  6  147.32.84.15   82.208.56.89   123   udp
##  7 147.32.84.170    147.32.80.9    53   udp
##  8 147.32.84.170 209.85.148.105    80   tcp
##  9 147.32.84.170  195.24.233.55    80   tcp
## 10 147.32.84.170  74.125.39.125  5222   tcp
## # ... with 8,978 more rows, and 5 more variables: State <chr>,
## #   label <chr>, class <chr>, capture <chr>, modelsize <int>

The final cleaned version of the dataset can be found here