CTU13 is a labeled dataset provided by the MCFP.
The pcap files were processed using Stratosphere behavioral models.
library(dplyr)
library(ggplot2)
library(lattice)
library(plotly)
# Load the labeled CTU-13 connections from a "|"-delimited text file.
# Commented-out alternative source (Dropbox URL):
# ctu13 <- readr::read_delim(delim = "|", file = "https://www.dropbox.com/s/i34ngk4lm273g33/ctu13-labeled?dl=1")
ctu13 <- readr::read_delim(
  file = "/home/harpo/Dropbox/ongoing-work/git-repos/stratosphere-deep/datasets/ctu-13bis/ctu13-labeled",
  delim = "|"
)
# Print the tibble to inspect the first rows and column types.
ctu13
## # A tibble: 81,617 x 6
## src dst port proto
## <chr> <chr> <int> <chr>
## 1 147.32.80.9 147.32.86.111 54230 udp
## 2 147.32.80.9 147.32.86.111 58314 udp
## 3 147.32.80.9 147.32.86.111 54823 udp
## 4 147.32.84.134 147.32.80.9 53 udp
## 5 147.32.84.134 88.86.100.176 80 tcp
## 6 147.32.84.164 74.125.232.216 443 tcp
## 7 147.32.84.170 74.125.232.206 80 tcp
## 8 147.32.84.170 209.85.149.101 80 tcp
## 9 147.32.84.15 82.208.56.89 123 udp
## 10 147.32.84.170 147.32.80.9 53 udp
## # ... with 81,607 more rows, and 2 more variables: State <chr>,
## # label <chr>
A new field is added to simplify filtering by the class [normal, botnet]:
# Derive a two-level `class` column: any label containing "Normal" becomes
# "normal", every other label becomes "botnet".
ctu13 <- ctu13 %>%
  mutate(class = ifelse(grepl("Normal", label), "normal", "botnet"))

# Number of rows per class.
ctu13 %>%
  count(class) %>%
  rename(total = n)
## # A tibble: 2 x 2
## class total
## <chr> <int>
## 1 botnet 69199
## 2 normal 12418
Some cleaning in the labels is performed for a better aggregation
# Normalise the free-form labels so that equivalent traffic types aggregate
# under one name. The rewrites run top to bottom inside a single mutate();
# order matters because later patterns match the output of earlier ones.
ctu13 <- ctu13 %>%
  mutate(
    # Drop direction prefixes and a trailing numeric suffix.
    label = gsub("From-|To-", "", label),
    label = gsub("-[0-9]+$", "", label),
    # Pull the capture id (V<number>) into its own column, then strip it
    # (and any numeric suffix that follows it) from the label.
    capture = gsub(".*-(V[0-9]+)(-[0-9]+)?.*", "\\1", label),
    label = gsub("-V[0-9]+(-[0-9]+)?", "", label),
    label = gsub("-[0-9]+$", "", label),
    # Harmonise naming / token order across captures.
    label = gsub("Botnet-TCP-Attempt$", "Botnet-TCP-Attempt-SPAM", label),
    label = gsub("Established-HTTP", "HTTP-Established", label),
    label = gsub("WEB", "HTTP", label),
    label = gsub("Botnet-TCP-Established-SSL-Microsoft", "Botnet-TCP-HTTPS-Established-Microsoft", label),
    label = gsub("CC[0-9]+-", "CC-", label),
    label = gsub("Botnet-TCP-CC-HTTP", "Botnet-TCP-HTTP-CC", label),
    label = gsub("TCP-CC-IRC", "TCP-IRC-CC", label),
    label = gsub("Normal-CVUT-WebServer", "Normal-TCP-HTTP-CVUT-WebServer", label),
    label = gsub("Botnet-TCP-Established-SPAM", "Botnet-TCP-SMTP-Established-SPAM", label),
    label = gsub("Botnet-TCP-Attempt-SPAM", "Botnet-TCP-SMTP-Attempt-SPAM", label),
    label = gsub("Normal-UDP-CVUT-DNS-Server", "Normal-UDP-DNS-CVUT-DNSServer", label),
    label = gsub("Attempt-DNS", "DNS-Attempt", label),
    label = gsub("Botnet-TCP-CC-Plain-HTTP-Encrypted-Data", "Botnet-TCP-HTTP-CC-Plain-Encrypted-Data", label),
    label = gsub("Botnet-TCP-Not-Encrypted-SMTP-Private-Proxy", "Botnet-TCP-SMTP-Not-Encrypted-Private-Proxy", label),
    label = gsub("Normal-MatLab-Server", "Normal-UDP-Matlab-server", label),
    # Model size = number of characters in the State string.
    modelsize = nchar(State)
  )
head(ctu13)
## # A tibble: 6 x 9
## src dst port proto
## <chr> <chr> <int> <chr>
## 1 147.32.80.9 147.32.86.111 54230 udp
## 2 147.32.80.9 147.32.86.111 58314 udp
## 3 147.32.80.9 147.32.86.111 54823 udp
## 4 147.32.84.134 147.32.80.9 53 udp
## 5 147.32.84.134 88.86.100.176 80 tcp
## 6 147.32.84.164 74.125.232.216 443 tcp
## # ... with 5 more variables: State <chr>, label <chr>, class <chr>,
## # capture <chr>, modelsize <int>
# Row count per (port, proto) pair among rows still labeled "Normal-Jist".
ctu13 %>%
  filter(label == "Normal-Jist") %>%
  group_by(port, proto) %>%
  summarise(n = n())
## # A tibble: 21 x 3
## # Groups: port [?]
## port proto n
## <int> <chr> <int>
## 1 22 tcp 1
## 2 53 udp 12
## 3 80 tcp 1029
## 4 123 udp 5
## 5 443 tcp 507
## 6 465 tcp 3
## 7 843 tcp 1
## 8 993 tcp 22
## 9 1863 tcp 8
## 10 1935 tcp 1
## # ... with 11 more rows
# Relabel the generic per-host labels ("Normal-Stribrek", "Normal-Grill",
# "Normal-Jist") according to the destination port. The rules are mutually
# exclusive; rows matching none of them keep their current label.
ctu13 <- ctu13 %>%
  mutate(
    label = case_when(
      label %in% c("Normal-Stribrek", "Normal-Grill", "Normal-Jist") &
        port %in% c(80, 443) ~ "Normal-TCP-HTTP",
      label %in% c("Normal-Stribrek", "Normal-Grill", "Normal-Jist") &
        port == 53 ~ "Normal-UDP-DNS",
      label %in% c("Normal-Stribrek", "Normal-Grill", "Normal-Jist") &
        port == 5222 ~ "Normal-TCP-Jabber",
      label == "Normal-Grill" & port == 27031 ~ "Normal-TCP-Game",
      label == "Normal-Jist" & port == 993 ~ "Normal-TCP-IMAP",
      label == "Normal-Jist" & port == 1863 ~ "Normal-TCP-MSN",
      TRUE ~ label
    )
  )
# Number of rows per capture id.
count(ctu13, capture)
## # A tibble: 13 x 2
## capture n
## <chr> <int>
## 1 V42 5130
## 2 V43 1868
## 3 V44 27398
## 4 V45 1513
## 5 V46 510
## 6 V47 1908
## 7 V48 141
## 8 V49 6842
## 9 V50 27778
## 10 V51 3646
## 11 V52 555
## 12 V53 2143
## 13 V54 2185
# Bar chart of row counts per class, one facet per capture; each facet has
# its own y-axis scale.
ggplot(ctu13, aes(x = class, fill = class)) +
  geom_bar() +
  theme_bw() +
  facet_wrap(~capture, scales = "free_y")
# Number of rows per port within capture V49.
ctu13 %>%
  filter(capture == "V49") %>%
  count(port)
## # A tibble: 586 x 2
## port n
## <int> <int>
## 1 22 1
## 2 53 6
## 3 80 793
## 4 81 4
## 5 88 2
## 6 123 18
## 7 135 434
## 8 137 1
## 9 140 1
## 10 386 1
## # ... with 576 more rows
# Bar chart of row counts per port (ports up to and including 80), with bars
# filled by class.
ctu13 %>%
  filter(port <= 80) %>%
  ggplot() +
  geom_bar(aes(x = port, fill = class)) +
  theme_bw()
# Optional per-capture faceting, currently disabled:
# + facet_wrap(~capture, scales = "free_y")
library(scales)
# Lattice histogram of model size, one panel per class, with the x-axis
# clipped to [0, 100].
# NOTE(review): `groups = capture` is kept as in the original call; confirm
# that histogram() actually makes use of it.
histogram(
  ~ modelsize | class,
  data = ctu13,
  breaks = 10000,
  groups = capture,
  xlim = c(0, 100),
  main = "Model Size Freq. Distribution (modelsize <100)",
  as.table = TRUE  # spelled out: `T` is an ordinary, reassignable variable
)
As observed, models with very few states make up a significant part of the dataset. Since the Stratosphere behavioral model needs at least 4 flows to calculate the letter value correctly, all models with fewer than 5 letters should be removed, since they won't provide any useful information in terms of behavioral analysis.
If we replot after removing those models, the frequency distribution is:
# Keep only episodes whose model has more than 4 states (modelsize >= 5).
ctu13 <- ctu13 %>%
  filter(modelsize > 4)

# Re-plot the model-size distribution per class on the filtered data.
# The bar colour is passed as `col`, the argument name panel.histogram()
# defines; the previous `color = 'red'` did not match it.
histogram(
  ~ modelsize | class,
  data = ctu13,
  breaks = 10000,
  xlim = c(0, 100),
  main = "Model Size Freq. Distribution (modelsize <100) for",
  col = "red"
)
As can be seen, the number of connections with few states remains larger in the case of botnet connections. This should be analyzed, since it could introduce a strong bias for some algorithms.
After removing those episodes with less than 5 flows:
# Number of remaining rows per class after the model-size filter.
ctu13 %>%
  count(class) %>%
  rename(total = n) %>%
  print()
## # A tibble: 2 x 2
## class total
## <chr> <int>
## 1 botnet 6169
## 2 normal 2819
Number of episodes and average model size per label, after removing those episodes with fewer than 5 flows:
# Per-label summary: row count and mean model size, tagged with the class
# derived from the label text and sorted by descending row count.
# `baselabel` applies one regex rewrite: for labels of the form
# (Normal|Botnet)-(TCP|UDP)-<a>-<b> the text after the last "-" is dropped;
# labels that do not match the pattern are left unchanged.
ctu13_labels <- ctu13 %>%
  group_by(label) %>%
  summarise(total = n(), avg_modelsize = mean(modelsize)) %>%
  mutate(class = ifelse(grepl("Normal", label), "normal", "botnet")) %>%
  arrange(desc(total)) %>%
  mutate(
    baselabel = gsub("(Normal|Botnet)-(TCP|UDP)-(.*)-(.*)?", "\\1-\\2-\\3", label)
  )
print(ctu13_labels)
## # A tibble: 47 x 5
## label total avg_modelsize class
## <chr> <int> <dbl> <chr>
## 1 Botnet-TCP-SMTP-Attempt-SPAM 4297 36.94415 botnet
## 2 Normal-TCP-HTTP 2602 72.59493 normal
## 3 Botnet-TCP-HTTP-Established-Ad 487 38.42710 botnet
## 4 Botnet-TCP-HTTP-Established 377 16.74005 botnet
## 5 Botnet-UDP-DNS 301 153.26578 botnet
## 6 Botnet-TCP-HTTP-Established-SSL 210 18.54286 botnet
## 7 Botnet-TCP-HTTP-CC-Not-Encrypted 68 86.35294 botnet
## 8 Normal-UDP-NTP-server 67 29.35821 normal
## 9 Botnet-TCP-HTTPS-Established-Microsoft 58 44.18966 botnet
## 10 Botnet-TCP-HTTP-Google-Net-Established 50 10.40000 botnet
## # ... with 37 more rows, and 1 more variables: baselabel <chr>
#ctu13_labels=ctu13_labels %>% filter(label!="Botnet-TCP-Attempt" & label!="Botnet-TCP-Attempt-SPAM")
# Bubble chart of the number of episodes per base label: point size follows
# `total`, colour follows `class`. geom_jitter() adds random displacement to
# the points.
# The constant transparency is set outside aes() so it is applied as a fixed
# value instead of being mapped through an alpha scale. Legends for colour
# and size are hidden with "none".
size <- ggplot(ctu13_labels, aes(x = baselabel, y = total)) +
  geom_jitter(aes(size = total, colour = class), alpha = 0.5) +
  guides(colour = "none", size = "none") +
  scale_size_continuous(range = c(2.5, 20)) +
  ylab("Total Number of Episodes") +
  xlab("Label") +
  ggtitle("Number of Episodes per Label Type") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert the static ggplot into an interactive plotly widget.
ggplotly(size)
# Bubble chart of the number of episodes against the average model size per
# label: point size follows `total`, colour follows `class`. The `text`
# aesthetic carries the label so it is available to the plotly conversion
# below.
# The constant transparency is set outside aes() so it is applied as a fixed
# value instead of being mapped through an alpha scale. Legends for colour
# and size are hidden with "none".
ep <- ggplot(ctu13_labels, aes(x = avg_modelsize, y = total)) +
  geom_jitter(aes(size = total, colour = class, text = label), alpha = 0.5) +
  guides(colour = "none", size = "none") +
  scale_size_continuous(range = c(2.5, 20)) +
  ylab("Total Number of Episodes") +
  xlab("Model Size [AVG]") +
  theme_classic() +
  ggtitle("Number of Episodes per Avg Model size ") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert the static ggplot into an interactive plotly widget.
ggplotly(ep)
# Write the cleaned dataset as "|"-delimited text. The data and the target
# file are passed positionally because the name of the second argument
# differs between readr releases (`path` in older ones, `file` in newer
# ones).
readr::write_delim(
  ctu13,
  "/home/harpo/Dropbox/ongoing-work/git-repos/stratosphere-deep/datasets/ctu-13bis/ctu-13.labeled.cleaned",
  delim = "|"
)
# Print the final tibble (the former `%>% arrange` had no sort keys, so it
# did not reorder anything).
ctu13
## # A tibble: 8,988 x 9
## src dst port proto
## <chr> <chr> <int> <chr>
## 1 147.32.84.134 147.32.80.9 53 udp
## 2 147.32.84.134 88.86.100.176 80 tcp
## 3 147.32.84.164 74.125.232.216 443 tcp
## 4 147.32.84.170 74.125.232.206 80 tcp
## 5 147.32.84.170 209.85.149.101 80 tcp
## 6 147.32.84.15 82.208.56.89 123 udp
## 7 147.32.84.170 147.32.80.9 53 udp
## 8 147.32.84.170 209.85.148.105 80 tcp
## 9 147.32.84.170 195.24.233.55 80 tcp
## 10 147.32.84.170 74.125.39.125 5222 tcp
## # ... with 8,978 more rows, and 5 more variables: State <chr>,
## # label <chr>, class <chr>, capture <chr>, modelsize <int>
The final cleaned version of the dataset can be found here