# Calculate the Shannon entropy (in bits) of each string.
#
# instr: a character vector; any other mode raises an error.
# Returns a numeric vector with one entropy value per element of `instr`.
# When `instr` has no names of its own, the result is named after the input
# strings. The empty string yields 0.
entropy <- function(instr) {
  if (!is.character(instr)) {
    stop("Expected character array, got ", mode(instr))
  }
  # vapply (unlike sapply) always returns a numeric vector, even when
  # `instr` has length zero.
  vapply(instr, function(x) {
    # Frequency of every distinct character in the string
    cts <- table(unlist(strsplit(x, NULL)))
    probs <- cts / nchar(x)
    -sum(probs * log2(probs))
  }, numeric(1))
}
# Vowel-to-consonant ratio of a single string.
#
# str: a character string; only its first element is examined.
# Returns (number of vowels) / (number of consonants), counted
# case-insensitively over the letters a-z. Characters that are not letters
# (digits, dots, hyphens, ...) are ignored. Returns NA when the string
# contains no consonants, since the ratio is undefined in that case.
vow2conr <- function(str) {
  vowels <- c("a", "e", "i", "o", "u")
  cons <- letters[!letters %in% vowels]
  chars <- tolower(strsplit(str, "")[[1]])
  # Summing the logical masks gives 0 for an absent class, so a string
  # without vowels gets ratio 0 instead of NA.
  vowcount <- sum(chars %in% vowels)
  conscount <- sum(chars %in% cons)
  if (conscount == 0) {
    return(NA_real_)
  }
  vowcount / conscount
}
Read the dataset and do some cleaning…
# Load the domain dataset and normalise its `label` column, then print the
# number of rows per label.
dnsdata <- read_tsv('/home/harpo/Dropbox/ongoing-work/dga/data/domaindb.tsv.gz') %>%
  # The dot is escaped so it matches a literal "." rather than any character
  mutate(label = str_replace(label, "DGA\\.360", "DGA")) %>%
  # Split `label` on "." into the top-level label and a `detail` column
  separate(label, into = c("label", "detail"), sep = "[.]{1}") %>%
  # Unify the capitalisation of this one detail value
  mutate(detail = str_replace(detail, "Cryptolocker", "cryptolocker"))
print(dnsdata %>% group_by(label) %>% summarise(count = n()), n = 100)
## # A tibble: 2 × 2
## label count
## <chr> <int>
## 1 DGA 1328444
## 2 Normal 999993
# Number of rows for each distinct `detail` value (show up to 100 rows)
print(
  summarise(group_by(dnsdata, detail), count = n()),
  n = 100
)
## # A tibble: 49 × 2
## detail count
## <chr> <int>
## 1 Alexa 999993
## 2 bamital 904
## 3 banjori 439218
## 4 bedep 706
## 5 beebone 210
## 6 chinad 256
## 7 conficker 500
## 8 corebot 840
## 9 cryptolocker 21000
## 10 cryptowall 94
## 11 dircrypt 570
## 12 dyre 26993
## 13 fobber 600
## 14 gameover 12000
## 15 geodo 1920
## 16 hesperbot 192
## 17 kraken 9660
## 18 locky 9028
## 19 madmax 1
## 20 matsnu 132
## 21 murofet 49199
## 22 necurs 81920
## 23 nymaim 20225
## 24 P2P 4000
## 25 padcrypt 1920
## 26 Post 220000
## 27 proslikefan 100
## 28 pushdo 2520
## 29 pykspa 25727
## 30 pykspa_v1 18
## 31 pykspa_v2_fake 800
## 32 pykspa_v2_real 200
## 33 qadars 1600
## 34 qakbot 60000
## 35 ramdo 2000
## 36 ramnit 91978
## 37 ranbyus 23167
## 38 rovnix 53632
## 39 shifu 2554
## 40 shiotob/urlzone/bebloh 12521
## 41 simda 28389
## 42 sisron 60
## 43 suppobox 8185
## 44 symmi 4448
## 45 tempedreve 255
## 46 tinba 94912
## 47 vawtrak 300
## 48 virut 11994
## 49 Volatile 996
Calculate some features (entropy, vowel-to-consonant ratio, domain length)
# Compute per-domain features and attach them to `dnsdata` as the columns
# `entropy`, `len` and `vcr`, then split the data by class.
#
# The entropy and vowel/consonant ratio are computed in parallel over chunks
# of `chunk_size` domains. The last chunk is clipped with min(), so every row
# is kept even when the row count is not a multiple of the chunk size.
domains <- dnsdata$domain
n_domains <- length(domains)
chunk_size <- 100
chunk_starts <- seq(1, n_domains, by = chunk_size)

enpar <- foreach(i = chunk_starts, .combine = "c") %dopar% {
  lapply(domains[i:min(i + chunk_size - 1, n_domains)], entropy)
}
# nchar() is vectorized, so the length needs no parallel loop. It is kept in
# its own variable: the vowel/consonant results must not overwrite it.
len <- nchar(domains)
vcr <- foreach(i = chunk_starts, .combine = "c") %dopar% {
  lapply(domains[i:min(i + chunk_size - 1, n_domains)], vow2conr)
}

# Flatten the per-domain result lists into plain (unnamed) vectors
dnsdata$entropy <- as.vector(unlist(enpar))
dnsdata$len <- len
dnsdata$vcr <- as.vector(unlist(vcr))
#dnsdata %>% group_by(detail) %>% summarise(avg_entropy=mean(entropy),sd_entropy=sd(entropy))
#dnsdata %>% group_by(detail) %>% summarise(max_len=max(len),min=min(len))

# Separate the rows labelled "Normal" from all other rows
dns_legit <- dnsdata %>% filter(label == "Normal")
dns_dga <- dnsdata %>% filter(label != "Normal")
Density plot for Domain Entropy
# Density of the entropy feature over the whole dataset
ggplot(data = dnsdata, mapping = aes(x = entropy)) +
  geom_density(fill = "skyblue", alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Density plot for Domain Entropy (classes)
# Entropy density per class: dns_legit in sky blue, dns_dga in orange
ggplot(mapping = aes(x = entropy)) +
  geom_density(data = dns_legit, fill = "skyblue", alpha = 0.5) +
  geom_density(data = dns_dga, fill = "orange", alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Density plot of Domain Length (classes)
# Density of the `len` column per class: dns_legit in sky blue, dns_dga in orange
ggplot(mapping = aes(x = len)) +
  geom_density(data = dns_legit, fill = "skyblue", alpha = 0.5) +
  geom_density(data = dns_dga, fill = "orange", alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 5282 rows containing non-finite values (stat_density).
## Warning: Removed 5966 rows containing non-finite values (stat_density).

Density plot for Vowel/consonant ratio (classes)
# Density of the `vcr` column per class: dns_legit in sky blue, dns_dga in orange
ggplot(mapping = aes(x = vcr)) +
  geom_density(data = dns_legit, fill = "skyblue", alpha = 0.5) +
  geom_density(data = dns_dga, fill = "orange", alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 5282 rows containing non-finite values (stat_density).
## Warning: Removed 5966 rows containing non-finite values (stat_density).

## Warning: Removed 5282 rows containing non-finite values (stat_density).
## Warning: Removed 5966 rows containing non-finite values (stat_density).