#' Shannon entropy (in bits) of the characters in each string
#'
#' @param instr A character vector; every element is scored independently.
#' @return A numeric vector with one entropy value per element of `instr`.
#'   When `instr` carries no names, the result is named by the input strings.
#'   Stops with an error if `instr` is not of character mode.
entropy <- function(instr) {
  if (!is.character(instr)) {
    stop("Expected character array, got ", mode(instr))
  }
  # vapply (rather than sapply) keeps the result numeric even when `instr`
  # is empty, where sapply would hand back an empty list.
  vapply(instr, function(x) {
    chars <- strsplit(x, NULL)[[1]]
    # Relative frequency of every distinct character in the string
    probs <- table(chars) / nchar(x)
    -sum(probs * log2(probs))
  }, numeric(1))
}

#' Vowel-to-consonant ratio of a single string
#'
#' @param str A character string; only the first element is used.
#' @return The number of vowels (a, e, i, o, u; case-insensitive) divided by
#'   the number of remaining characters. Every non-vowel character, including
#'   digits, dots and hyphens, counts towards the denominator. Returns 0 when
#'   the string has no vowels, and `NA` when it has no non-vowel characters
#'   or is missing.
vow2conr <- function(str) {
  vowels <- c("a", "e", "i", "o", "u")
  chars <- tolower(strsplit(str, "")[[1]])
  if (anyNA(chars)) {
    return(NA_real_)
  }
  is_vowel <- chars %in% vowels
  vowcount <- sum(is_vowel)
  conscount <- sum(!is_vowel)
  # Counting with sum() yields a real 0 for vowel-free strings; indexing a
  # table by "TRUE" would give NA whenever that level is absent.
  if (conscount == 0L) {
    # No denominator: the ratio is undefined
    return(NA_real_)
  }
  vowcount / conscount
}

Read dataset and do some cleaning…

# Load the domain table and normalise its labels: replace "DGA.360" with
# "DGA", split the label at the dot into `label` and `detail`, and use one
# spelling for cryptolocker.
dnsdata <- read_tsv("/home/harpo/Dropbox/ongoing-work/dga/data/domaindb.tsv.gz") %>%
  mutate(label = str_replace(label, "DGA.360", "DGA")) %>%
  separate(label, into = c("label", "detail"), sep = "[.]{1}") %>%
  mutate(detail = str_replace(detail, "Cryptolocker", "cryptolocker"))

# Number of rows per label
dnsdata %>%
  group_by(label) %>%
  summarise(count = n()) %>%
  print(n = 100)
## # A tibble: 2 × 2
##    label   count
##    <chr>   <int>
## 1    DGA 1328444
## 2 Normal  999993
# Number of rows per `detail` value
dnsdata %>%
  group_by(detail) %>%
  summarise(count = n()) %>%
  print(n = 100)
## # A tibble: 49 × 2
##                    detail  count
##                     <chr>  <int>
## 1                   Alexa 999993
## 2                 bamital    904
## 3                 banjori 439218
## 4                   bedep    706
## 5                 beebone    210
## 6                  chinad    256
## 7               conficker    500
## 8                 corebot    840
## 9            cryptolocker  21000
## 10             cryptowall     94
## 11               dircrypt    570
## 12                   dyre  26993
## 13                 fobber    600
## 14               gameover  12000
## 15                  geodo   1920
## 16              hesperbot    192
## 17                 kraken   9660
## 18                  locky   9028
## 19                 madmax      1
## 20                 matsnu    132
## 21                murofet  49199
## 22                 necurs  81920
## 23                 nymaim  20225
## 24                    P2P   4000
## 25               padcrypt   1920
## 26                   Post 220000
## 27            proslikefan    100
## 28                 pushdo   2520
## 29                 pykspa  25727
## 30              pykspa_v1     18
## 31         pykspa_v2_fake    800
## 32         pykspa_v2_real    200
## 33                 qadars   1600
## 34                 qakbot  60000
## 35                  ramdo   2000
## 36                 ramnit  91978
## 37                ranbyus  23167
## 38                 rovnix  53632
## 39                  shifu   2554
## 40 shiotob/urlzone/bebloh  12521
## 41                  simda  28389
## 42                 sisron     60
## 43               suppobox   8185
## 44                  symmi   4448
## 45             tempedreve    255
## 46                  tinba  94912
## 47                vawtrak    300
## 48                  virut  11994
## 49               Volatile    996

Calculate some features (entropy, vowel-to-consonant ratio, domain length)

# Per-domain features: character entropy, length and vowel/consonant ratio.
# Entropy and the vowel ratio are computed in parallel over chunks of domains.
# The last chunk is allowed to be shorter than the others, so every row of
# `dnsdata` is kept.
domains <- dnsdata$domain
n_domains <- length(domains)
chunk_size <- 100
chunk_starts <- seq(1, n_domains, by = chunk_size)

enpar <- foreach(i = chunk_starts, .combine = "c") %dopar% {
  lapply(domains[i:min(i + chunk_size - 1, n_domains)], entropy)
}

vcr <- foreach(i = chunk_starts, .combine = "c") %dopar% {
  lapply(domains[i:min(i + chunk_size - 1, n_domains)], vow2conr)
}

# nchar() is vectorised, so the length needs no parallel loop. `len` gets its
# own assignment: chaining it with `vcr` would store the vowel ratios in it.
len <- nchar(domains)

dnsdata$entropy <- as.vector(unlist(enpar))
dnsdata$len <- len
dnsdata$vcr <- as.vector(unlist(vcr))

#dnsdata %>% group_by(detail) %>% summarise(avg_entropy=mean(entropy),sd_entropy=sd(entropy))
#dnsdata %>% group_by(detail) %>% summarise(max_len=max(len),min=min(len))
# Split the rows by class: label "Normal" versus every other label
dns_legit <- filter(dnsdata, label == "Normal")
dns_dga <- filter(dnsdata, label != "Normal")

Density plot for Domain Entropy

# Density of the entropy column over all rows of dnsdata
ggplot(dnsdata, aes(x = entropy)) +
  geom_density(fill = "skyblue", alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Density plot for Domain Entropy (classes)

# Entropy density per class: dns_legit in sky blue, dns_dga in orange
ggplot(mapping = aes(x = entropy)) +
  geom_density(data = dns_legit, fill = "skyblue", alpha = 0.5) +
  geom_density(data = dns_dga, fill = "orange", alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Density plot of Domain Length (classes)

# Density of the `len` column per class: dns_legit in sky blue, dns_dga in orange
ggplot(mapping = aes(x = len)) +
  geom_density(data = dns_legit, fill = "skyblue", alpha = 0.5) +
  geom_density(data = dns_dga, fill = "orange", alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 5282 rows containing non-finite values (stat_density).
## Warning: Removed 5966 rows containing non-finite values (stat_density).

Density plot for Vowel/consonant ratio (classes)

# Density of the `vcr` column per class: dns_legit in sky blue, dns_dga in orange
ggplot(mapping = aes(x = vcr)) +
  geom_density(data = dns_legit, fill = "skyblue", alpha = 0.5) +
  geom_density(data = dns_dga, fill = "orange", alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 5282 rows containing non-finite values (stat_density).
## Warning: Removed 5966 rows containing non-finite values (stat_density).

# Convert the most recently drawn ggplot into an interactive plotly widget
ggplotly(p = last_plot())
## Warning: Removed 5282 rows containing non-finite values (stat_density).

## Warning: Removed 5966 rows containing non-finite values (stat_density).