library(dplyr)
library(readr)
library(ggplot2)

Results from evaluating new domains using cacic web service

CACIC service uses the default architecture from bitbucket repo

Dataset

The dataset contains 3465 domain names. These domains were obtained from three datasets sent by Vaclav:

  1. 1500 domain names labeled as DGA by the original NN
  2. 1295 domain names labeled as DGA and Normal by the original NN
  3. 1113 domain names labeled as DGA by the original NN and incorrectly classified as normal by Vaclav's NN implementation

We merged the datasets and removed duplicates. The final dataset can be found here.

Confusion matrix

# Predictions (`class`) are compared against the reference labels (`label`).
caret::confusionMatrix(
  data = as.factor(vaclav_results$class),
  reference = as.factor(vaclav_results$label)
)
Confusion Matrix and Statistics

          Reference
Prediction  dga normal
    dga    2654     15
    normal  470    326
                                         
               Accuracy : 0.86           
                 95% CI : (0.848, 0.8714)
    No Information Rate : 0.9016         
    P-Value [Acc > NIR] : 1              
                                         
                  Kappa : 0.5053         
 Mcnemar's Test P-Value : <2e-16         
                                         
            Sensitivity : 0.8496         
            Specificity : 0.9560         
         Pos Pred Value : 0.9944         
         Neg Pred Value : 0.4095         
             Prevalence : 0.9016         
         Detection Rate : 0.7659         
   Detection Prevalence : 0.7703         
      Balanced Accuracy : 0.9028         
                                         
       'Positive' Class : dga            
                                         

FALSE NEGATIVES: Only 470 domains labeled as DGA by the original NN service were not detected by the cacic web service.

Interactive 3D representation of the TRUE POSITIVE (Blue) and FALSE NEGATIVE (Orange) DOMAINS

# Interactive 3D scatter of the first three principal components, coloured by
# label; the hover text shows the predicted probability and the domain.
plotly::plot_ly(
  pca_data,
  type = "scatter3d",
  # Set explicitly: plotly otherwise falls back to "markers" and prints a
  # "No scatter3d mode specifed" message on every render.
  mode = "markers",
  x = ~PC1, y = ~PC2, z = ~PC3,
  color = ~label,
  colors = c("#BF382A", "#0C4B8E"),
  opacity = 0.5,
  marker = list(size = 2),
  text = ~paste(preds, " ", domain)
)
No scatter3d mode specifed:
  Setting the mode to markers
  Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
No scatter3d mode specifed:
  Setting the mode to markers
  Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

We can observe three clusters corresponding to the .hosting, .org and .feedback TLDs. Both TPs and FNs appear in all three clusters. The table below shows the percentage of FNs in each cluster. The org TLD is the cluster with the highest FN rate (23%), followed by hosting (18%) and feedback (3%).

#' Plot character-frequency histograms of FN vs TP domains for one TLD
#'
#' Reads the global data frames `fn_domains` and `tp_domains`, keeps the
#' domains that end in `.<tld_name>`, splits every kept domain into single
#' characters and draws the relative frequency of each character: false
#' negatives (black bars) on top, true positives (white bars) below.
#'
#' @param tld_name TLD without the leading dot, e.g. `"org"`. It is inserted
#'   into a regular expression, so an alternation such as `"org|net"` works.
#' @return A list with the two ggplot objects, `fnplot` and `tpplot`. The
#'   combined figure is drawn as a side effect.
char_dist <- function(tld_name) {
  # Anchor the match to the end of the name: an unanchored match on "org"
  # would also select domains that merely contain "org" somewhere.
  tld_pattern <- paste0("\\.(", tld_name, ")$")

  # All characters of all matching domains as one flat character vector
  # (character(0) when nothing matches).
  split_chars <- function(domains) {
    hits <- as.character(domains[grepl(tld_pattern, domains)])
    as.character(unlist(stringr::str_split(hits, ""), use.names = FALSE))
  }

  # One bar per character; bar height is that character's share of `chars`.
  freq_plot <- function(chars, fill) {
    ggplot(data.frame(charlist = chars), aes(x = charlist)) +
      geom_bar(
        col = "black", fill = fill,
        aes(y = (..count..) / sum(..count..))
      ) +
      scale_y_continuous(labels = scales::percent) +
      ylab("Percent") +
      xlab("") +
      theme_bw()
  }

  fnplot <- freq_plot(split_chars(fn_domains$domain), "black")
  tpplot <- freq_plot(split_chars(tp_domains$domain), "white")

  gridExtra::grid.arrange(fnplot, tpplot, ncol = 1)
  list(fnplot = fnplot, tpplot = tpplot)
}

Question: What is the difference between TP and FN in the org TLD?

The Character Frequency Histogram for TP (white) and FN (black)

Question: What is the difference between TP and FN in the hosting TLD?

The Character Frequency Histogram for TP (white) and FN (black)

# Character-frequency plots for the "hosting" TLD.
# NOTE(review): the result is stored in `org_plot` although it holds the hosting plots.
org_plot <- char_dist(tld_name = "hosting")

Question: What is the difference between TP and FN in the feedback TLD?

The Character Frequency Histogram for TP (white) and FN (black)

# Character-frequency plots for the "feedback" TLD.
# NOTE(review): the result is stored in `org_plot` although it holds the feedback plots.
org_plot <- char_dist(tld_name = "feedback")

CONCLUSIONS

No significant differences are observable between the FN and TP histograms for the three TLDs considered. A possible solution could be to include a portion of the undetected DGA domains in the dataset and retrain the model.

# Open TensorBoard on the "logs" directory. The call is namespaced so it does
# not depend on keras having been attached earlier in the session.
keras::tensorboard(log_dir = "logs")
---
title: "DGA NN Model Comparison"
output: 
  html_notebook: 
    code_folding: hide
---

```{r}

library(dplyr)
library(readr)
library(ggplot2)
```
```{r eval=FALSE, include=FALSE}

# Load the prediction files of the models compared in the next chunks.
vaclav_results <- read_csv("./test_result.csv")
# results.csv is read with col_names = FALSE, so its first row is treated as data.
cacic_results <- read_csv("./results.csv", col_names = FALSE)
old_backported_results <- read_csv("./preds_backported.csv")
cacic_python_results <- read_csv("./preds_cacic_python.csv")
cacic_python_results2 <- read_csv("./preds_cacic_python_v2.csv")

```

```{r eval=FALSE, fig.height=12, fig.width=4, include=FALSE}
# Put the scores of every model side by side, one row per domain.
# NOTE(review): cbind() pairs rows by position, so this assumes every
# prediction file lists the same domains in the same order — confirm.
results <- cbind(
  vaclav_results,
  cacic_results,
  old_backported_results$preds_original_backported,
  cacic_python_results$preds_cacic_python,
  cacic_python_results2$pred_cacic_model
)

names(results) <- c(
  "domain", "original", "new", "cacic",
  "backported", "cacic_python", "cacic_python2"
)

# Heat map of the predicted probability: one row per domain, one column per
# model. Only the raw scores are plotted, so no class columns are derived here.
probability_heatmap <- results %>%
  select(domain, original, cacic, new, backported, cacic_python) %>%
  reshape2::melt(id.vars = "domain") %>%
  ggplot() +
  geom_tile(
    aes(y = domain, x = variable, alpha = value, fill = value),
    color = "skyblue"
  ) +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  xlab("model") +
  ylab("") +
  labs(fill = "probability")

probability_heatmap

# Interactive version of the same plot; the plot object is passed explicitly
# instead of relying on whichever plot happened to be drawn last.
plotly::ggplotly(probability_heatmap)
```

```{r eval=FALSE, include=FALSE}
library(dplyr)

# Scores above this value are labelled "dga", everything else "normal".
dga_threshold <- 0.9
to_class <- function(score) ifelse(score > dga_threshold, "dga", "normal")

# Class labels for the models that are compared below.
results_class <- results %>%
  mutate(
    original_class = to_class(original),
    cacic_class = to_class(cacic),
    cacic_python_class = to_class(cacic_python),
    cacic_python_class2 = to_class(cacic_python2)
  ) %>%
  select(
    domain, original_class, cacic_python_class,
    cacic_class, cacic_python_class2
  )

results %>%
  select(domain, original, cacic_python2) %>%
  readr::write_csv("originalvscacic_results.csv")

# The original NN is the reference in the other confusion matrices of this
# notebook, so it is passed as `reference` here as well and the model under
# test as `data`. Levels are fixed so the call also works when one of the
# columns happens to contain a single class.
class_levels <- c("dga", "normal")
caret::confusionMatrix(
  data = factor(results_class$cacic_python_class2, levels = class_levels),
  reference = factor(results_class$original_class, levels = class_levels)
)


```
```{r eval=FALSE, include=FALSE}
# Label each domain from the `original` score (> 0.9 => "dga", otherwise
# "normal") and save the domain/label pairs.
# NOTE(review): the input is the "1500" result set but the output file is
# named "1295_data.csv" — confirm this is intended.
dataset_1500 <- read_csv("results_dataset_1500.csv")

dataset_1500 %>%
  mutate(label = ifelse(original > 0.9, "dga", "normal")) %>%
  select(domain, label) %>%
  readr::write_csv("1295_data.csv")
```

```{r eval=FALSE, include=FALSE}
dataset_1500

# Turn every score column into a class: > 0.9 => "dga", otherwise "normal".
dataset_1500_class <- dataset_1500 %>%
  mutate(
    original_class = ifelse(original > 0.9, "dga", "normal"),
    cacic_class = ifelse(cacic_servicio > 0.9, "dga", "normal"),
    cacic_python_vaclav_class = ifelse(cacic_python_vaclav > 0.9, "dga", "normal"),
    cacic_python_class = ifelse(cacic_python > 0.9, "dga", "normal")
  ) # %>% select(domain,original_class,cacic_python_vaclav_class,cacic_class,cacic_python_class)

# Factor levels are fixed explicitly so the matrix can be built even when a
# column contains a single class; no artificial "normal" row has to be planted
# in the data to make the reference factor two-level.
class_levels <- c("dga", "normal")
caret::confusionMatrix(
  data = factor(dataset_1500_class$cacic_class, levels = class_levels),
  reference = factor(dataset_1500_class$original_class, levels = class_levels)
)

```

```{r eval=FALSE, include=FALSE}
library(abind)

# Domains the original NN labelled as DGA, split by the cacic python verdict.
fn_domains <- dataset_1500_class %>%
  filter(original_class == "dga" & cacic_python_class == "normal") %>%
  select(domain, cacic_python, cacic_python_class)
tp_domains <- dataset_1500_class %>%
  filter(original_class == "dga" & cacic_python_class == "dga") %>%
  select(domain, cacic_python)

fn_domains$label <- rep("False Negative", nrow(fn_domains))
tp_domains$label <- rep("True Positive", nrow(tp_domains))

# NOTE(review): tokenize() is not defined in this file; the code below relies
# on it returning a list with `encode`, `domain` and `label` elements.
FN_tokenized <- tokenize(as.matrix(fn_domains$domain), fn_domains$label)
TP_tokenized <- tokenize(as.matrix(tp_domains$domain), tp_domains$label)

# Stack false negatives first, then true positives.
# NOTE(review): assumes tokenize() keeps the input row order, otherwise
# `preds` no longer lines up with `encode`/`domain` — confirm.
malware_results <- list()
malware_results$encode <- abind(FN_tokenized$encode, TP_tokenized$encode, along = 1)
malware_results$domain <- c(FN_tokenized$domain, TP_tokenized$domain)
malware_results$label <- c(
  as.character(FN_tokenized$label),
  as.character(TP_tokenized$label)
)
malware_results$preds <- c(fn_domains$cacic_python, tp_domains$cacic_python)

# PCA over the first 20 columns of the encoded domains.
pca <- prcomp(malware_results$encode[, 1:20], center = TRUE, scale. = TRUE)
pca_data <- data.frame(
  pca$x,
  label = malware_results$label,
  domain = malware_results$domain,
  preds = malware_results$preds
)

# Static 2D view (PC1 vs PC3) of a random sample of 500 points; stored only.
pca_plot <- ggplot(pca_data %>% sample_n(500), aes(x = PC1, y = PC3)) +
  geom_point(
    aes(color = label, text = domain, shape = as.factor(label)),
    alpha = 0.5
  ) +
  theme_bw()

# Interactive 3D view of the first three components.
plotly::plot_ly(
  pca_data,
  type = "scatter3d",
  # Set explicitly: plotly otherwise falls back to "markers" with a message.
  mode = "markers",
  x = ~PC1, y = ~PC2, z = ~PC3,
  color = ~label,
  colors = c("#BF382A", "#0C4B8E"),
  opacity = 0.5,
  marker = list(size = 3),
  text = ~paste(preds, " ", domain)
)
```



# Results from evaluating new domains using cacic web service
CACIC service uses the default architecture from bitbucket repo

* Service located at http://catanuso.duckdns.org:8000/predict?domain=www.google.com


## Dataset
The dataset contains 3465 domain names. These domains were obtained from three datasets sent by Vaclav:

1. 1500 domain names labeled as DGA by the original NN
2. 1295 domain names labeled as DGA and Normal by the original NN
3. 1113 domain names labeled as DGA by the original NN and incorrectly classified as normal by Vaclav's NN implementation

We merged the datasets and removed duplicates. The final dataset can be found [here](https://www.dropbox.com/s/5jyf0zix67ewoxm/vaclav_results_labeled.csv?dl=1).

```{r}

vaclav_dataset <- read_csv("vaclav.csv") # results from original NN service running at whalebone
vaclav_results <- read_csv("vaclav_results.csv") # results from querying cacic web service

# The labels are attached by row position. Stop early if the two files differ
# in length: cbind() would otherwise recycle a shorter label vector silently
# whenever its length divides the number of rows.
# NOTE(review): equal length does not prove equal row order — confirm both
# files list the same domains in the same order.
stopifnot(nrow(vaclav_results) == nrow(vaclav_dataset))

vaclav_results <- cbind(vaclav_results, label = vaclav_dataset$label) %>%
  select(domain, probability, class, label)
readr::write_csv(vaclav_results, "vaclav_results_labeled.csv")

# Number of domains per predicted class.
vaclav_results %>%
  group_by(class) %>%
  summarise(n = n())
```
## Confusion matrix
```{r}
# Web-service predictions (`class`) against the original NN labels (`label`).
caret::confusionMatrix(
  data = as.factor(vaclav_results$class),
  reference = as.factor(vaclav_results$label)
)
```

**FALSE NEGATIVES:** Only 470 domains labeled as DGA by the original NN service were not detected by the cacic web service.

### Interactive 3D representation of the TRUE POSITIVE (Blue) and FALSE NEGATIVE (Orange) DOMAINS

```{r fig.width=8}
library(abind)

# Domains the original NN labelled as DGA, split by the web-service verdict.
fn_domains <- vaclav_results %>% filter(class == "normal" & label == "dga")
tp_domains <- vaclav_results %>% filter(class == "dga" & label == "dga")

# From here on `label` holds the outcome name used in the plots.
fn_domains$label <- rep("False Negative", nrow(fn_domains))
tp_domains$label <- rep("True Positive", nrow(tp_domains))

# NOTE(review): tokenize() is not defined in this file; the code below relies
# on it returning a list with `encode`, `domain` and `label` elements.
FN_tokenized <- tokenize(as.matrix(fn_domains$domain), fn_domains$label)
TP_tokenized <- tokenize(as.matrix(tp_domains$domain), tp_domains$label)

# Stack false negatives first, then true positives.
# NOTE(review): assumes tokenize() keeps the input row order, otherwise
# `preds` no longer lines up with `encode`/`domain` — confirm.
malware_results <- list()
malware_results$encode <- abind(FN_tokenized$encode, TP_tokenized$encode, along = 1)
malware_results$domain <- c(FN_tokenized$domain, TP_tokenized$domain)
malware_results$label <- c(
  as.character(FN_tokenized$label),
  as.character(TP_tokenized$label)
)
malware_results$preds <- c(fn_domains$probability, tp_domains$probability)

# PCA over the first 20 columns of the encoded domains.
pca <- prcomp(malware_results$encode[, 1:20], center = TRUE, scale. = TRUE)
pca_data <- data.frame(
  pca$x,
  label = malware_results$label,
  domain = malware_results$domain,
  preds = malware_results$preds
)

# Static 2D view (PC1 vs PC3) of a random sample of 500 points; stored only.
pca_plot <- ggplot(pca_data %>% sample_n(500), aes(x = PC1, y = PC3)) +
  geom_point(
    aes(color = label, text = domain, shape = as.factor(label)),
    alpha = 0.5
  ) +
  theme_bw()

# Interactive 3D view of the first three components.
plotly::plot_ly(
  pca_data,
  type = "scatter3d",
  # Set explicitly: plotly otherwise falls back to "markers" with a message.
  mode = "markers",
  x = ~PC1, y = ~PC2, z = ~PC3,
  color = ~label,
  colors = c("#BF382A", "#0C4B8E"),
  opacity = 0.5,
  marker = list(size = 2),
  text = ~paste(preds, " ", domain)
)
```

We can observe three clusters corresponding to the **.hosting**, **.org** and **.feedback** TLDs.
Both TPs and FNs appear in all three clusters. The table below shows the percentage of FNs in each cluster.
The **org** TLD is the cluster with the highest FN rate (23%), followed by **hosting** (18%) and **feedback** (3%).

```{r}
# Share of false negatives within each TLD cluster.
# The TLD is the text after the last dot, so a domain only counts as "org"
# when it ends in ".org", not when "org" merely appears inside the name.
tlds_of_interest <- c("org", "feedback", "hosting")

data.frame(domain = malware_results$domain, label = malware_results$label) %>%
  mutate(
    tld = sub("^.*\\.", "", as.character(domain)),
    tld = ifelse(tld %in% tlds_of_interest, tld, "other")
  ) %>%
  group_by(tld, label) %>%
  summarise(total = n()) %>%
  # summarise() drops the `label` grouping, so sum(total) is the TLD's total.
  mutate(percent = total / sum(total)) %>%
  filter(label == "False Negative") %>%
  select(tld, percent)
```




```{r}
#' Plot character-frequency histograms of FN vs TP domains for one TLD
#'
#' Reads the global data frames `fn_domains` and `tp_domains`, keeps the
#' domains that end in `.<tld_name>`, splits every kept domain into single
#' characters and draws the relative frequency of each character: false
#' negatives (black bars) on top, true positives (white bars) below.
#'
#' @param tld_name TLD without the leading dot, e.g. `"org"`. It is inserted
#'   into a regular expression, so an alternation such as `"org|net"` works.
#' @return A list with the two ggplot objects, `fnplot` and `tpplot`. The
#'   combined figure is drawn as a side effect.
char_dist <- function(tld_name) {
  # Anchor the match to the end of the name: an unanchored match on "org"
  # would also select domains that merely contain "org" somewhere.
  tld_pattern <- paste0("\\.(", tld_name, ")$")

  # All characters of all matching domains as one flat character vector
  # (character(0) when nothing matches).
  split_chars <- function(domains) {
    hits <- as.character(domains[grepl(tld_pattern, domains)])
    as.character(unlist(stringr::str_split(hits, ""), use.names = FALSE))
  }

  # One bar per character; bar height is that character's share of `chars`.
  freq_plot <- function(chars, fill) {
    ggplot(data.frame(charlist = chars), aes(x = charlist)) +
      geom_bar(
        col = "black", fill = fill,
        aes(y = (..count..) / sum(..count..))
      ) +
      scale_y_continuous(labels = scales::percent) +
      ylab("Percent") +
      xlab("") +
      theme_bw()
  }

  fnplot <- freq_plot(split_chars(fn_domains$domain), "black")
  tpplot <- freq_plot(split_chars(tp_domains$domain), "white")

  gridExtra::grid.arrange(fnplot, tpplot, ncol = 1)
  list(fnplot = fnplot, tpplot = tpplot)
}

```
## Question: What is the difference between TP and FN in the **org** TLD?
The Character Frequency Histogram for TP (white) and FN (black)
```{r}
# Character-frequency plots for the "org" TLD.
org_plot <- char_dist(tld_name = "org")
```

## Question: What is the difference between TP and FN in the **hosting** TLD?
The Character Frequency Histogram for TP (white) and FN (black)
```{r}
# Character-frequency plots for the "hosting" TLD.
# NOTE(review): the result is stored in `org_plot` although it holds the hosting plots.
org_plot <- char_dist(tld_name = "hosting")
```


## Question: What is the difference between TP and FN in the **feedback** TLD?
The Character Frequency Histogram for TP (white) and FN (black)

```{r}
# Character-frequency plots for the "feedback" TLD.
# NOTE(review): the result is stored in `org_plot` although it holds the feedback plots.
org_plot <- char_dist(tld_name = "feedback")
```

# CONCLUSIONS

No significant differences are observable between the FN and TP histograms for the three TLDs considered.
A possible solution could be to include a portion of the undetected DGA domains in the dataset and retrain the model.



```{r eval=FALSE, include=FALSE}
library(stringr)

# Fraction of digit characters in the part of each domain before the first dot.
digit_ratio <- function(domains) {
  first_label <- str_split(domains, "\\.", simplify = TRUE)[, 1]
  str_count(first_label, "[0-9]") / str_length(first_label)
}

fn_numfreq <- data.frame(
  domain = fn_domains$domain,
  numfreq = digit_ratio(fn_domains$domain),
  type = "fn",
  preds = fn_domains$probability
)
tp_numfreq <- data.frame(
  domain = tp_domains$domain,
  numfreq = digit_ratio(tp_domains$domain),
  type = "tp",
  preds = tp_domains$probability
)

# Digit-ratio histograms for the .org domains, one panel per outcome.
# The match is anchored to the end of the name so that only the TLD counts.
rbind(fn_numfreq, tp_numfreq) %>%
  filter(grepl("\\.org$", domain)) %>%
  ggplot() +
  # bins = 30 is the geom_histogram default, stated to avoid the bin message.
  geom_histogram(
    aes(x = numfreq),
    bins = 30, alpha = 0.5, fill = "skyblue", color = "blue"
  ) +
  facet_wrap(~type) +
  theme_bw()

# False negatives ordered from the lowest to the highest digit ratio.
fn_numfreq %>% arrange(numfreq)

```

```{r eval=FALSE, include=FALSE}
# Print the `encode` element of the tokenized false negatives for inspection.
FN_tokenized$encode
```

```{r eval=FALSE, include=FALSE}
library(keras)
malware_results$encode %>% nrow()

# Binary classifier on the encoded domains: embedding -> flatten -> a single
# sigmoid unit. The target is 1 for false negatives and 0 otherwise.
input_shape <- dim(malware_results$encode)[2]
inputs <- layer_input(shape = input_shape)
# Despite its name, `embeding` holds the model's output tensor (the sigmoid
# layer), not the embedding layer itself.
embeding <- inputs %>%
  layer_embedding(input_dim = 40, output_dim = 100, input_length = input_shape) %>%
  layer_flatten() %>%
  layer_dense(units = 1, activation = "sigmoid")

model <- keras_model(inputs = inputs, outputs = embeding)
model %>% compile(
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = c("acc")
)

model %>% fit(
  x = malware_results$encode,
  y = ifelse(grepl("False Negative", malware_results$label), 1, 0),
  batch_size = 32,
  verbose = 2,
  # Spelled out in full instead of relying on partial matching of `ep`.
  epochs = 10,
  # Training logs, including the embeddings every epoch, go to logs/run_a.
  callbacks = callback_tensorboard(
    "logs/run_a",
    histogram_freq = 0,
    embeddings_freq = 1
  )
)
```
```{r}
# Open TensorBoard on the "logs" directory. The call is namespaced because
# keras is only attached in a chunk that is not evaluated.
# NOTE(review): the chunk that writes "logs/run_a" is eval=FALSE, so the
# directory only exists if that chunk was run by hand — confirm.
keras::tensorboard(log_dir = "logs")
```

