library(readr)
library(dplyr)
library(broom)
library(stringr)
library(ggplot2)

IMBALANCE STRATEGIES

We tried several approaches for dealing with imbalanced classes. We used the Woodbridge (2016) LSTM network to evaluate the different imbalance-handling approaches.

10% SAMPLE

The following sections present the results of the different imbalance techniques on 10% samples of the CTU19A dataset.

Downsampling

# Locate the per-run CSVs of the downsampling experiment (epochs=60, maxlen=200).
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-downsampling-epochs=60-endgame-maxlen=200-\\d+"
)
# Load each run and record the maxlen value encoded in its file name
# (the digits that follow an "=" and precede the trailing "-<digits>.csv").
results_lstm_downsample <- lapply(files, function(csv_name) {
  run_metrics <- read_csv(paste0("../results/", csv_name), col_types = cols())
  maxlen_value <- as.integer(
    str_replace(string = csv_name, pattern = ".*=([0-9]+)-[0-9]+.csv", "\\1")
  )
  tibble::add_column(run_metrics, maxlen = maxlen_value)
})
# Stack all runs into one table.
results_lstm_downsample <- do.call(rbind, results_lstm_downsample)

Upsampling

# Locate the per-run CSVs of the upsampling experiment (epochs=60, maxlen=200).
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-upsampling-epochs=60-endgame-maxlen=200-\\d+"
)
# Load each run and attach the maxlen value parsed from its file name.
results_lstm_upsample <- lapply(files, function(csv_name) {
  run_metrics <- read_csv(paste0("../results/", csv_name), col_types = cols())
  maxlen_value <- as.integer(
    str_replace(string = csv_name, pattern = ".*=([0-9]+)-[0-9]+.csv", "\\1")
  )
  tibble::add_column(run_metrics, maxlen = maxlen_value)
})
# Stack all runs into one table.
results_lstm_upsample <- do.call(rbind, results_lstm_upsample)

Augmenting data (shift sequence approach)

Only normal data

# Locate the per-run CSVs of the augmentation experiment (epochs=60, maxlen=200).
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-augmenting-epochs=60-endgame-maxlen=200-\\d+"
)
# Load each run and attach the maxlen value parsed from its file name.
results_lstm_augment <- lapply(files, function(csv_name) {
  run_metrics <- read_csv(paste0("../results/", csv_name), col_types = cols())
  maxlen_value <- as.integer(
    str_replace(string = csv_name, pattern = ".*=([0-9]+)-[0-9]+.csv", "\\1")
  )
  tibble::add_column(run_metrics, maxlen = maxlen_value)
})
# Stack all runs into one table.
results_lstm_augment <- do.call(rbind, results_lstm_augment)

Normal and Botnet

# Locate the per-run CSVs of the "augmenting-botnet" experiment (epochs=60, maxlen=200).
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-augmenting-botnet-epochs=60-endgame-maxlen=200-\\d+"
)
# Load each run and attach the maxlen value parsed from its file name.
results_lstm_augment4x <- lapply(files, function(csv_name) {
  run_metrics <- read_csv(paste0("../results/", csv_name), col_types = cols())
  maxlen_value <- as.integer(
    str_replace(string = csv_name, pattern = ".*=([0-9]+)-[0-9]+.csv", "\\1")
  )
  tibble::add_column(run_metrics, maxlen = maxlen_value)
})
# Stack all runs into one table.
results_lstm_augment4x <- do.call(rbind, results_lstm_augment4x)
# Number of result files that were found.
length(files)
[1] 30

Comparison

# Tag each 10%-sample result set with the strategy that produced it, then stack them.
results <- rbind(
  tibble::add_column(results_lstm_upsample, imbalance = "upsample"),
  tibble::add_column(results_lstm_downsample, imbalance = "downsample"),
  tibble::add_column(results_lstm_augment, imbalance = "aug-norm"),
  tibble::add_column(results_lstm_augment4x, imbalance = "aug-norm-bot")
)
# One box per strategy, one facet per metric, restricted to four metrics.
results %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity")) %>%
  ggplot() +
  geom_boxplot(
    aes(x = as.factor(imbalance), y = value, fill = as.factor(imbalance)),
    color = "gray"
  ) +
  facet_wrap(~metric, scales = "fixed") +
  labs(
    title = "Imbalance Strategies [30 executions]",
    subtitle = "LSTM arch. according to Woodbridge et al. (2016):\nmaxlen 200, epochs 60",
    caption = "Original sample contains:  ~1800 botnets and ~180 normal data points.\n
       aug-norm and aug-norm-bot use a sliding window over sequence for generate new samples. "
  ) +
  xlab("Strategy") +
  theme_bw() +
  ggdark::dark_theme_gray() +
  # Tilt the strategy labels and hide the legend (fill duplicates the x axis).
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

20% SAMPLE

The following sections present the results of the different imbalance techniques on 20% samples of the CTU19A dataset. The 20% sample size was chosen based on a careful analysis of the differences between the population and sample distributions.

Augmenting 20% botnet 2x

We apply the shift sequence data-augmentation approach for generating 2x botnet and then generate the same amount of normal data.

# Locate the per-run CSVs of the 20%-sample "augment-botnetx2" experiment.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-sample20-augment-botnetx2-epochs=60-endgame-maxlen=200-\\d+"
)
# Load each run and attach the maxlen value parsed from its file name.
results_lstm_augment_sample20x2 <- lapply(files, function(csv_name) {
  run_metrics <- read_csv(paste0("../results/", csv_name), col_types = cols())
  maxlen_value <- as.integer(
    str_replace(string = csv_name, pattern = ".*=([0-9]+)-[0-9]+.csv", "\\1")
  )
  tibble::add_column(run_metrics, maxlen = maxlen_value)
})
# Stack all runs into one table.
results_lstm_augment_sample20x2 <- do.call(rbind, results_lstm_augment_sample20x2)
# Number of result files that were found.
length(files)
[1] 30

Augmenting 20% botnet 1x

We apply the shift sequence data-augmentation approach for generating 1x botnet and then generate the same amount of normal data.

# Locate the per-run CSVs of the 20%-sample "augment-botnet" experiment.
# NOTE(review): the section heading says "1x" while this object is named
# "...x3" and the file pattern carries no multiplier -- confirm which is right.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-sample20-augment-botnet-epochs=60-endgame-maxlen=200-\\d+"
)
# Load each run and attach the maxlen value parsed from its file name.
results_lstm_augment_sample20x3 <- lapply(files, function(csv_name) {
  run_metrics <- read_csv(paste0("../results/", csv_name), col_types = cols())
  maxlen_value <- as.integer(
    str_replace(string = csv_name, pattern = ".*=([0-9]+)-[0-9]+.csv", "\\1")
  )
  tibble::add_column(run_metrics, maxlen = maxlen_value)
})
# Stack all runs into one table.
results_lstm_augment_sample20x3 <- do.call(rbind, results_lstm_augment_sample20x3)
# Number of result files that were found.
length(files)
[1] 30

Augmenting 20% botnet 5x

We apply the shift sequence data-augmentation approach for generating 5x botnet and then generate the same amount of normal data.

# Locate the per-run CSVs of the 20%-sample "augment-botnetx5" experiment.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-sample20-augment-botnetx5-epochs=60-endgame-maxlen=200-\\d+"
)
# Load each run and attach the maxlen value parsed from its file name.
results_lstm_augment_sample20x5 <- lapply(files, function(csv_name) {
  run_metrics <- read_csv(paste0("../results/", csv_name), col_types = cols())
  maxlen_value <- as.integer(
    str_replace(string = csv_name, pattern = ".*=([0-9]+)-[0-9]+.csv", "\\1")
  )
  tibble::add_column(run_metrics, maxlen = maxlen_value)
})
# Stack all runs into one table.
results_lstm_augment_sample20x5 <- do.call(rbind, results_lstm_augment_sample20x5)
# Number of result files that were found.
length(files)
[1] 30

Downsampling 20%

# Locate the per-run CSVs of the 20%-sample downsampling experiment.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-sample20-downsample-botnet-epochs=60-endgame-maxlen=200-\\d+"
)
# Load each run and attach the maxlen value parsed from its file name.
results_lstm_downsample_sample20 <- lapply(files, function(csv_name) {
  run_metrics <- read_csv(paste0("../results/", csv_name), col_types = cols())
  maxlen_value <- as.integer(
    str_replace(string = csv_name, pattern = ".*=([0-9]+)-[0-9]+.csv", "\\1")
  )
  tibble::add_column(run_metrics, maxlen = maxlen_value)
})
# Stack all runs into one table.
results_lstm_downsample_sample20 <- do.call(rbind, results_lstm_downsample_sample20)
# Number of result files that were found.
length(files)
[1] 30

Upsampling 20%

# Locate the per-run CSVs of the 20%-sample upsampling experiment.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-sample20-upsample-botnet-epochs=60-endgame-maxlen=200-\\d+"
)
# Load each run and attach the maxlen value parsed from its file name.
results_lstm_upsample_sample20 <- lapply(files, function(csv_name) {
  run_metrics <- read_csv(paste0("../results/", csv_name), col_types = cols())
  maxlen_value <- as.integer(
    str_replace(string = csv_name, pattern = ".*=([0-9]+)-[0-9]+.csv", "\\1")
  )
  tibble::add_column(run_metrics, maxlen = maxlen_value)
})
# Stack all runs into one table.
results_lstm_upsample_sample20 <- do.call(rbind, results_lstm_upsample_sample20)
# Number of result files that were found.
length(files)
[1] 30

Augmenting 20% botnet 2x from datasets

All previous data-augmentation approaches were applied to the tokenized (keras) version of the sequences. This approach is applied directly to the dataset. THIS WAS THE APPROACH USED FOR GENERATING THE AUGMENTED train/test sets for MCCV and the final CTU19A-augmented dataset.

# Locate the per-run CSVs of the 20%-sample "augment-dataset-2x" experiment.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-sample20-augment-dataset-2x-epochs=60-endgame-maxlen=200-\\d+"
)
# Load each run and attach the maxlen value parsed from its file name.
results_lstm_augment_sample20x2_dataset <- lapply(files, function(csv_name) {
  run_metrics <- read_csv(paste0("../results/", csv_name), col_types = cols())
  maxlen_value <- as.integer(
    str_replace(string = csv_name, pattern = ".*=([0-9]+)-[0-9]+.csv", "\\1")
  )
  tibble::add_column(run_metrics, maxlen = maxlen_value)
})
# Stack all runs into one table.
results_lstm_augment_sample20x2_dataset <- do.call(
  rbind,
  results_lstm_augment_sample20x2_dataset
)
# Number of result files that were found.
length(files)
[1] 30
# Tag the 20%-sample result sets with their strategy label and stack them.
# The commented-out entries are alternative result sets that are currently
# excluded from the comparison.
results <- rbind(
  tibble::add_column(results_lstm_upsample_sample20, imbalance = "upsample-20"),
  tibble::add_column(results_lstm_downsample_sample20, imbalance = "downsample-20"),
  # results_lstm_augment_sample20x2 %>% tibble::add_column(imbalance="aug-norm-bot-20x2"),
  # results_lstm_augment_sample20x3 %>% tibble::add_column(imbalance="aug-norm-bot-20x3"),
  # results_lstm_augment_sample20x5 %>% tibble::add_column(imbalance="aug-norm-bot-20x5"),
  tibble::add_column(
    results_lstm_augment_sample20x2_dataset,
    imbalance = "aug-norm-bot-20x2-dat"
  )
  # results_lstm_upsample %>% tibble::add_column(imbalance="upsample-10"),
  # results_lstm_downsample %>% tibble::add_column(imbalance="downsample-10"),
  # results_lstm_augment %>% tibble::add_column(imbalance="aug-norm-bot-10")
)
# One box per strategy, one facet per metric, restricted to four metrics.
results %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity")) %>%
  ggplot() +
  geom_boxplot(
    aes(x = as.factor(imbalance), y = value, fill = as.factor(imbalance)),
    color = "gray"
  ) +
  facet_wrap(~metric, scales = "fixed") +
  labs(
    title = "Imbalance Strategies [30 executions]",
    subtitle = "LSTM arch. according to Woodbridge et al. (2016):\nmaxlen 200, epochs 60",
    caption = "Original sample contains:  ~3600 botnets and ~400 normal data points [20% of pop.]\n
       aug-norm-bot use a sliding window over a random sequence for generate new samples.\n
       x2 refers to different number of augmented samples.\n
       No significative differences observed between upsample and aug-norm-botx2
       "
  ) +
  xlab("Strategy") +
  theme_bw() +
  ggdark::dark_theme_gray() +
  # Tilt the strategy labels and hide the legend (fill duplicates the x axis).
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

NORMAL DISTRIBUTION TESTS of the MCCV results

# Keep only the "Balanced Accuracy" values of each 20%-sample result set
# (each object is a one-column tibble named `value`).
balanced_accuracy_augment20x3 <- results_lstm_augment_sample20x3 %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value)
balanced_accuracy_augment20x5 <- results_lstm_augment_sample20x5 %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value)
balanced_accuracy_augment20x2 <- results_lstm_augment_sample20x2 %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value)
balanced_accuracy_upsample20 <- results_lstm_upsample_sample20 %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value)
balanced_accuracy_downsample20 <- results_lstm_downsample_sample20 %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value)
balanced_accuracy_augment20x2_dataset <- results_lstm_augment_sample20x2_dataset %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value)
# Histogram of the balanced accuracy obtained with the dataset-level augmentation.
ggplot(balanced_accuracy_augment20x2_dataset) +
  geom_histogram(aes(x = value), fill = "skyblue", color = "black") +
  ggdark::dark_theme_bw()

QQplots

CI

# 95% normal-approximation confidence interval for the mean balanced accuracy
# of the dataset-level augmentation runs.
# The sample size must come from nrow(): length() of a data frame/tibble is
# its number of columns (1 here), which would inflate the interval width.
n <- nrow(balanced_accuracy_augment20x2_dataset)
mu <- mean(balanced_accuracy_augment20x2_dataset %>% unlist())
s <- sd(balanced_accuracy_augment20x2_dataset %>% unlist())
error <- qnorm(0.975) * s / sqrt(n)
# Upper bound first, then lower bound.
c(mu + error, mu - error)
[1] 0.9541808 0.8885488

Wilcox Mann Whitney test

# Two-sample Wilcoxon rank-sum (Mann-Whitney) test comparing the balanced
# accuracy of the "x3" and "x2" augmentation result sets. The argument
# expressions are kept verbatim because they appear in the printed "data:" line.
wilcox.test(balanced_accuracy_augment20x3 %>% unlist(),
            balanced_accuracy_augment20x2 %>% unlist()
            )

    Wilcoxon rank sum test

data:  balanced_accuracy_augment20x3 %>% unlist() and balanced_accuracy_augment20x2 %>% unlist()
W = 415, p-value = 0.6125
alternative hypothesis: true location shift is not equal to 0
# Gather the balanced-accuracy values of three strategies side by side and
# melt them to long format (columns `variable` = strategy, `value` = score).
# The first column was previously misspelled "downsmaple", which ended up as
# a group label in `variable`.
augment_results <- data.frame(
  downsample = balanced_accuracy_downsample20 %>% unlist(),
  augment20x2_dataset = balanced_accuracy_augment20x2_dataset %>% unlist(),
  upsample = balanced_accuracy_upsample20 %>% unlist()
) %>%
  reshape2::melt()
No id variables; using all as measure variables
# One-way test of balanced accuracy across strategies, returned as a tidy table.
broom::tidy(oneway.test(value ~ variable, data = augment_results))
Multiple parameters; naming those columns num.df, den.df

MODEL SELECTION (MCCV)

LSTM

Batch 1024

# Locate the LSTM tuning result files (epochs=15, maxlen=1000); the file name
# encodes the sample id, lstm_size, embedingdim and dropout of each run.
files <- list.files(path = "../results/",pattern="results_test_lstm-endgame-augmented-ctu19-mccv-epochs=15-endgame-maxlen=1000-\\d+-lstm_size=\\d+-embedingdim=\\d+-dropout=[0-9.]+.csv")
# Load each run and attach the hyper-parameters parsed from its file name.
results_lstm_tune <- lapply(files, function(x) {
  read_csv(paste0("../results/", x), col_types = cols()) %>%
    tibble::add_column(
      lstm_size = as.integer(str_replace(string = x, pattern = ".*-lstm_size=([0-9]+)-.*.csv", "\\1")),
      embedingdim = as.integer(str_replace(string = x, pattern = ".*-embedingdim=([0-9]+)-.*.csv", "\\1")),
      # Accept any number of decimals and match the ".csv" suffix literally;
      # the previous pattern only parsed dropouts with exactly one decimal digit.
      dropout = as.numeric(str_replace(string = x, pattern = ".*-dropout=([0-9.]+)\\.csv", "\\1")),
      sample = as.factor(str_replace(string = x, pattern = ".*=1000-(\\d+)-.*.csv", "\\1"))
    )
})
results_lstm_tune <- do.call(rbind, results_lstm_tune)
# Rows per sample, as a quick completeness check.
results_lstm_tune %>% group_by(sample) %>% summarise(n = n())
# Build the "<lstm_size>_<embedingdim>_<dropout>" label. Columns are listed
# explicitly: chaining `a:b:c` made R warn "numerical expression has 2
# elements: only the first used".
results_lstm_tune <- results_lstm_tune %>%
  tidyr::unite("parameters", lstm_size, embedingdim, dropout) %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity"))
numerical expression has 2 elements: only the first used
# Box plot of every metric per hyper-parameter combination.
ggplot(results_lstm_tune) +
  geom_boxplot(aes(y = value, x = parameters, fill = parameters), color = "gray") +
  facet_wrap(~metric) +
  labs(
    title = "Parameters Tuning: LSTM Woodbridge (2016)",
    subtitle = "Parameteres: <lstm_size>_<embeding_size>_<dropout>"
  ) +
  xlab("Parameters") +
  ggdark::dark_theme_gray() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Mean and standard deviation of balanced accuracy per combination, best first;
# the factor levels follow that order so the plot keeps the ranking.
results_lstm_tune_ordered <- results_lstm_tune %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value, parameters) %>%
  group_by(parameters) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  arrange(desc(mean)) %>%
  mutate(parameters = factor(parameters, levels = unique(parameters)))
# Point = mean, error bar = mean +/- one standard deviation.
lstm_sd_plot <- ggplot(results_lstm_tune_ordered, aes(x = parameters, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, color = "yellow") +
  geom_point(color = "red") +
  ylab("Mean Balanced Accuracy") +
  ylim(0.80, 0.95) +
  labs(
    title = "Parameters Tuning: LSTM Woodbridge (2016)",
    subtitle = "Balanced Accuracy: Mean and Standard Deviation\nParameteres: <lstm_size>_<embeding_size>_<dropout>",
    caption = "batch_size=1024"
  ) +
  ggdark::dark_theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

lstm_sd_plot

Batch 256

# Locate the LSTM tuning result files for batch=256 (epochs=15, maxlen=1000).
files <- list.files(path = "../results/",
                    pattern="results_test_lstm-endgame-augmented-ctu19-mccv-epochs=15-endgame-batch=256-maxlen=1000-\\d+-lstm_size=\\d+-embedingdim=\\d+-dropout=[0-9.]+.csv")
# Load each run and attach the hyper-parameters parsed from its file name.
results_lstm_256_tune <- lapply(files, function(x) {
  read_csv(paste0("../results/", x), col_types = cols()) %>%
    tibble::add_column(
      lstm_size = as.integer(str_replace(string = x, pattern = ".*-lstm_size=([0-9]+)-.*.csv", "\\1")),
      embedingdim = as.integer(str_replace(string = x, pattern = ".*-embedingdim=([0-9]+)-.*.csv", "\\1")),
      # Accept any number of decimals and match the ".csv" suffix literally;
      # the previous pattern only parsed dropouts with exactly one decimal digit.
      dropout = as.numeric(str_replace(string = x, pattern = ".*-dropout=([0-9.]+)\\.csv", "\\1")),
      sample = as.factor(str_replace(string = x, pattern = ".*=1000-(\\d+)-.*.csv", "\\1"))
    )
})
results_lstm_256_tune <- do.call(rbind, results_lstm_256_tune)
# Rows per sample, as a quick completeness check.
results_lstm_256_tune %>% group_by(sample) %>% summarise(n = n())
# Build the "<lstm_size>_<embedingdim>_<dropout>" label. Columns are listed
# explicitly: chaining `a:b:c` made R warn "numerical expression has 2
# elements: only the first used".
results_lstm_256_tune <- results_lstm_256_tune %>%
  tidyr::unite("parameters", lstm_size, embedingdim, dropout) %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity"))
numerical expression has 2 elements: only the first used
# Mean and standard deviation of balanced accuracy per combination, best first;
# the factor levels follow that order so the plot keeps the ranking.
results_lstm_256_tune_ordered <- results_lstm_256_tune %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value, parameters) %>%
  group_by(parameters) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  arrange(desc(mean)) %>%
  mutate(parameters = factor(parameters, levels = unique(parameters)))
# Point = mean, error bar = mean +/- one standard deviation.
lstm_256_sd_plot <- ggplot(results_lstm_256_tune_ordered, aes(x = parameters, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, color = "yellow") +
  geom_point(color = "red") +
  ylab("Mean Balanced Accuracy") +
  ylim(0.80, 0.95) +
  labs(
    title = "Parameters Tuning: LSTM Woodbridge (2016)",
    subtitle = "Balanced Accuracy: Mean and Standard Deviation\nParameteres: <lstm_size>_<embeding_size>_<dropout>",
    caption = "batch_size=256"
  ) +
  ggdark::dark_theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

lstm_256_sd_plot

CNN1D

Batch 1024

# Locate the CNN1D tuning result files for batch=1024 (epochs=15, maxlen=1000);
# the file name encodes sample id, nb_filter, kernel_size, embedingdim and
# hidden_size of each run.
files <- list.files(path = "../results/",pattern="results_test_cnn1d-cacic-augmented-ctu19-mccv-epochs=15-endgame-batch=1024-maxlen=1000-\\d+-nb_filter=\\d+-kernel_size=\\d+-embedingdim=\\d+-hidden_size=\\d+.csv")
# Load each run and attach the hyper-parameters parsed from its file name.
results_cnn1d_tune <- lapply(files, function(x) {
  read_csv(paste0("../results/", x), col_types = cols()) %>%
    tibble::add_column(
      nb_filter = as.integer(str_replace(string = x, pattern = ".*-nb_filter=([0-9]+)-.*.csv", "\\1")),
      embedingdim = as.integer(str_replace(string = x, pattern = ".*-embedingdim=([0-9]+)-.*.csv", "\\1")),
      kernel_size = as.numeric(str_replace(string = x, pattern = ".*-kernel_size=([0-9]+)-.*.csv", "\\1")),
      hidden_size = as.numeric(str_replace(string = x, pattern = ".*-hidden_size=([0-9]+).csv", "\\1")),
      sample = as.factor(str_replace(string = x, pattern = ".*=1000-(\\d+)-.*.csv", "\\1"))
    )
})
results_cnn1d_tune <- do.call(rbind, results_cnn1d_tune)
# Rows per sample, as a quick completeness check.
results_cnn1d_tune %>% group_by(sample) %>% summarise(n = n())
# Build the "<nb_filter>_<embedingdim>_<kernel_size>_<hidden_size>" label.
# Columns are listed explicitly: chaining `a:b:c:d` made R warn "numerical
# expression has N elements: only the first used".
results_cnn1d_tune <- results_cnn1d_tune %>%
  tidyr::unite("parameters", nb_filter, embedingdim, kernel_size, hidden_size) %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity"))
numerical expression has 2 elements: only the first usednumerical expression has 3 elements: only the first used
# Rows per sample after restricting to the four metrics of interest.
results_cnn1d_tune %>% count(sample)
# Box plot of every metric per hyper-parameter combination.
ggplot(results_cnn1d_tune) +
  geom_boxplot(aes(y = value, x = parameters, fill = parameters), color = "gray") +
  facet_wrap(~metric) +
  labs(
    title = "Parameters Tuning: CNN1D Catania (2018)",
    subtitle = "Parameteres: <nb_filter>_<embeding_size>_<kernel_size>_<hidden_size>",
    caption = "batch_size=1024"
  ) +
  xlab("Parameters") +
  ggdark::dark_theme_gray() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Mean and standard deviation of balanced accuracy per combination, best first;
# the factor levels follow that order so the plot keeps the ranking.
results_cnn1d_tune_ordered <- results_cnn1d_tune %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value, parameters) %>%
  group_by(parameters) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  arrange(desc(mean)) %>%
  mutate(parameters = factor(parameters, levels = unique(parameters)))
# Point = mean, error bar = mean +/- one standard deviation.
cnn1d_sd_plot <- ggplot(results_cnn1d_tune_ordered, aes(x = parameters, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, color = "yellow") +
  geom_point(color = "red") +
  ylab("Mean Balanced Accuracy") +
  ylim(0.80, 0.95) +
  labs(
    title = "Parameters Tuning: CNN1D Catania (2018)",
    subtitle = "Balanced Accuracy: Mean and Standard Deviation\nParameteres: <nb_filter>_<embeding_size>_<kernel_size>_<hidden_size>",
    caption = "batch_size=1024"
  ) +
  ggdark::dark_theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none",
    axis.text = element_text(size = 8)
  )
cnn1d_sd_plot

# LSTM and CNN1D summaries side by side.
gridExtra::grid.arrange(grobs = list(lstm_sd_plot, cnn1d_sd_plot), ncol = 2)

Batch 256

# Locate the CNN1D tuning result files used for the batch-256 section.
# NOTE(review): unlike the batch-1024 CNN1D pattern, this one carries no
# "batch=" token -- confirm these files really are the batch-256 runs.
files <- list.files(path = "../results/",pattern="results_test_cnn1d-cacic-augmented-ctu19-mccv-epochs=15-endgame-maxlen=1000-\\d+-nb_filter=\\d+-kernel_size=\\d+-embedingdim=\\d+-hidden_size=\\d+.csv")
# Load each run and attach the hyper-parameters parsed from its file name.
results_cnn1d_256_tune <- lapply(files, function(x) {
  read_csv(paste0("../results/", x), col_types = cols()) %>%
    tibble::add_column(
      nb_filter = as.integer(str_replace(string = x, pattern = ".*-nb_filter=([0-9]+)-.*.csv", "\\1")),
      embedingdim = as.integer(str_replace(string = x, pattern = ".*-embedingdim=([0-9]+)-.*.csv", "\\1")),
      kernel_size = as.numeric(str_replace(string = x, pattern = ".*-kernel_size=([0-9]+)-.*.csv", "\\1")),
      hidden_size = as.numeric(str_replace(string = x, pattern = ".*-hidden_size=([0-9]+).csv", "\\1")),
      sample = as.factor(str_replace(string = x, pattern = ".*=1000-(\\d+)-.*.csv", "\\1"))
    )
})
results_cnn1d_256_tune <- do.call(rbind, results_cnn1d_256_tune)
# Rows per sample, as a quick completeness check.
results_cnn1d_256_tune %>% group_by(sample) %>% summarise(n = n())
# Build the "<nb_filter>_<embedingdim>_<kernel_size>_<hidden_size>" label.
# Columns are listed explicitly: chaining `a:b:c:d` made R warn "numerical
# expression has N elements: only the first used".
results_cnn1d_256_tune <- results_cnn1d_256_tune %>%
  tidyr::unite("parameters", nb_filter, embedingdim, kernel_size, hidden_size) %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity"))
numerical expression has 2 elements: only the first usednumerical expression has 3 elements: only the first used
# Rows per sample after restricting to the four metrics of interest.
results_cnn1d_256_tune %>% count(sample)
# Mean and standard deviation of balanced accuracy per combination, best first;
# the factor levels follow that order so the plot keeps the ranking.
results_cnn1d_256_tune_ordered <- results_cnn1d_256_tune %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value, parameters) %>%
  group_by(parameters) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  arrange(desc(mean)) %>%
  mutate(parameters = factor(parameters, levels = unique(parameters)))
# Point = mean, error bar = mean +/- one standard deviation.
cnn1d_256_sd_plot <- ggplot(results_cnn1d_256_tune_ordered, aes(x = parameters, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, color = "yellow") +
  geom_point(color = "red") +
  ylab("Mean Balanced Accuracy") +
  ylim(0.80, 0.95) +
  labs(
    title = "Parameters Tuning: CNN1D Catania (2018)",
    subtitle = "Balanced Accuracy: Mean and Standard Deviation\nParameteres: <nb_filter>_<embeding_size>_<kernel_size>_<hidden_size>",
    caption = "batch_size=256"
  ) +
  ggdark::dark_theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none",
    axis.text = element_text(size = 8)
  )
cnn1d_256_sd_plot

ATTENTION

Batch 1024

# Locate the attention-model tuning result files; the file name encodes
# lstm_size, embedingdim, dropout and, after "metrics-", the sample id.
files <- list.files(path = "../results/juanma",pattern="results_test_awc-lstm_size%5C=\\d+-embedingdim%5C=\\d+-dropout%5C=[0-9.]+-metrics-\\d+.csv")
# Load each run and attach the hyper-parameters parsed from its file name.
results_att_tune <- lapply(files, function(x) {
  read_csv(paste0("../results/juanma/", x), col_types = cols()) %>%
    tibble::add_column(
      lstm_size = as.integer(str_replace(string = x, pattern = ".*-lstm_size%5C=([0-9]+)-.*.csv", "\\1")),
      embedingdim = as.integer(str_replace(string = x, pattern = ".*-embedingdim%5C=([0-9]+)-.*.csv", "\\1")),
      # Accept any number of decimals and match the ".csv" suffix literally;
      # the previous pattern only parsed dropouts with exactly one decimal digit.
      dropout = as.numeric(str_replace(string = x, pattern = ".*-dropout%5C=([0-9.]+)-metrics-\\d+\\.csv", "\\1")),
      sample = as.factor(str_replace(string = x, pattern = ".*metrics-(\\d+).csv", "\\1"))
    )
})
results_att_tune <- do.call(rbind, results_att_tune)
# Rows per sample, as a quick completeness check.
results_att_tune %>% group_by(sample) %>% summarise(n = n())
# Build the "<lstm_size>_<embedingdim>_<dropout>" label. Columns are listed
# explicitly: chaining `a:b:c` made R warn "numerical expression has 2
# elements: only the first used".
results_att_tune <- results_att_tune %>%
  tidyr::unite("parameters", lstm_size, embedingdim, dropout) %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity"))
numerical expression has 2 elements: only the first used
# Box plot of every metric per hyper-parameter combination.
ggplot(results_att_tune) +
  geom_boxplot(aes(y = value, x = parameters, fill = parameters), color = "gray") +
  facet_wrap(~metric) +
  labs(
    title = "Parameters Tuning: Attention Yang (2016)",
    subtitle = "Parameters: <lstm_size>_<embeding_size>_<dropout>"
  ) +
  xlab("Parameters") +
  ggdark::dark_theme_gray() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Mean and standard deviation of balanced accuracy per combination, best first;
# the factor levels follow that order so the plot keeps the ranking.
results_att_tune_ordered <- results_att_tune %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value, parameters) %>%
  group_by(parameters) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  arrange(desc(mean)) %>%
  mutate(parameters = factor(parameters, levels = unique(parameters)))
# Point = mean, error bar = mean +/- one standard deviation.
att_sd_plot <- ggplot(results_att_tune_ordered, aes(x = parameters, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, color = "yellow") +
  geom_point(color = "red") +
  ylab("Mean Balanced Accuracy") +
  ylim(0.80, 0.95) +
  labs(
    title = "Parameters Tuning: Attention Yang (2016) ",
    subtitle = "Balanced Accuracy: Mean and Standard Deviation\nParameteres: <lstm_size>_<embeding_size>_<dropout>",
    caption = "batch_size=1024"
  ) +
  ggdark::dark_theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

att_sd_plot

Batch 256

# Locate the attention-model tuning result files for batch_size=256; the file
# name encodes lstm_size, embedingdim, dropout and, after "metrics-", the
# sample id.
files <- list.files(path = "../results/juanma",pattern="results_test_batch_size%5C=256-awc-lstm_size%5C=\\d+-embedingdim%5C=\\d+-dropout%5C=[0-9.]+-metrics-\\d+.csv")
# Load each run and attach the hyper-parameters parsed from its file name.
results_att_256_tune <- lapply(files, function(x) {
  read_csv(paste0("../results/juanma/", x), col_types = cols()) %>%
    tibble::add_column(
      lstm_size = as.integer(str_replace(string = x, pattern = ".*-lstm_size%5C=([0-9]+)-.*.csv", "\\1")),
      embedingdim = as.integer(str_replace(string = x, pattern = ".*-embedingdim%5C=([0-9]+)-.*.csv", "\\1")),
      # Accept any number of decimals and match the ".csv" suffix literally;
      # the previous pattern only parsed dropouts with exactly one decimal digit.
      dropout = as.numeric(str_replace(string = x, pattern = ".*-dropout%5C=([0-9.]+)-metrics-\\d+\\.csv", "\\1")),
      sample = as.factor(str_replace(string = x, pattern = ".*metrics-(\\d+).csv", "\\1"))
    )
})
results_att_256_tune <- do.call(rbind, results_att_256_tune)
# Rows per sample, as a quick completeness check.
results_att_256_tune %>% group_by(sample) %>% summarise(n = n())
# Build the "<lstm_size>_<embedingdim>_<dropout>" label. Columns are listed
# explicitly: chaining `a:b:c` made R warn "numerical expression has 2
# elements: only the first used".
results_att_256_tune <- results_att_256_tune %>%
  tidyr::unite("parameters", lstm_size, embedingdim, dropout) %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity"))
numerical expression has 2 elements: only the first used
# Box plot of every metric per hyper-parameter combination.
ggplot(results_att_256_tune) +
  geom_boxplot(aes(y = value, x = parameters, fill = parameters), color = "gray") +
  facet_wrap(~metric) +
  labs(
    title = "Parameters Tuning: Attention Yang (2016)",
    subtitle = "Parameters: <lstm_size>_<embeding_size>_<dropout>"
  ) +
  xlab("Parameters") +
  ggdark::dark_theme_gray() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
Inverted geom defaults of fill and color/colour.
To change them back, use invert_geom_defaults().

# Mean and standard deviation of balanced accuracy per combination, best first;
# the factor levels follow that order so the plot keeps the ranking.
results_att_256_tune_ordered <- results_att_256_tune %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value, parameters) %>%
  group_by(parameters) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  arrange(desc(mean))
results_att_256_tune_ordered$parameters <- factor(
  results_att_256_tune_ordered$parameters,
  levels = unique(results_att_256_tune_ordered$parameters)
)
# Point = mean, error bar = mean +/- one standard deviation.
# The data argument previously named `results_att_256tune_ordered`, an object
# that is never created (missing underscore), so the plot could not be built.
att_256_sd_plot <- ggplot(aes(x = parameters, y = mean), data = results_att_256_tune_ordered) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, color = "yellow") +
  geom_point(color = "red") +
  ylab("Mean Balanced Accuracy") +
  ggdark::dark_theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none") +
  ylim(0.80, 0.95) +
  labs(
    title = "Parameters Tuning: Attention Yang (2016) ",
    subtitle = "Balanced Accuracy: Mean and Standard Deviation\nParameteres: <lstm_size>_<embeding_size>_<dropout>",
    caption = "batch_size=256"
  )

att_256_sd_plot

Comparison

MCCV

# Arrange the six mean/sd plots in a single row, shrinking each title and
# subtitle so they fit in the narrower panels.
gridplot <- gridExtra::grid.arrange(
  grobs = lapply(
    list(
      lstm_sd_plot,
      lstm_256_sd_plot,
      att_sd_plot,
      att_256_sd_plot,
      cnn1d_sd_plot,
      cnn1d_256_sd_plot
    ),
    function(panel) {
      panel +
        theme(
          plot.title = element_text(size = 8),
          plot.subtitle = element_text(size = 6)
        )
    }
  ),
  ncol = 6
)

plot(gridplot)

Results Table

# Label every tuning table with its model/batch-size identifier.
# NOTE(review): the batch-1024 LSTM table is labelled "lstm" while the other
# batch-1024 tables carry a "_1024" suffix -- confirm this is intended.
results_att_256_tune[["model"]] <- "att_256"
results_att_tune[["model"]] <- "att_1024"
results_cnn1d_256_tune[["model"]] <- "cnn1d_256"
results_cnn1d_tune[["model"]] <- "cnn1d_1024"
results_lstm_256_tune[["model"]] <- "lstm_256"
results_lstm_tune[["model"]] <- "lstm"
# Stack all tables and export them as one CSV in the user's home directory.
readr::write_csv(
  rbind(
    results_att_256_tune,
    results_att_tune,
    results_cnn1d_256_tune,
    results_cnn1d_tune,
    results_lstm_256_tune,
    results_lstm_tune
  ),
  "~/sequence_classification_metrics.csv"
)
  

FINAL models results for CTU19A and CTU19B

Selected Models:

LSTM: batch 256 64_64_0.1 batch 1024 64_32_0.1

ATTENTION: Batch 256 32_32_0.1 Batch 1024 128_128_0.1

CNN: Batch 256 128_128_4_256 Batch 1024 256_128_4_128

Results table (kable)

# Pick the rows of the selected hyper-parameter combination from each tuning
# table and label them with a short model id.
att1_selected <- results_att_tune %>% filter(parameters == "128_128_0.1")
att1_selected$model <- "att1"
att2_selected <- results_att_256_tune %>% filter(parameters == "32_32_0.1")
att2_selected$model <- "att2"
lstm1_selected <- results_lstm_tune %>% filter(parameters == "64_32_0.1")
lstm1_selected$model <- "lstm1"
lstm2_selected <- results_lstm_256_tune %>% filter(parameters == "64_64_0.1")
lstm2_selected$model <- "lstm2"
cnn1d1_selected <- results_cnn1d_tune %>% filter(parameters == "256_128_4_128")
cnn1d1_selected$model <- "cnn1d1"
cnn1d2_selected <- results_cnn1d_256_tune %>% filter(parameters == "128_128_4_256")
cnn1d2_selected$model <- "cnn1d2"
# Per model and metric: number of runs, mean, standard deviation, standard
# error and the half-width of the 95% t-based confidence interval.
results_final_models <- rbind(
  att1_selected,
  att2_selected,
  lstm1_selected,
  lstm2_selected,
  cnn1d1_selected,
  cnn1d2_selected
) %>%
  group_by(model, metric) %>%
  summarise(
    n = n(),
    mean = mean(value),
    sd = sd(value),
    se = sd / sqrt(n),
    # FALSE instead of the reassignable shorthand `F`.
    ci = qt(p = 0.025, df = n - 1, lower.tail = FALSE) * se,
    # Same grouping as the default (still grouped by model), stated
    # explicitly so summarise() no longer emits its informational message.
    .groups = "drop_last"
  )
`summarise()` has grouped output by 'model'. You can override using the `.groups` argument.
# Show the per-model summary table.
results_final_models
# 95% normal-approximation confidence interval for the mean balanced accuracy
# of the dataset-level augmentation runs.
# The sample size must come from nrow(): length() of a data frame/tibble is
# its number of columns (1 here), which would inflate the interval width.
n <- nrow(balanced_accuracy_augment20x2_dataset)
mu <- mean(balanced_accuracy_augment20x2_dataset %>% unlist())
s <- sd(balanced_accuracy_augment20x2_dataset %>% unlist())
error <- qnorm(0.975) * s / sqrt(n)
# Upper bound first, then lower bound.
c(mu + error, mu - error)
[1] 0.9541808 0.8885488

Selected models CI (confidence intervals) vs final results on CTU19B

#results_att_256_tune_ordered<-results_att_256_tune %>% filter(metric=="Balanced Accuracy") %>% select(value,parameters) %>% group_by(parameters) %>%  summarise(mean=mean(value),sd=sd(value))  %>% arrange(desc(mean))
#results_att_256_tune_ordered$parameters<-factor(results_att_256_tune_ordered$parameters,levels=unique(results_att_256_tune_ordered$parameters))
# Per-metric comparison of the selected models: red point = mean over the
# tuning runs, yellow bar = mean +/- ci, green point = the `value` column of
# tbl_results_final_models (described in the subtitle as the CTU19B result).
# NOTE(review): tbl_results_final_models is not created anywhere in the
# visible part of this file -- confirm it is defined before this chunk runs
# and that it has `model`, `metric` and `value` columns.
final_models_plot <- ggplot(aes(x = model, y = mean), data = results_final_models) +
  facet_wrap(~metric) +
  geom_errorbar(aes(ymin = mean - ci, ymax = mean + ci), width = .2, color = "yellow") +
  geom_point(color = "red") +
  geom_point(aes(y = value), data = tbl_results_final_models, color = "green") +
  # Models are on the x axis; the y axis shows the metric value. The y axis
  # was previously labelled "Model".
  xlab("Model") +
  ylab("Metric value") +
  ggdark::dark_theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none") +
  # ylim(0.80,0.95)+
  labs(
    title = "SELECTED MODELS: Variation vs. Final Results on CTU19B",
    subtitle = "Metrics: Mean (in red) and CI (in yellow). Green: result on CTU19B",
    caption = ""
  )
final_models_plot

