library(readr)
library(dplyr)
library(broom)
library(stringr)
library(ggplot2)
We have tried several approaches for dealing with imbalanced classes. We have used the Woodbridge (2016) LSTM network for evaluating the different imbalance-handling approaches.
The following sections present the results of the different imbalance techniques on samples of 10% of the CTU19A dataset.

### Downsampling
# ---- Imbalance strategies on the 10% sample: load per-run results ----

# Read every per-run result CSV named in `files` and stack them in one tibble.
#
# files: character vector of CSV file names, as returned by list.files().
# dir:   directory that contains those files.
#
# Returns a tibble holding the rows of all files plus an integer `maxlen`
# column parsed from each file name ("...=<maxlen>-<run>.csv").
# Stops with an explicit error when `files` is empty; do.call(rbind, list())
# would silently return NULL and fail later with an unrelated message.
read_imbalance_runs <- function(files, dir = "../results") {
  if (length(files) == 0) {
    stop("No result files found in '", dir, "'", call. = FALSE)
  }
  runs <- lapply(files, function(file_name) {
    read_csv(file.path(dir, file_name), col_types = cols()) %>%
      tibble::add_column(
        maxlen = as.integer(
          # The dot before "csv" is escaped so only a real extension matches.
          str_replace(file_name, ".*=([0-9]+)-[0-9]+\\.csv$", "\\1")
        )
      )
  })
  dplyr::bind_rows(runs)
}

# Downsampling runs.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-downsampling-epochs=60-endgame-maxlen=200-\\d+"
)
results_lstm_downsample <- read_imbalance_runs(files)

# Upsampling runs.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-upsampling-epochs=60-endgame-maxlen=200-\\d+"
)
results_lstm_upsample <- read_imbalance_runs(files)

# Augmentation runs ("augmenting" file tag).
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-augmenting-epochs=60-endgame-maxlen=200-\\d+"
)
results_lstm_augment <- read_imbalance_runs(files)

# Augmentation runs ("augmenting-botnet" file tag).
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-augmenting-botnet-epochs=60-endgame-maxlen=200-\\d+"
)
results_lstm_augment4x <- read_imbalance_runs(files)

# Number of runs found for the last listing.
files %>% length()
[1] 30
# Stack the four 10%-sample strategies, tagging each with its strategy label.
results <- rbind(
  tibble::add_column(results_lstm_upsample, imbalance = "upsample"),
  tibble::add_column(results_lstm_downsample, imbalance = "downsample"),
  tibble::add_column(results_lstm_augment, imbalance = "aug-norm"),
  tibble::add_column(results_lstm_augment4x, imbalance = "aug-norm-bot")
)

# Box plot of the selected metrics per strategy, one facet per metric.
ggplot(
  data = filter(
    results,
    metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity")
  ),
  mapping = aes(x = as.factor(imbalance), y = value, fill = as.factor(imbalance))
) +
  geom_boxplot(color = 'gray') +
  facet_wrap(~metric, scales = "fixed") +
  labs(
    title = "Imbalance Strategies [30 executions]",
    subtitle = "LSTM arch. according to Woodbridge et al. (2016):\nmaxlen 200, epochs 60",
    caption = "Original sample contains: ~1800 botnets and ~180 normal data points.\n
aug-norm and aug-norm-bot use a sliding window over sequence for generate new samples. "
  ) +
  xlab("Strategy") +
  theme_bw() +
  ggdark::dark_theme_gray() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
The following sections present the results of the different imbalance techniques on samples of 20% of the CTU19A dataset. The 20% sample size was chosen based on a careful analysis of the differences between the population and sample distributions.
We apply the shift sequence data-augmentation approach for generating 2x botnet and then generate the same amount of normal data.
# Per-run result CSVs for the "augment-botnetx2" runs on the 20% sample.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-sample20-augment-botnetx2-epochs=60-endgame-maxlen=200-\\d+"
)

# Read each run, tag it with the maxlen parsed from its file name, and stack
# all runs into one table.
results_lstm_augment_sample20x2 <- do.call(
  rbind,
  lapply(files, function(run_file) {
    run_metrics <- read_csv(paste0("../results/", run_file), col_types = cols())
    run_maxlen <- as.integer(
      str_replace(run_file, ".*=([0-9]+)-[0-9]+.csv", "\\1")
    )
    tibble::add_column(run_metrics, maxlen = run_maxlen)
  })
)

# Number of runs found.
length(files)
[1] 30
We apply the shift sequence data-augmentation approach for generating 1x botnet and then generate the same amount of normal data.
# Per-run result CSVs for the "augment-botnet" runs on the 20% sample.
# NOTE(review): the result variable is named ...x3 while the file tag carries
# no multiplier -- confirm which augmentation factor these runs used.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-sample20-augment-botnet-epochs=60-endgame-maxlen=200-\\d+"
)

# Read each run, add the maxlen parsed from its file name, then stack the runs.
results_lstm_augment_sample20x3 <- do.call(
  rbind,
  lapply(files, function(run_file) {
    run_metrics <- read_csv(paste0("../results/", run_file), col_types = cols())
    run_maxlen <- as.integer(
      str_replace(run_file, ".*=([0-9]+)-[0-9]+.csv", "\\1")
    )
    tibble::add_column(run_metrics, maxlen = run_maxlen)
  })
)

# Number of runs found.
length(files)
[1] 30
We apply the shift sequence data-augmentation approach for generating 5x botnet and then generate the same amount of normal data.
# Per-run result CSVs for the "augment-botnetx5" runs on the 20% sample.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-sample20-augment-botnetx5-epochs=60-endgame-maxlen=200-\\d+"
)

# Read each run, add the maxlen parsed from its file name, then stack the runs.
results_lstm_augment_sample20x5 <- do.call(
  rbind,
  lapply(files, function(run_file) {
    run_metrics <- read_csv(paste0("../results/", run_file), col_types = cols())
    run_maxlen <- as.integer(
      str_replace(run_file, ".*=([0-9]+)-[0-9]+.csv", "\\1")
    )
    tibble::add_column(run_metrics, maxlen = run_maxlen)
  })
)

# Number of runs found.
length(files)
[1] 30
# Per-run result CSVs for the downsampling runs on the 20% sample.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-sample20-downsample-botnet-epochs=60-endgame-maxlen=200-\\d+"
)

# Read each run, add the maxlen parsed from its file name, then stack the runs.
results_lstm_downsample_sample20 <- do.call(
  rbind,
  lapply(files, function(run_file) {
    run_metrics <- read_csv(paste0("../results/", run_file), col_types = cols())
    run_maxlen <- as.integer(
      str_replace(run_file, ".*=([0-9]+)-[0-9]+.csv", "\\1")
    )
    tibble::add_column(run_metrics, maxlen = run_maxlen)
  })
)

# Number of runs found.
length(files)
[1] 30
# Per-run result CSVs for the upsampling runs on the 20% sample.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-sample20-upsample-botnet-epochs=60-endgame-maxlen=200-\\d+"
)

# Read each run, add the maxlen parsed from its file name, then stack the runs.
results_lstm_upsample_sample20 <- do.call(
  rbind,
  lapply(files, function(run_file) {
    run_metrics <- read_csv(paste0("../results/", run_file), col_types = cols())
    run_maxlen <- as.integer(
      str_replace(run_file, ".*=([0-9]+)-[0-9]+.csv", "\\1")
    )
    tibble::add_column(run_metrics, maxlen = run_maxlen)
  })
)

# Number of runs found.
length(files)
[1] 30
All previous data-augmentation approaches were applied to the tokenized (Keras) version of the sequences. This approach is applied directly to the dataset. THIS WAS THE APPROACH USED FOR GENERATING THE AUGMENTED train/test sets for MCCV and the final CTU19A-augmented dataset.
# Per-run result CSVs for the "augment-dataset-2x" runs on the 20% sample.
files <- list.files(
  path = "../results/",
  pattern = "results_test_imbalance-sample20-augment-dataset-2x-epochs=60-endgame-maxlen=200-\\d+"
)

# Read each run, add the maxlen parsed from its file name, then stack the runs.
results_lstm_augment_sample20x2_dataset <- do.call(
  rbind,
  lapply(files, function(run_file) {
    run_metrics <- read_csv(paste0("../results/", run_file), col_types = cols())
    run_maxlen <- as.integer(
      str_replace(run_file, ".*=([0-9]+)-[0-9]+.csv", "\\1")
    )
    tibble::add_column(run_metrics, maxlen = run_maxlen)
  })
)

# Number of runs found.
length(files)
[1] 30
# Stack the 20%-sample strategies that are compared in the plot below.
results <- rbind(
  tibble::add_column(results_lstm_upsample_sample20, imbalance = "upsample-20"),
  tibble::add_column(results_lstm_downsample_sample20, imbalance = "downsample-20"),
  # Disabled comparisons, kept for reference:
  #results_lstm_augment_sample20x2 %>% tibble::add_column(imbalance="aug-norm-bot-20x2"),
  #results_lstm_augment_sample20x3 %>% tibble::add_column(imbalance="aug-norm-bot-20x3"),
  #results_lstm_augment_sample20x5 %>% tibble::add_column(imbalance="aug-norm-bot-20x5"),
  tibble::add_column(results_lstm_augment_sample20x2_dataset, imbalance = "aug-norm-bot-20x2-dat")
  #results_lstm_upsample %>% tibble::add_column(imbalance="upsample-10"),
  #results_lstm_downsample %>% tibble::add_column(imbalance="downsample-10"),
  #results_lstm_augment %>% tibble::add_column(imbalance="aug-norm-bot-10")
)

# Box plot of the selected metrics per strategy, one facet per metric.
ggplot(
  data = filter(
    results,
    metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity")
  ),
  mapping = aes(x = as.factor(imbalance), y = value, fill = as.factor(imbalance))
) +
  geom_boxplot(color = 'gray') +
  facet_wrap(~metric, scales = "fixed") +
  labs(
    title = "Imbalance Strategies [30 executions]",
    subtitle = "LSTM arch. according to Woodbridge et al. (2016):\nmaxlen 200, epochs 60",
    caption = "Original sample contains: ~3600 botnets and ~400 normal data points [20% of pop.]\n
aug-norm-bot use a sliding window over a random sequence for generate new samples.\n
x2 refers to different number of augmented samples.\n
No significative differences observed between upsample and aug-norm-botx2
"
  ) +
  xlab("Strategy") +
  theme_bw() +
  ggdark::dark_theme_gray() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
# Keep only the Balanced Accuracy rows of a result table, as a one-column
# tibble (`value`).
balanced_accuracy_of <- function(runs) {
  select(filter(runs, metric == "Balanced Accuracy"), value)
}

balanced_accuracy_augment20x3 <- balanced_accuracy_of(results_lstm_augment_sample20x3)
balanced_accuracy_augment20x5 <- balanced_accuracy_of(results_lstm_augment_sample20x5)
balanced_accuracy_augment20x2 <- balanced_accuracy_of(results_lstm_augment_sample20x2)
balanced_accuracy_upsample20 <- balanced_accuracy_of(results_lstm_upsample_sample20)
balanced_accuracy_downsample20 <- balanced_accuracy_of(results_lstm_downsample_sample20)
balanced_accuracy_augment20x2_dataset <- balanced_accuracy_of(results_lstm_augment_sample20x2_dataset)

# Distribution of balanced accuracy across the dataset-level augmentation runs.
ggplot(balanced_accuracy_augment20x2_dataset, aes(x = value)) +
  geom_histogram(fill = 'skyblue', color = 'black') +
  ggdark::dark_theme_bw()
# 95% normal-approximation confidence interval for the mean balanced accuracy
# of the dataset-level augmentation runs.
#
# The sample size must be the number of runs (rows). length() on a data frame
# returns its number of columns, which is 1 here, so the standard error was
# never divided by sqrt(number of runs).
# NOTE: any interval recorded with n = 1 is sqrt(nrow) times too wide.
n <- nrow(balanced_accuracy_augment20x2_dataset)
mu <- mean(balanced_accuracy_augment20x2_dataset %>% unlist())
s <- sd(balanced_accuracy_augment20x2_dataset %>% unlist())
error <- qnorm(0.975) * s / sqrt(n)
c(mu + error, mu - error)
[1] 0.9541808 0.8885488
# Two-sample Wilcoxon rank sum test comparing the balanced accuracy of the
# augment20x3 runs against the augment20x2 runs (default two-sided
# alternative). The argument expressions are kept verbatim because the printed
# "data:" line is built from them.
wilcox.test(balanced_accuracy_augment20x3 %>% unlist(),
balanced_accuracy_augment20x2 %>% unlist()
)
Wilcoxon rank sum test
data: balanced_accuracy_augment20x3 %>% unlist() and balanced_accuracy_augment20x2 %>% unlist()
W = 415, p-value = 0.6125
alternative hypothesis: true location shift is not equal to 0
# Long-format table (columns `variable`, `value`) with the balanced accuracy
# of each strategy; `variable` holds the strategy name taken from the column
# names below. The first column was misspelled "downsmaple", which leaked into
# the group labels.
augment_results <- data.frame(
  downsample = balanced_accuracy_downsample20 %>% unlist(),
  augment20x2_dataset = balanced_accuracy_augment20x2_dataset %>% unlist(),
  upsample = balanced_accuracy_upsample20 %>% unlist()
) %>% reshape2::melt()
No id variables; using all as measure variables
# One-way test of equal mean balanced accuracy across strategies
# (oneway.test defaults), returned as a one-row tidy table.
broom::tidy(oneway.test(value ~ variable, data = augment_results))
Multiple parameters; naming those columns num.df, den.df
# ---- LSTM hyper-parameter tuning runs (file names without a batch tag) ----
files <- list.files(
  path = "../results/",
  pattern = "results_test_lstm-endgame-augmented-ctu19-mccv-epochs=15-endgame-maxlen=1000-\\d+-lstm_size=\\d+-embedingdim=\\d+-dropout=[0-9.]+.csv"
)

# Read every run and attach the hyper-parameters and sample id parsed from the
# file name.
results_lstm_tune <- lapply(files, function(x) {
  read_csv(paste0("../results/", x), col_types = cols()) %>%
    tibble::add_column(
      lstm_size = as.integer(str_replace(x, ".*-lstm_size=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      embedingdim = as.integer(str_replace(x, ".*-embedingdim=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      # Accept any number of decimals (e.g. 0.25) and match the dots literally.
      # The previous pattern only matched exactly one decimal digit and would
      # otherwise yield NA.
      dropout = as.numeric(
        str_replace(x, ".*-dropout=([0-9]+(?:\\.[0-9]+)?)\\.csv$", "\\1")
      )
    ) %>%
    tibble::add_column(
      sample = as.factor(str_replace(x, ".*=1000-(\\d+)-.*.csv", "\\1"))
    )
})
results_lstm_tune <- do.call(rbind, results_lstm_tune)

# Rows per MCCV sample (sanity check).
results_lstm_tune %>% group_by(sample) %>% summarise(n = n())

# Collapse the three hyper-parameters into one "<lstm_size>_<embedingdim>_<dropout>"
# label and keep the metrics of interest. The columns are listed explicitly:
# chaining `a:b:c` only worked by accident and raised
# "numerical expression has 2 elements: only the first used".
results_lstm_tune <- results_lstm_tune %>%
  tidyr::unite("parameters", lstm_size, embedingdim, dropout) %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity"))
numerical expression has 2 elements: only the first used
# Box plots of every metric per parameter combination.
ggplot(results_lstm_tune, aes(x = parameters, y = value, fill = parameters)) +
  geom_boxplot(color = 'gray') +
  facet_wrap(~metric) +
  labs(
    title = "Parameters Tuning: LSTM Woodbridge (2016)",
    subtitle = "Parameteres: <lstm_size>_<embeding_size>_<dropout>"
  ) +
  xlab("Parameters") +
  ggdark::dark_theme_gray() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Mean and standard deviation of balanced accuracy per parameter combination,
# best first; the factor levels follow that order so the x axis keeps it.
results_lstm_tune_ordered <- results_lstm_tune %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value, parameters) %>%
  group_by(parameters) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  arrange(desc(mean)) %>%
  mutate(parameters = factor(parameters, levels = unique(parameters)))

# Mean (red point) with +/- one standard deviation (yellow bar).
lstm_sd_plot <- ggplot(results_lstm_tune_ordered, aes(x = parameters, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, color = 'yellow') +
  geom_point(color = 'red') +
  ylab("Mean Balanced Accuracy") +
  ggdark::dark_theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  ) +
  ylim(0.80, 0.95) +
  labs(
    title = "Parameters Tuning: LSTM Woodbridge (2016)",
    subtitle = "Balanced Accuracy: Mean and Standard Deviation\nParameteres: <lstm_size>_<embeding_size>_<dropout>",
    caption = "batch_size=1024"
  )
lstm_sd_plot
# ---- LSTM hyper-parameter tuning runs, "batch=256" file tag ----
files <- list.files(
  path = "../results/",
  pattern = "results_test_lstm-endgame-augmented-ctu19-mccv-epochs=15-endgame-batch=256-maxlen=1000-\\d+-lstm_size=\\d+-embedingdim=\\d+-dropout=[0-9.]+.csv"
)

# Read every run and attach the hyper-parameters and sample id parsed from the
# file name.
results_lstm_256_tune <- lapply(files, function(x) {
  read_csv(paste0("../results/", x), col_types = cols()) %>%
    tibble::add_column(
      lstm_size = as.integer(str_replace(x, ".*-lstm_size=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      embedingdim = as.integer(str_replace(x, ".*-embedingdim=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      # Accept any number of decimals (e.g. 0.25) and match the dots literally.
      # The previous pattern only matched exactly one decimal digit.
      dropout = as.numeric(
        str_replace(x, ".*-dropout=([0-9]+(?:\\.[0-9]+)?)\\.csv$", "\\1")
      )
    ) %>%
    tibble::add_column(
      sample = as.factor(str_replace(x, ".*=1000-(\\d+)-.*.csv", "\\1"))
    )
})
results_lstm_256_tune <- do.call(rbind, results_lstm_256_tune)

# Rows per MCCV sample (sanity check).
results_lstm_256_tune %>% group_by(sample) %>% summarise(n = n())

# Build the "<lstm_size>_<embedingdim>_<dropout>" label from explicitly listed
# columns; `a:b:c` raised "numerical expression has 2 elements".
results_lstm_256_tune <- results_lstm_256_tune %>%
  tidyr::unite("parameters", lstm_size, embedingdim, dropout) %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity"))
numerical expression has 2 elements: only the first used
# Mean and standard deviation of balanced accuracy per parameter combination,
# best first; the factor levels follow that order so the x axis keeps it.
results_lstm_256_tune_ordered <- results_lstm_256_tune %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value, parameters) %>%
  group_by(parameters) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  arrange(desc(mean)) %>%
  mutate(parameters = factor(parameters, levels = unique(parameters)))

# Mean (red point) with +/- one standard deviation (yellow bar).
lstm_256_sd_plot <- ggplot(results_lstm_256_tune_ordered, aes(x = parameters, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, color = 'yellow') +
  geom_point(color = 'red') +
  ylab("Mean Balanced Accuracy") +
  ggdark::dark_theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  ) +
  ylim(0.80, 0.95) +
  labs(
    title = "Parameters Tuning: LSTM Woodbridge (2016)",
    subtitle = "Balanced Accuracy: Mean and Standard Deviation\nParameteres: <lstm_size>_<embeding_size>_<dropout>",
    caption = "batch_size=256"
  )
lstm_256_sd_plot
# ---- CNN1D hyper-parameter tuning runs, "batch=1024" file tag ----
files <- list.files(
  path = "../results/",
  pattern = "results_test_cnn1d-cacic-augmented-ctu19-mccv-epochs=15-endgame-batch=1024-maxlen=1000-\\d+-nb_filter=\\d+-kernel_size=\\d+-embedingdim=\\d+-hidden_size=\\d+.csv"
)

# Read every run and attach the hyper-parameters and sample id parsed from the
# file name.
results_cnn1d_tune <- lapply(files, function(x) {
  read_csv(paste0("../results/", x), col_types = cols()) %>%
    tibble::add_column(
      nb_filter = as.integer(str_replace(x, ".*-nb_filter=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      embedingdim = as.integer(str_replace(x, ".*-embedingdim=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      kernel_size = as.numeric(str_replace(x, ".*-kernel_size=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      hidden_size = as.numeric(str_replace(x, ".*-hidden_size=([0-9]+).csv", "\\1"))
    ) %>%
    tibble::add_column(
      sample = as.factor(str_replace(x, ".*=1000-(\\d+)-.*.csv", "\\1"))
    )
})
results_cnn1d_tune <- do.call(rbind, results_cnn1d_tune)

# Rows per MCCV sample (sanity check).
results_cnn1d_tune %>% group_by(sample) %>% summarise(n = n())

# Build the "<nb_filter>_<embedingdim>_<kernel_size>_<hidden_size>" label from
# explicitly listed columns; chaining `a:b:c:d` only worked by accident and
# raised "numerical expression has N elements: only the first used".
results_cnn1d_tune <- results_cnn1d_tune %>%
  tidyr::unite("parameters", nb_filter, embedingdim, kernel_size, hidden_size) %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity"))
numerical expression has 2 elements: only the first usednumerical expression has 3 elements: only the first used
# Rows per MCCV sample after the metric filter.
summarise(group_by(results_cnn1d_tune, sample), n = n())

# Box plots of every metric per parameter combination.
ggplot(results_cnn1d_tune, aes(x = parameters, y = value, fill = parameters)) +
  geom_boxplot(color = 'gray') +
  facet_wrap(~metric) +
  labs(
    title = "Parameters Tuning: CNN1D Catania (2018)",
    subtitle = "Parameteres: <nb_filter>_<embeding_size>_<kernel_size>_<hidden_size>",
    caption = "batch_size=1024"
  ) +
  xlab("Parameters") +
  ggdark::dark_theme_gray() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Mean and standard deviation of balanced accuracy per parameter combination,
# best first; the factor levels follow that order so the x axis keeps it.
results_cnn1d_tune_ordered <- results_cnn1d_tune %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value, parameters) %>%
  group_by(parameters) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  arrange(desc(mean)) %>%
  mutate(parameters = factor(parameters, levels = unique(parameters)))

# Mean (red point) with +/- one standard deviation (yellow bar).
cnn1d_sd_plot <- ggplot(results_cnn1d_tune_ordered, aes(x = parameters, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, color = 'yellow') +
  geom_point(color = 'red') +
  ylab("Mean Balanced Accuracy") +
  ggdark::dark_theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none",
    axis.text = element_text(size = 8)
  ) +
  ylim(0.80, 0.95) +
  labs(
    title = "Parameters Tuning: CNN1D Catania (2018)",
    subtitle = "Balanced Accuracy: Mean and Standard Deviation\nParameteres: <nb_filter>_<embeding_size>_<kernel_size>_<hidden_size>",
    caption = "batch_size=1024"
  )
cnn1d_sd_plot

# LSTM and CNN1D summaries side by side.
gridExtra::grid.arrange(lstm_sd_plot, cnn1d_sd_plot, ncol = 2)
# ---- CNN1D hyper-parameter tuning runs (file names without a batch tag) ----
# NOTE(review): these results are stored as "_256", yet the file pattern has
# no batch tag, whereas for the LSTM runs the untagged files are the ones
# captioned batch_size=1024 -- confirm which batch size these files belong to.
files <- list.files(
  path = "../results/",
  pattern = "results_test_cnn1d-cacic-augmented-ctu19-mccv-epochs=15-endgame-maxlen=1000-\\d+-nb_filter=\\d+-kernel_size=\\d+-embedingdim=\\d+-hidden_size=\\d+.csv"
)

# Read every run and attach the hyper-parameters and sample id parsed from the
# file name.
results_cnn1d_256_tune <- lapply(files, function(x) {
  read_csv(paste0("../results/", x), col_types = cols()) %>%
    tibble::add_column(
      nb_filter = as.integer(str_replace(x, ".*-nb_filter=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      embedingdim = as.integer(str_replace(x, ".*-embedingdim=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      kernel_size = as.numeric(str_replace(x, ".*-kernel_size=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      hidden_size = as.numeric(str_replace(x, ".*-hidden_size=([0-9]+).csv", "\\1"))
    ) %>%
    tibble::add_column(
      sample = as.factor(str_replace(x, ".*=1000-(\\d+)-.*.csv", "\\1"))
    )
})
results_cnn1d_256_tune <- do.call(rbind, results_cnn1d_256_tune)

# Rows per MCCV sample (sanity check).
results_cnn1d_256_tune %>% group_by(sample) %>% summarise(n = n())

# Build the "<nb_filter>_<embedingdim>_<kernel_size>_<hidden_size>" label from
# explicitly listed columns; chaining `a:b:c:d` only worked by accident and
# raised "numerical expression has N elements: only the first used".
results_cnn1d_256_tune <- results_cnn1d_256_tune %>%
  tidyr::unite("parameters", nb_filter, embedingdim, kernel_size, hidden_size) %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity"))
numerical expression has 2 elements: only the first usednumerical expression has 3 elements: only the first used
# Rows per MCCV sample after the metric filter.
summarise(group_by(results_cnn1d_256_tune, sample), n = n())

# Mean and standard deviation of balanced accuracy per parameter combination,
# best first; the factor levels follow that order so the x axis keeps it.
results_cnn1d_256_tune_ordered <- results_cnn1d_256_tune %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value, parameters) %>%
  group_by(parameters) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  arrange(desc(mean)) %>%
  mutate(parameters = factor(parameters, levels = unique(parameters)))

# Mean (red point) with +/- one standard deviation (yellow bar).
cnn1d_256_sd_plot <- ggplot(results_cnn1d_256_tune_ordered, aes(x = parameters, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, color = 'yellow') +
  geom_point(color = 'red') +
  ylab("Mean Balanced Accuracy") +
  ggdark::dark_theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none",
    axis.text = element_text(size = 8)
  ) +
  ylim(0.80, 0.95) +
  labs(
    title = "Parameters Tuning: CNN1D Catania (2018)",
    subtitle = "Balanced Accuracy: Mean and Standard Deviation\nParameteres: <nb_filter>_<embeding_size>_<kernel_size>_<hidden_size>",
    caption = "batch_size=256"
  )
cnn1d_256_sd_plot
# ---- Attention model hyper-parameter tuning runs (no batch tag) ----
# The file names contain a literal "%5C" before each "=".
files <- list.files(
  path = "../results/juanma",
  pattern = "results_test_awc-lstm_size%5C=\\d+-embedingdim%5C=\\d+-dropout%5C=[0-9.]+-metrics-\\d+.csv"
)

# Read every run and attach the hyper-parameters and sample id parsed from the
# file name.
results_att_tune <- lapply(files, function(x) {
  read_csv(paste0("../results/juanma/", x), col_types = cols()) %>%
    tibble::add_column(
      lstm_size = as.integer(str_replace(x, ".*-lstm_size%5C=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      embedingdim = as.integer(str_replace(x, ".*-embedingdim%5C=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      # Accept any number of decimals (e.g. 0.25) and match the dots literally.
      # The previous pattern only matched exactly one decimal digit.
      dropout = as.numeric(
        str_replace(x, ".*-dropout%5C=([0-9]+(?:\\.[0-9]+)?)-metrics-\\d+\\.csv$", "\\1")
      )
    ) %>%
    tibble::add_column(
      sample = as.factor(str_replace(x, ".*metrics-(\\d+).csv", "\\1"))
    )
})
results_att_tune <- do.call(rbind, results_att_tune)

# Rows per sample (sanity check).
results_att_tune %>% group_by(sample) %>% summarise(n = n())

# Build the "<lstm_size>_<embedingdim>_<dropout>" label from explicitly listed
# columns; `a:b:c` raised "numerical expression has 2 elements".
results_att_tune <- results_att_tune %>%
  tidyr::unite("parameters", lstm_size, embedingdim, dropout) %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity"))
numerical expression has 2 elements: only the first used
# Box plots of every metric per parameter combination.
ggplot(results_att_tune, aes(x = parameters, y = value, fill = parameters)) +
  geom_boxplot(color = 'gray') +
  facet_wrap(~metric) +
  labs(
    title = "Parameters Tuning: Attention Yang (2016)",
    subtitle = "Parameters: <lstm_size>_<embeding_size>_<dropout>"
  ) +
  xlab("Parameters") +
  ggdark::dark_theme_gray() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Mean and standard deviation of balanced accuracy per parameter combination,
# best first; the factor levels follow that order so the x axis keeps it.
results_att_tune_ordered <- results_att_tune %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value, parameters) %>%
  group_by(parameters) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  arrange(desc(mean)) %>%
  mutate(parameters = factor(parameters, levels = unique(parameters)))

# Mean (red point) with +/- one standard deviation (yellow bar).
att_sd_plot <- ggplot(results_att_tune_ordered, aes(x = parameters, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, color = 'yellow') +
  geom_point(color = 'red') +
  ylab("Mean Balanced Accuracy") +
  ggdark::dark_theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  ) +
  ylim(0.80, 0.95) +
  labs(
    title = "Parameters Tuning: Attention Yang (2016) ",
    subtitle = "Balanced Accuracy: Mean and Standard Deviation\nParameteres: <lstm_size>_<embeding_size>_<dropout>",
    caption = "batch_size=1024"
  )
att_sd_plot
# ---- Attention model hyper-parameter tuning runs, "batch_size%5C=256" tag ----
# The file names contain a literal "%5C" before each "=".
files <- list.files(
  path = "../results/juanma",
  pattern = "results_test_batch_size%5C=256-awc-lstm_size%5C=\\d+-embedingdim%5C=\\d+-dropout%5C=[0-9.]+-metrics-\\d+.csv"
)

# Read every run and attach the hyper-parameters and sample id parsed from the
# file name.
results_att_256_tune <- lapply(files, function(x) {
  read_csv(paste0("../results/juanma/", x), col_types = cols()) %>%
    tibble::add_column(
      lstm_size = as.integer(str_replace(x, ".*-lstm_size%5C=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      embedingdim = as.integer(str_replace(x, ".*-embedingdim%5C=([0-9]+)-.*.csv", "\\1"))
    ) %>%
    tibble::add_column(
      # Accept any number of decimals (e.g. 0.25) and match the dots literally.
      # The previous pattern only matched exactly one decimal digit.
      dropout = as.numeric(
        str_replace(x, ".*-dropout%5C=([0-9]+(?:\\.[0-9]+)?)-metrics-\\d+\\.csv$", "\\1")
      )
    ) %>%
    tibble::add_column(
      sample = as.factor(str_replace(x, ".*metrics-(\\d+).csv", "\\1"))
    )
})
results_att_256_tune <- do.call(rbind, results_att_256_tune)

# Rows per sample (sanity check).
results_att_256_tune %>% group_by(sample) %>% summarise(n = n())

# Build the "<lstm_size>_<embedingdim>_<dropout>" label from explicitly listed
# columns; `a:b:c` raised "numerical expression has 2 elements".
results_att_256_tune <- results_att_256_tune %>%
  tidyr::unite("parameters", lstm_size, embedingdim, dropout) %>%
  filter(metric %in% c("Balanced Accuracy", "F1", "Sensitivity", "Specificity"))
numerical expression has 2 elements: only the first used
# Box plots of every metric per parameter combination.
ggplot(results_att_256_tune, aes(x = parameters, y = value, fill = parameters)) +
  geom_boxplot(color = 'gray') +
  facet_wrap(~metric) +
  labs(
    title = "Parameters Tuning: Attention Yang (2016)",
    subtitle = "Parameters: <lstm_size>_<embeding_size>_<dropout>"
  ) +
  xlab("Parameters") +
  ggdark::dark_theme_gray() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
Inverted geom defaults of fill and color/colour.
To change them back, use invert_geom_defaults().
# Mean and standard deviation of balanced accuracy per parameter combination,
# best first; the factor levels follow that order so the x axis keeps it.
results_att_256_tune_ordered <- results_att_256_tune %>%
  filter(metric == "Balanced Accuracy") %>%
  select(value, parameters) %>%
  group_by(parameters) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  arrange(desc(mean))
results_att_256_tune_ordered$parameters <- factor(
  results_att_256_tune_ordered$parameters,
  levels = unique(results_att_256_tune_ordered$parameters)
)

# Mean (red point) with +/- one standard deviation (yellow bar).
# The data argument previously named `results_att_256tune_ordered` (missing
# underscore), an object that is never created.
att_256_sd_plot <- ggplot(aes(x = parameters, y = mean), data = results_att_256_tune_ordered) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, color = 'yellow') +
  geom_point(color = 'red') +
  ylab("Mean Balanced Accuracy") +
  ggdark::dark_theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none") +
  ylim(0.80, 0.95) +
  labs(
    title = "Parameters Tuning: Attention Yang (2016) ",
    subtitle = "Balanced Accuracy: Mean and Standard Deviation\nParameteres: <lstm_size>_<embeding_size>_<dropout>",
    caption = "batch_size=256"
  )
att_256_sd_plot
# Smaller title and subtitle so six panels fit side by side.
compact_titles <- theme(
  plot.title = element_text(size = 8),
  plot.subtitle = element_text(size = 6)
)

# All six mean/sd summaries in one row: LSTM, attention and CNN1D, each for
# both batch-size variants.
gridplot <- gridExtra::grid.arrange(
  lstm_sd_plot + compact_titles,
  lstm_256_sd_plot + compact_titles,
  att_sd_plot + compact_titles,
  att_256_sd_plot + compact_titles,
  cnn1d_sd_plot + compact_titles,
  cnn1d_256_sd_plot + compact_titles,
  ncol = 6
)
plot(gridplot)
# Tag every tuning table with a model label.
# NOTE(review): "lstm" is the only label without a batch suffix (compare
# "att_1024" / "cnn1d_1024") -- confirm downstream consumers before renaming.
results_att_256_tune[["model"]] <- "att_256"
results_att_tune[["model"]] <- "att_1024"
results_cnn1d_256_tune[["model"]] <- "cnn1d_256"
results_cnn1d_tune[["model"]] <- "cnn1d_1024"
results_lstm_256_tune[["model"]] <- "lstm_256"
results_lstm_tune[["model"]] <- "lstm"

# Export all tuning metrics as a single CSV in the user's home directory.
readr::write_csv(
  rbind(
    results_att_256_tune,
    results_att_tune,
    results_cnn1d_256_tune,
    results_cnn1d_tune,
    results_lstm_256_tune,
    results_lstm_tune
  ),
  "~/sequence_classification_metrics.csv"
)
Selected Models:
LSTM: batch 256 64_64_0.1; batch 1024 64_32_0.1
ATTENTION: batch 256 32_32_0.1; batch 1024 128_128_0.1
CNN: batch 256 128_128_4_256;
batch 1024 256_128_4_128
# Rows of each tuning table for the selected parameter combination, relabelled
# with a short model id.
att1_selected <- results_att_tune %>% filter(parameters == "128_128_0.1")
att1_selected$model <- "att1"
att2_selected <- results_att_256_tune %>% filter(parameters == "32_32_0.1")
att2_selected$model <- "att2"
lstm1_selected <- results_lstm_tune %>% filter(parameters == "64_32_0.1")
lstm1_selected$model <- "lstm1"
lstm2_selected <- results_lstm_256_tune %>% filter(parameters == "64_64_0.1")
lstm2_selected$model <- "lstm2"
cnn1d1_selected <- results_cnn1d_tune %>% filter(parameters == "256_128_4_128")
cnn1d1_selected$model <- "cnn1d1"
cnn1d2_selected <- results_cnn1d_256_tune %>% filter(parameters == "128_128_4_256")
cnn1d2_selected$model <- "cnn1d2"

# Per model and metric: number of runs, mean, standard deviation, standard
# error and the half-width of a 95% t confidence interval.
results_final_models <- rbind(
  att1_selected,
  att2_selected,
  lstm1_selected,
  lstm2_selected,
  cnn1d1_selected,
  cnn1d2_selected
) %>%
  group_by(model, metric) %>%
  summarise(
    n = n(),
    mean = mean(value),
    sd = sd(value),
    se = sd / sqrt(n),
    # FALSE spelled out: `F` is an ordinary variable that can be reassigned.
    ci = qt(p = 0.025, df = n - 1, lower.tail = FALSE) * se,
    # Same grouping as the previous implicit default (still grouped by model),
    # stated explicitly so summarise() no longer emits its grouping message.
    .groups = "drop_last"
  )
`summarise()` has grouped output by 'model'. You can override using the `.groups` argument.
# Print the per-model, per-metric summary table.
results_final_models
# 95% normal-approximation confidence interval for the mean balanced accuracy
# of the dataset-level augmentation runs.
#
# The sample size must be the number of runs (rows). length() on a data frame
# returns its number of columns, which is 1 here, so the standard error was
# never divided by sqrt(number of runs).
# NOTE: any interval recorded with n = 1 is sqrt(nrow) times too wide.
n <- nrow(balanced_accuracy_augment20x2_dataset)
mu <- mean(balanced_accuracy_augment20x2_dataset %>% unlist())
s <- sd(balanced_accuracy_augment20x2_dataset %>% unlist())
error <- qnorm(0.975) * s / sqrt(n)
c(mu + error, mu - error)
[1] 0.9541808 0.8885488
#results_att_256_tune_ordered<-results_att_256_tune %>% filter(metric=="Balanced Accuracy") %>% select(value,parameters) %>% group_by(parameters) %>% summarise(mean=mean(value),sd=sd(value)) %>% arrange(desc(mean))
#results_att_256_tune_ordered$parameters<-factor(results_att_256_tune_ordered$parameters,levels=unique(results_att_256_tune_ordered$parameters))
# Per-metric facets showing, for every selected model, the mean across runs
# (red) with its confidence-interval half-width (yellow), overlaid with the
# values in `tbl_results_final_models` (green).
# NOTE(review): `tbl_results_final_models` is not created in the visible part
# of this script; it presumably holds the CTU19B results with columns `model`,
# `metric` and `value` -- confirm it is defined before this point.
# NOTE(review): ylab("Model") labels the y axis, which shows metric values;
# the label may have been meant for the x axis.
final_models_plot <- ggplot(results_final_models, aes(x = model, y = mean)) +
  facet_wrap(~metric) +
  geom_errorbar(aes(ymin = mean - ci, ymax = mean + ci), width = .2, color = 'yellow') +
  geom_point(color = 'red') +
  geom_point(aes(y = value), data = tbl_results_final_models, color = 'green') +
  ylab("Model") +
  ggdark::dark_theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  ) +
  #ylim(0.80,0.95)+
  labs(
    title = "SELECTED MODELS: Variation vs. Final Results on CTU19B",
    subtitle = "Metrics: Mean (in red) and CI (in yellow). Green: result on CTU19B",
    caption = ""
  )
final_models_plot