Selecting CNN best_model
Executing Rscript evaluate_dga_classifier.R --modelid=1 --tune --experimenttag=cnn_tune_sanson
Listing available datasets
ls -1 datasets/JISA2018*.csv.gz
datasets/JISA2018.csv.gz
datasets/JISA2018-cz-dataset.csv.gz
datasets/JISA2018-dganormal-dataset.csv.gz
datasets/JISA2018-dganormal-dataset-full.csv.gz
datasets/JISA2018-std-dataset.csv.gz
The standard-dataset refers to the dataset used in the argencon paper (OSINT+ALEXA)
The dataset contains neither .cz domains nor normaldga domains
Creating the model using standard datasets (i.e. OSINT and ALEXA)
Preparing dataset
# Run preprocess.R, then build the dataset object from the standard
# dataset file using maxlen = 45.
# NOTE(review): build_train_test is presumably defined by preprocess.R - confirm.
source("preprocess.R")
datasets <- build_train_test(
  datasetfile = "datasets/JISA2018-std-dataset.csv.gz",
  maxlen = 45
)
# Hyper-parameter values selected for the CNN model.
# The element names (including the spelling "embedingdim") are kept exactly
# as they are, since other code looks them up by name.
selected_parameters <- list(
  nb_filter = 128,
  kernel_size = 8,
  embedingdim = 100,
  hidden_size = 1024
)
Applying 5-fold cross-validation
# Show the cross-validation results table.
result_cv$result

# Box-plot the values of the four metrics of interest, one box per metric.
result_cv$result %>%
  filter(
    metric %in% c("Sensitivity", "Specificity", "F1", "Balanced Accuracy")
  ) %>%
  ggplot() +
  geom_boxplot(aes(x = metric, y = value, fill = metric)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Applying a 2-fold evaluation (train/test)
Required for saving the model and using it in further experiments
# Print the model summary (layers, output shapes and parameter counts).
summary(std_dataset_model)
____________________________________________________________________________________________________________________________________________________________________________
Layer (type) Output Shape Param #
============================================================================================================================================================================
input_1 (InputLayer) (None, 45) 0
____________________________________________________________________________________________________________________________________________________________________________
embedding_1 (Embedding) (None, 45, 100) 4000
____________________________________________________________________________________________________________________________________________________________________________
conv1d_1 (Conv1D) (None, 38, 128) 102528
____________________________________________________________________________________________________________________________________________________________________________
flatten_1 (Flatten) (None, 4864) 0
____________________________________________________________________________________________________________________________________________________________________________
dense_1 (Dense) (None, 1024) 4981760
____________________________________________________________________________________________________________________________________________________________________________
dense_2 (Dense) (None, 1) 1025
============================================================================================================================================================================
Total params: 5,089,313
Trainable params: 5,089,313
Non-trainable params: 0
____________________________________________________________________________________________________________________________________________________________________________
Testing on normal cz domains (i.e. all domains are normal)
# Read the .cz dataset and show how many rows each label has.
cz_dataset <- read_csv("./datasets/JISA2018-cz-dataset.csv.gz")
cz_dataset %>%
  group_by(label) %>%
  summarise(n = n())

# Keep only the text before the first dot of each domain.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
cz_dataset$domain1 <- str_split(cz_dataset$domain, "\\.", simplify = TRUE)[, 1]

# Build the model input with the same maxlen (45) passed to build_train_test.
cz_dataset_tokenized <- build_dataset(as.matrix(cz_dataset), maxlen = 45)
cz_dataset_x <- cz_dataset_tokenized$encode

# Ground-truth class: 0 when the label contains "normal", 1 otherwise.
cz_dataset_y <- ifelse(grepl("normal", cz_dataset_tokenized$label), 0, 1)
Calculating Recall per class (0,1)
# Get the model's predictions for the .cz domains (threshold = 0.9).
preds <- get_predictions(std_dataset_model, cz_dataset_x, threshold = 0.9)

# Plot the share of domains assigned to each predicted class.
# melt() is left to pick its id variable itself, so it still prints its
# "Using preds as id variables" message.
data.frame(table(preds)) %>%
  mutate(recall = Freq / sum(Freq)) %>%
  select(-Freq) %>%
  reshape2::melt() %>%
  ggplot() +
  geom_col(aes(x = variable, y = value, fill = preds)) +
  theme_bw()
Using preds as id variables

Printing Domains Not correctly detected as normal
# List the .cz domains whose predicted class is 1.
data.frame(
  predicted_class = preds,
  class = cz_dataset_y,
  domain = cz_dataset_tokenized$domain
) %>%
  filter(predicted_class == 1) %>%
  select(domain)
Testing on normal dga domains
# Read the full normal-DGA dataset and show how many rows each label has.
dga_normal_dataset <- read_csv(
  "./datasets/JISA2018-dganormal-dataset-full.csv.gz"
)
dga_normal_dataset %>%
  group_by(label) %>%
  summarise(n = n())

# Keep only the text before the first dot of each domain.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dga_normal_dataset$domain1 <- str_split(
  dga_normal_dataset$domain,
  "\\.",
  simplify = TRUE
)[, 1]

# Build the model input with the same maxlen (45) passed to build_train_test.
dga_normal_dataset_tokenized <- build_dataset(
  as.matrix(dga_normal_dataset),
  maxlen = 45
)
dga_normal_dataset_x <- dga_normal_dataset_tokenized$encode

# Ground-truth class: 0 when the label contains "normal", 1 otherwise.
dga_normal_dataset_y <- ifelse(
  grepl("normal", dga_normal_dataset_tokenized$label),
  0,
  1
)
Calculating (plotting) Recall per class (0,1)
# Get the model's predictions for the normal-DGA domains (threshold = 0.9).
preds <- get_predictions(
  std_dataset_model,
  dga_normal_dataset_x,
  threshold = 0.9
)

# Plot the share of domains assigned to each predicted class.
# melt() is left to pick its id variable itself, so it still prints its
# "Using preds as id variables" message.
data.frame(table(preds)) %>%
  mutate(recall = Freq / sum(Freq)) %>%
  select(-Freq) %>%
  reshape2::melt() %>%
  ggplot() +
  geom_col(aes(x = variable, y = value, fill = preds)) +
  theme_bw()
Using preds as id variables

Printing Domains Not correctly detected as normal
Plotting recall by normalDga type
# Plot the recall that calculate_recall() reports for each label.
calculate_recall(
  data.frame(
    label = dga_normal_dataset_tokenized$label,
    class = dga_normal_dataset_y,
    predicted_class = preds
  )
) %>%
  ggplot() +
  geom_col(aes(x = label, y = recall, fill = label)) +
  theme_bw() +
  ylab("Sensitivity (AKA Recall)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # The bars map `fill`, not `color`, so the legend has to be switched off on
  # the fill guide; it only repeats the x-axis labels.
  guides(fill = "none")

LS0tCnRpdGxlOiAiSklTQSAyMDE4IgpvdXRwdXQ6IAogIGh0bWxfbm90ZWJvb2s6IAogICAgY29kZV9mb2xkaW5nOiBoaWRlCi0tLQoKYGBge3IgY3JlYXRlIG5vcm1hbGRnYSBkYXRhc2V0LCBldmFsPUZBTFNFLCBpbmNsdWRlPUZBTFNFfQojdmltIHJlZ2V4cHMKIyAlcy9cKFx3XCtcLlx3XCtcLiRcKS9cMVx0XDEvZyAoc2VwYXJhdGUgdG9wbGV2ZWwgZG9tYWlucykKIyAlcy9cKFx3XCtcKVwuXHdcKyQvbm9ybWFsLlwxL2cgKGNyYXRlIG5vcm1hbCBsYWJlbCkKCmxpYnJhcnkobHVicmlkYXRlKW5vcm1hbGRnYV9yYXc8LXJlYWRfZGVsaW0oImRhdGFzZXRzL05vcm1hbERHQS9XaGFsZWJvbmUvZG9tYWluc19hbGwiLCBkZWxpbT0iXHQiLCBjb2xfbmFtZXMgPSBGKQpuYW1lcyhub3JtYWxkZ2FfcmF3KTwtYygiZG9tYWluIiwibGFiZWwiKQpub3JtYWxkZ2FfcmF3ICU+JSBncm91cF9ieShsYWJlbCkgJT4lIHN1bW1hcmlzZShuPW4oKSkKbm9ybWFsZGdhX3JhdzwtY2JpbmQobm9ybWFsZGdhX3JhdyxkYXRlPXRvZGF5KCksaWQ9c2VxKDEwMjQsMTAyMytucm93KG5vcm1hbGRnYV9yYXcpKSkKd3JpdGVfY3N2KG5vcm1hbGRnYV9yYXcscGF0aCA9ICJkYXRhc2V0cy9KSVNBMjAxOC1kZ2Fub3JtYWwtZGF0YXNldC1mdWxsLmNzdiIpCgpgYGAKCgojIFNlbGVjdGluZyBDTk4gYmVzdF9tb2RlbApFeGVjdXRpbmcgYFJzY3JpcHQgZXZhbHVhdGVfZGdhX2NsYXNzaWZpZXIuUiAtLW1vZGVsaWQ9MSAtLXR1bmUgIC0tZXhwZXJpbWVudHRhZz1jbm5fdHVuZV9zYW5zb24gYApgYGB7cn0KcmVzPC1yZWFkX2NzdigicmVzdWx0cy9yZXN1bHRzX3R1bmluZ19jbm5fdHVuZV9zYW5zb24uY3N2IikKcmVzICU+JSBhcnJhbmdlKGRlc2MoRjEpKSAlPiUgc2VsZWN0KEYxKSAlPiUgaGVhZCgxKQpgYGAKCiMgTGlzdGluZyBhdmFpbGFibGUgZGF0YXNldHMKYGBge2Jhc2ggfQpscyAtMSBkYXRhc2V0cy9KSVNBMjAxOCouY3N2Lmd6CmBgYAoKIyBUaGUgc3RhbmRhcmQtZGF0YXNldCByZWZlcnMgdG8gdGhlIGRhdGFzZXQgdXNlZCBpbiB0aGUgYXJnZW5jb24gcGFwZXIgKE9TSU5UK0FMRVhBKQpUaGUgZGF0YXNldCBkbyBub3QgY29udGFpbiBuZWl0aGVyIC5jeiBkb21haW5zIG5vciBub3JtYWxkZ2EKYGBge3J9CnN0ZF9kYXRhc2V0X2NoZWNrPC1yZWFkX2NzdigiZGF0YXNldHMvSklTQTIwMTgtc3RkLWRhdGFzZXQuY3N2Lmd6IikKc3RkX2RhdGFzZXRfY2hlY2sgJT4lIGdyb3VwX2J5KGxhYmVsKSAlPiUgc3VtbWFyaXNlKG49bigpKQpgYGAKCgojIENyZWF0aW5nIHRoZSBtb2RlbCB1c2luZyBzdGFuZGFyZCBkYXRhc2V0cyAoaS5lLiBPU0lOVCBhbmQgQUxFWEEpCiMjIFByZXBhcmluZyBkYXRhc2V0CmBgYHtyfQpzb3VyY2UoInByZXByb2Nlc3MuUiIpCmRhdGFzZXRzPC1idWlsZF90cmFpbl90ZXN0KGRhdGFzZXRmaWxlID0gImRhdGFzZXRzL0pJU0EyMDE4LXN0ZC1kYXRhc2V0LmNzdi5neiIsbWF4bGVuID0gNDUpCmBgYAoKYGBge3J9CnNlbGVjdGVkX3Bh
cmFtZXRlcnM9bGlzdCgKICBuYl9maWx0ZXIgPSAxMjgsCiAga2VybmVsX3NpemUgPSA4LAogIGVtYmVkaW5nZGltID0gMTAwLAogIGhpZGRlbl9zaXplID0gMTAyNAopCgpgYGAKCiMjIGFwcGx5IDUtZm9sZHMgQ3Jvc3MgdmFsaWRhdGlvbgpgYGB7cn0KcmVzdWx0X2N2PC1ldmFsdWF0ZV9tb2RlbF9jdihkYXRhID0gZGF0YXNldHMkdHJhaW4sayA9IDUgLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgIG1vZGVsZnVuID0ga2VyYXNfbW9kZWxfY25uX2FyZ2VuY29uLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgIGV4cGVyaW1lbnRuYW1lID0gInN0ZC1kYXRhc2V0IiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICBtb2RlbF9wYXJhbWV0ZXJzID0gc2VsZWN0ZWRfcGFyYW1ldGVycykKCnJlc3VsdF9jdiRyZXN1bHQgCnJlc3VsdF9jdiRyZXN1bHQgJT4lICBmaWx0ZXIobWV0cmljPT0iU2Vuc2l0aXZpdHkiIHwgbWV0cmljID09ICJTcGVjaWZpY2l0eSIgfCBtZXRyaWM9PSJGMSIgfCBtZXRyaWM9PSJCYWxhbmNlZCBBY2N1cmFjeSIpICU+JQogIGdncGxvdCgpKwogIGdlb21fYm94cGxvdChhZXMoeD1tZXRyaWMseT12YWx1ZSxmaWxsPW1ldHJpYykpKwogIHRoZW1lX2J3KCkrCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgaGp1c3QgPSAxKSkKYGBgCgojIEFwcGx5aW5nIGEgMi1mb2xkIGV2YWx1YXRpb24gKHRyYWluL3Rlc3QpClJlcXVpcmVkIGZvciBzYXZpbmcgdGhlIG1vZGVsIGFuZCB1c2UgaXQgaW4gZnVydGhlciBleHBlcmltZW50cwoKYGBge3J9CnJlc3VsdF90cmFpbl90ZXN0PC1ldmFsdWF0ZV9tb2RlbF90cmFpbl90ZXN0KHRyYWluX2RhdGFzZXRfa2VyYXMgPSBkYXRhc2V0cyR0cmFpbiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdGVzdF9kYXRhc2V0X2tlcmFzID0gZGF0YXNldHMkdGVzdCwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgbW9kZWxmdW4gPSBrZXJhc19tb2RlbF9jbm5fYXJnZW5jb24sCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGV4cGVyaW1lbnRuYW1lID0gInN0ZC1kYXRhc2V0IiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgbW9kZWxfcGFyYW1ldGVycz0gc2VsZWN0ZWRfcGFyYW1ldGVycykKc3RkX2RhdGFzZXRfbW9kZWw8LXJlc3VsdF90cmFpbl90ZXN0JG1vZGVsX2xlYXJuZWQkbW9kZWwKc3RkX2RhdGFzZXRfbW9kZWw8LWxvYWRfbW9kZWxfaGRmNSgic3RkLWRhdGFzZXRfbW9kZWwuaDUiKQpzdW1tYXJ5KHN0ZF9kYXRhc2V0X21vZGVsKQpgYGAKCiMgVGVzdGluZyBvbiBub3JtYWwgY3ogZG9tYWlucyAoaS5lLiBhbGwgZG9tYWlucyBhcmUgbm9ybWFsKQoKYGBge3J9CmN6X2RhdGFzZXQ8LXJlYWRfY3N2KCIuL2RhdGFzZXRzL0pJU0EyMDE4LWN6LWRhdGFzZXQuY3N2Lmd6IikKY3pfZGF0YXNldCAl
PiUgZ3JvdXBfYnkobGFiZWwpICU+JSBzdW1tYXJpc2Uobj1uKCkpCmN6X2RhdGFzZXQkZG9tYWluMTwtc3RyX3NwbGl0KGN6X2RhdGFzZXQkZG9tYWluLCJcXC4iLHNpbXBsaWZ5ID0gVClbLDFdCmN6X2RhdGFzZXRfdG9rZW5pemVkPC1idWlsZF9kYXRhc2V0KGFzLm1hdHJpeChjel9kYXRhc2V0KSxtYXhsZW49NDUpCmN6X2RhdGFzZXRfeDwtY3pfZGF0YXNldF90b2tlbml6ZWQkZW5jb2RlCmN6X2RhdGFzZXRfeTwtaWZlbHNlKGdyZXBsKCJub3JtYWwiLGN6X2RhdGFzZXRfdG9rZW5pemVkJGxhYmVsKSAsMCwxKQpgYGAKCgojIyBDYWxjdWFsdGluZyBSZWNhbGwgcGVyIGNsYXNzICgwLDEpCmBgYHtyfQpwcmVkczwtZ2V0X3ByZWRpY3Rpb25zKHN0ZF9kYXRhc2V0X21vZGVsLGN6X2RhdGFzZXRfeCx0aHJlc2hvbGQgPSAwLjkgKQp0YWJsZShwcmVkcykgJT4lIGRhdGEuZnJhbWUoKSAlPiUgbXV0YXRlKHRvdGFsPXN1bShGcmVxKSxyZWNhbGw9RnJlcS90b3RhbCkgJT4lIHNlbGVjdCgtdG90YWwsLUZyZXEpICU+JSByZXNoYXBlMjo6bWVsdCgpICU+JQogIAogIAogIGdncGxvdCgpKwogIGdlb21fY29sKGFlcyh4PXZhcmlhYmxlLHk9dmFsdWUsZmlsbD1wcmVkcykpKwogIHRoZW1lX2J3KCkKYGBgCgojIyBQcmludGluZyBEb21haW5zIE5vdCBjb3JyZWN0bHkgZGV0ZWN0ZWQgYXMgbm9ybWFsCmBgYHtyfQpkYXRhLmZyYW1lKHByZWRpY3RlZF9jbGFzcz1wcmVkcyxjbGFzcz1jel9kYXRhc2V0X3ksZG9tYWluPWN6X2RhdGFzZXRfdG9rZW5pemVkJGRvbWFpbikgJT4lIGZpbHRlciAocHJlZGljdGVkX2NsYXNzPT0xKSAlPiUgc2VsZWN0KGRvbWFpbikKYGBgCgoKIyBUZXN0aW5nIG9uIG5vcm1hbCBkZ2EgZG9tYWlucyAKCmBgYHtyfQpkZ2Ffbm9ybWFsX2RhdGFzZXQ8LXJlYWRfY3N2KCIuL2RhdGFzZXRzL0pJU0EyMDE4LWRnYW5vcm1hbC1kYXRhc2V0LWZ1bGwuY3N2Lmd6IikKZGdhX25vcm1hbF9kYXRhc2V0ICU+JSBncm91cF9ieShsYWJlbCkgJT4lIHN1bW1hcmlzZShuPW4oKSkKZGdhX25vcm1hbF9kYXRhc2V0JGRvbWFpbjE8LXN0cl9zcGxpdChkZ2Ffbm9ybWFsX2RhdGFzZXQkZG9tYWluLCJcXC4iLHNpbXBsaWZ5ID0gVClbLDFdCmRnYV9ub3JtYWxfZGF0YXNldF90b2tlbml6ZWQ8LWJ1aWxkX2RhdGFzZXQoYXMubWF0cml4KGRnYV9ub3JtYWxfZGF0YXNldCksbWF4bGVuPTQ1KQpkZ2Ffbm9ybWFsX2RhdGFzZXRfeDwtZGdhX25vcm1hbF9kYXRhc2V0X3Rva2VuaXplZCRlbmNvZGUKZGdhX25vcm1hbF9kYXRhc2V0X3k8LWlmZWxzZShncmVwbCgibm9ybWFsIixkZ2Ffbm9ybWFsX2RhdGFzZXRfdG9rZW5pemVkJGxhYmVsKSAsMCwxKQpgYGAKCiMjIENhbGN1bGF0aW5nIChwbG90aW5nKSBSZWNhbGwgcGVyIGNsYXNzICgwLDEpCmBgYHtyfQpwcmVkczwtZ2V0X3ByZWRpY3Rpb25zKHN0ZF9kYXRhc2V0X21vZGVsLGRnYV9ub3JtYWxfZGF0YXNldF94LHRocmVzaG9sZCA9IDAuOSApCnRhYmxlKHByZWRzKSAlPiUgZGF0YS5mcmFtZSgp
ICU+JSBtdXRhdGUodG90YWw9c3VtKEZyZXEpLHJlY2FsbD1GcmVxL3RvdGFsKSAlPiUgc2VsZWN0KC10b3RhbCwtRnJlcSkgJT4lIHJlc2hhcGUyOjptZWx0KCkgJT4lCiAgZ2dwbG90KCkrCiAgZ2VvbV9jb2woYWVzKHg9dmFyaWFibGUseT12YWx1ZSxmaWxsPXByZWRzKSkrCiAgdGhlbWVfYncoKQpgYGAKCiMjIFByaW50aW5nIERvbWFpbnMgTm90IGNvcnJlY3RseSBkZXRlY3RlZCBhcyBub3JtYWwKYGBge3J9CmRhdGEuZnJhbWUocHJlZGljdGVkX2NsYXNzPXByZWRzLGNsYXNzPWRnYV9ub3JtYWxfZGF0YXNldF95LGRvbWFpbj1kZ2Ffbm9ybWFsX2RhdGFzZXRfdG9rZW5pemVkJGRvbWFpbikgJT4lIGZpbHRlciAocHJlZGljdGVkX2NsYXNzPT0xKSAlPiUgc2VsZWN0KGRvbWFpbikKYGBgCgojIyBQbG90aW4gcmVjYWxsIGJ5IG5vcm1hbERnYSB0eXBlCmBgYHtyfQpjYWxjdWxhdGVfcmVjYWxsKGRhdGEuZnJhbWUobGFiZWw9ZGdhX25vcm1hbF9kYXRhc2V0X3Rva2VuaXplZCRsYWJlbCwgY2xhc3M9ZGdhX25vcm1hbF9kYXRhc2V0X3kscHJlZGljdGVkX2NsYXNzPXByZWRzKSklPiUKICBnZ3Bsb3QoKSsKICBnZW9tX2NvbChhZXMoeD1sYWJlbCx5PXJlY2FsbCxmaWxsPWxhYmVsKSkrCiAgdGhlbWVfYncoKSt5bGFiKCJTZW5zaXRpdml0eSAoQUtBIFJlY2FsbCkiKSsKICB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfdGV4dChhbmdsZSA9IDQ1LCBoanVzdCA9IDEpKSsKICBndWlkZXMoY29sb3I9RkFMU0UpCmBgYAoK