Create dataset
# Number of rows whose State sequence is non-empty
ctu19 %>% filter(State!="") %>% nrow()
[1] 27545
# Column names of the dataset
names(ctu19)
[1] "State" "class" "modelsize"
# Per-column summary; class is converted to a factor so skim reports it as one
skim(ctu19 %>% mutate(class=as.factor(class)))
── Data Summary ────────────────────────
Values
Name ctu19 %>% mutate(class = ...
Number of rows 27545
Number of columns 3
_______________________
Column type frequency:
character 1
factor 1
numeric 1
________________________
Group variables None
── Variable type: character ──────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 State 0 1 1 99997 0 16663 0
── Variable type: factor ─────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate ordered n_unique top_counts
1 class 0 1 FALSE 2 Bot: 25945, Nor: 1600
── Variable type: numeric ────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
1 modelsize 0 1 663. 6156. 1 12 56 221 99997 ▇▁▁▁▁
#' Plot the relative frequency of each symbol in a set of sequences
#'
#' Counts how often every symbol of a fixed 51-symbol alphabet occurs across
#' all strings in `x`, divides each count by the total number of characters in
#' `x`, and draws the result as a bar chart.
#'
#' @param x Character vector of sequences. May be empty, in which case every
#'   frequency is reported as 0.
#' @return A ggplot object with one bar per symbol (x = symbol,
#'   y = relative frequency).
create_histogram <- function(x) {
  # Alphabet: "$", letters a-i / A-I / r-z / R-Z, the digits, then . , + *
  valid_characters <- c(
    str_split("$abcdefghiABCDEFGHIrstuvwxyzRSTUVWXYZ0123456789", "")[[1]],
    ".", ",", "+", "*"
  )

  # Symbols are matched literally with fixed(): as regular expressions "$"
  # would match the end of every string, and ". + *" would need escaping.
  counts <- vapply(
    valid_characters,
    function(symbol) sum(str_count(x, fixed(symbol))),
    numeric(1),
    USE.NAMES = FALSE
  )

  # Here "." IS a regular expression (any character), so this is the total
  # number of characters over all sequences.
  total_chars <- sum(str_count(x, "."))

  # Avoid 0 / 0 = NaN bars when there are no characters at all.
  freq <- if (isTRUE(total_chars > 0)) counts / total_chars else counts

  data.frame(freq = freq, symbols = valid_characters) %>%
    ggplot() +
    geom_col(aes(x = symbols, y = freq), fill = "black", col = "black") +
    theme_bw()
}
# Symbol distribution of short sequences (modelsize < 100), one plot per class.

# Normal sequences
n <- ctu19 %>%
  filter(class == "Normal" & modelsize < 100) %>%
  pull(State)
nh <- create_histogram(n) +
  labs(title = "CTU19 seq char distribution for Normal [modelsize <100]")

# Everything that is not labelled Normal is treated as malware
m <- ctu19 %>%
  filter(class != "Normal" & modelsize < 100) %>%
  pull(State)
mh <- create_histogram(m) +
  labs(title = "CTU19 seq char distribution for Malware [modelsize <100]")

# Stack both plots in one figure
gridExtra::grid.arrange(nh, mh)

#source("preprocess.R")
#datasets<-build_train_test(datasetfile = "datasets/ctu13subs.csv",maxlen = ctu_maxlen)
# WARNING: to avoid regenerating the train and test sets, just uncomment the following lines
# WARNING: there is no guarantee that the saved files correspond to argencon.csv. If unsure, just re-run build_train_test()
# Restore the previously saved train/test objects into the workspace
load(file = 'datasets/.train_dataset_keras.rd')
load(file = 'datasets/.test_dataset_keras.rd')
# Bundle both splits under a single `datasets` list
datasets <- list(
  train = train_dataset_keras,
  test = test_dataset_keras
)
### Function Definitions ####
#' Turn model output probabilities into hard 0/1 predictions
#'
#' @param model Fitted model object accepted by `predict()`.
#' @param test_dataset_x Input data passed to `predict()`.
#' @param threshold Probabilities strictly above this value map to 1,
#'   everything else to 0.
#' @return Object of the same shape as the `predict()` output, holding 0/1.
get_predictions <- function(model, test_dataset_x, threshold = 0.5) {
  probabilities <- predict(model, test_dataset_x, batch_size = 256)
  ifelse(probabilities > threshold, 1, 0)
}
# Print the layer-by-layer summary of the model
summary(model)
_______________________________________________________________________________________________________________________________________________________
Layer (type) Output Shape Param #
=======================================================================================================================================================
input_1 (InputLayer) (None, 400) 0
_______________________________________________________________________________________________________________________________________________________
embedding (Embedding) (None, 400, 128) 6528
_______________________________________________________________________________________________________________________________________________________
lstm (LSTM) (None, 128) 131584
_______________________________________________________________________________________________________________________________________________________
dropout (Dropout) (None, 128) 0
_______________________________________________________________________________________________________________________________________________________
dense (Dense) (None, 1) 129
=======================================================================================================================================================
Total params: 138,241
Trainable params: 138,241
Non-trainable params: 0
_______________________________________________________________________________________________________________________________________________________
# One row per test sample: predicted class, true class (0 when the label
# contains "Normal", 1 otherwise), the sequence itself and its original label.
test_results <- data.frame(
  predicted_class = preds,
  class = ifelse(grepl("Normal", datasets$test$label), 0, 1),
  domain = datasets$test$domain,
  label = datasets$test$label
)
#test_results
# Confusion matrix and derived metrics, with class 1 as the positive class
caret::confusionMatrix(
  data = as.factor(test_results$predicted_class),
  reference = as.factor(test_results$class),
  positive = '1',
  mode = "everything"
)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 385 74
1 95 7710
Accuracy : 0.9795
95% CI : (0.9763, 0.9825)
No Information Rate : 0.9419
P-Value [Acc > NIR] : <2e-16
Kappa : 0.8092
Mcnemar's Test P-Value : 0.1239
Sensitivity : 0.9905
Specificity : 0.8021
Pos Pred Value : 0.9878
Neg Pred Value : 0.8388
Precision : 0.9878
Recall : 0.9905
F1 : 0.9892
Prevalence : 0.9419
Detection Rate : 0.9330
Detection Prevalence : 0.9445
Balanced Accuracy : 0.8963
'Positive' Class : 1
# Split the test results into the four confusion-matrix cells.
# modelsize is the sequence length: the pattern "." is a regular expression
# that matches any character, so str_count() counts all of them.
sized_results <- test_results %>% mutate(modelsize = str_count(domain, "."))

fp <- sized_results %>% filter(class == 0 & predicted_class == 1)
fn <- sized_results %>% filter(class == 1 & predicted_class == 0)
tp <- sized_results %>% filter(class == 1 & predicted_class == 1)
tn <- sized_results %>% filter(class == 0 & predicted_class == 0)

# Sequences belonging to each cell
fp_domain <- fp[["domain"]]
tp_domain <- tp[["domain"]]
fn_domain <- fn[["domain"]]
tn_domain <- tn[["domain"]]
# Symbol distribution for each confusion-matrix cell, one plot per cell
fp_h <- create_histogram(fp_domain) +
  labs(title = "CTU19 seq char distribution for False Positive")
#fp_h <- fp_h + ylim(0,500)
tp_h <- create_histogram(tp_domain) +
  labs(title = "CTU19 seq char distribution for True Positive")
#tp_h <- tp_h + ylim(0,500)
fn_h <- create_histogram(fn_domain) +
  labs(title = "CTU19 seq char distribution for False Negative")
tn_h <- create_histogram(tn_domain) +
  labs(title = "CTU19 seq char distribution for True Negative")

# Stack the four plots in a single column
gridExtra::grid.arrange(fp_h, tp_h, fn_h, tn_h, ncol = 1)

#library(scales)
#tp_h + scale_y_continuous(limits=c(0,1000),oob = rescale_none)
#fp_h + scale_y_continuous(limits=c(0,1000),oob = rescale_none)

PCA 3D projection
# Interactive 3D scatter of the first three principal components: colour
# encodes `res`, marker symbol encodes `label`, hover text shows `domain`.
plotly::plot_ly(
  pca_data,
  type = "scatter3d",
  # Stating the mode explicitly avoids plotly's "No scatter3d mode specified"
  # message; markers is what plotly falls back to anyway.
  mode = "markers",
  x = ~PC1, y = ~PC2, z = ~PC3,
  color = ~res, symbol = ~label,
  colors = c('blue', 'orange', "red", "green"),
  opacity = 0.5,
  marker = list(size = 3),
  text = ~domain
)
No scatter3d mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
