library(dplyr)
library(ggplot2)
library(lattice)
library(tidyr)
library(caret)
library(randomForest)
We load the dataset; the auto-generated X1 index column is dropped right after.
ctu19 <- readr::read_delim(file = "//home/harpo/Dropbox/ongoing-work/git-repos/deepactivelearning/datasets/ctu19_result.csv", delim = ",")
Missing column names filled in: 'X1' [1]
Parsed with column specification:
cols(
  X1 = col_double(),
  InitialIp = col_character(),
  EndIP = col_character(),
  Port = col_double(),
  Proto = col_character(),
  State = col_character(),
  LabelName = col_character()
)
22184 parsing failures.
 row  col expected               actual file
2940 Port no trailing characters x00d7  '//home/harpo/Dropbox/ongoing-work/git-repos/deepactivelearning/datasets/ctu19_result.csv'
2941 Port no trailing characters x00db  '//home/harpo/Dropbox/ongoing-work/git-repos/deepactivelearning/datasets/ctu19_result.csv'
2942 Port no trailing characters x00ea  '//home/harpo/Dropbox/ongoing-work/git-repos/deepactivelearning/datasets/ctu19_result.csv'
2943 Port no trailing characters x00ff  '//home/harpo/Dropbox/ongoing-work/git-repos/deepactivelearning/datasets/ctu19_result.csv'
2944 Port no trailing characters x01d1  '//home/harpo/Dropbox/ongoing-work/git-repos/deepactivelearning/datasets/ctu19_result.csv'
.... .... ...................... ...... ..........................................................................................
See problems(...) for more details.
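The parsing failures are worth a look before moving on: readr keeps them attached to the tibble, and the failing Port values look hexadecimal, so one workaround (a hypothetical fix, not applied here) is to read the column as character via col_types.
readr::problems(ctu19) %>% head()
#ctu19 <- readr::read_delim(file = "...", delim = ",",
#                           col_types = readr::cols(Port = readr::col_character()))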
ctu19 <- ctu19 %>% select(-X1)
ctu19 %>% head(100) # careful: displaying a large dataframe in a notebook can sometimes blow it up ;-)
We discard the flows whose State string has 4 symbols or fewer.
ctu19 <- ctu19 %>% filter(nchar(State) > 4) # pipe version: note that filter does not need the dataframe as an explicit argument
### Original version
#ctu19 <- filter(ctu19,nchar(ctu19$State) > 4)
Next we strip the first four symbols (the connection-state prefix) from each State string.
ctu19$State <- ctu19$State %>% substr(5,nchar(ctu19$State)) # pipe version; starting at 5 drops the first 4 symbols, matching the loop below
### Original version
#for (i in 1:length(ctu19$State)){
# ctu19$State[i] <- substr(ctu19$State[i],5,nchar(ctu19$State[i]))
#}
The final version of the dataset looks like this:
ctu19 %>% head(100)
We build the symbol lookup tables used to create the feature vectors: each non-timing State symbol encodes a size, a duration and a periodicity class.
# size classes
size_small <- c('a','b','c','A','B','C','r','s','t','R','S','T',1,2,3)
size_medium <- c('d','e','f','D','E','F','u','v','w','U','V','W',4,5,6)
size_large <- c('g','h','i','G','H','I','x','y','z','X','Y','Z',7,8,9)
# periodicity classes
strong_per <- c('a','b','c','d','e','f','g','h','i')
weak_per <- c('A','B','C','D','E','F','G','H','I')
weak_nonper <- c('r','s','t','u','v','w','x','y','z')
strong_nonper <- c('R','S','T','U','V','W','X','Y','Z')
no_data <- c(1,2,3,4,5,6,7,8,9)
# duration classes
dur_short <- c('a','A','r','R',1,'d','D','u','U',4,'g','G','x','X',7)
dur_med <- c('b','B','s','S',2,'e','E','v','V',5,'h','H','y','Y',8)
dur_long <- c('c','C','t','T',3,'f','F','w','W',6,'i','I','z','Z',9)
# timing symbols between flows (not counted as state symbols)
time_simbol <- c('.',',','+','*',0)
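A quick worked example on a hypothetical State string "a*b*Z": timing symbols are skipped; 'a' and 'b' are small-size, strongly periodic symbols, and 'Z' is large and strongly non-periodic.
s <- "a*b*Z"                             # hypothetical State string
chars <- strsplit(s, "")[[1]]
chars <- chars[!chars %in% time_simbol]  # "a" "b" "Z": timing symbols dropped
mean(chars %in% size_small)              # 0.667, the fraction that will become ft7
mean(chars %in% strong_nonper)           # 0.333, the fraction that will become ft3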
We create the feature vectors: for each flow, the fraction of non-timing State symbols that falls into each of the ten classes.
ft0 <- c()
ft1 <- c()
ft2 <- c()
ft3 <- c()
ft4 <- c()
ft5 <- c()
ft6 <- c()
ft7 <- c()
ft8 <- c()
ft9 <- c()
for (i in 1:length(ctu19$State)){
  feature_vector <- c(0,0,0,0,0,0,0,0,0,0)
  longitud_cadena <- 0  # number of non-timing symbols in this State
  for (j in 1:nchar(ctu19$State[i])){
    simbolo <- substr(ctu19$State[i],j,j)
    if (!is.element(simbolo,time_simbol)){
      longitud_cadena <- longitud_cadena + 1
    }
    # size: positions 8-10 of the feature vector
    if (is.element(simbolo,size_small)){
      feature_vector[8] <- feature_vector[8] + 1
    } else if (is.element(simbolo,size_medium)){
      feature_vector[9] <- feature_vector[9] + 1
    } else if (is.element(simbolo,size_large)){
      feature_vector[10] <- feature_vector[10] + 1
    }
    # duration: positions 5-7
    if (is.element(simbolo,dur_short)){
      feature_vector[5] <- feature_vector[5] + 1
    } else if (is.element(simbolo,dur_med)){
      feature_vector[6] <- feature_vector[6] + 1
    } else if (is.element(simbolo,dur_long)){
      feature_vector[7] <- feature_vector[7] + 1
    }
    # periodicity: positions 1-4
    if (is.element(simbolo,strong_per)){
      feature_vector[1] <- feature_vector[1] + 1
    } else if (is.element(simbolo,weak_per)){
      feature_vector[2] <- feature_vector[2] + 1
    } else if (is.element(simbolo,weak_nonper)){
      feature_vector[3] <- feature_vector[3] + 1
    } else if (is.element(simbolo,strong_nonper)){
      feature_vector[4] <- feature_vector[4] + 1
    }
  }
  # normalize the counts by the number of non-timing symbols
  feature_vector <- round(feature_vector/longitud_cadena,3)
  ft0 <- c(ft0,feature_vector[1])   # strong periodicity
  ft1 <- c(ft1,feature_vector[2])   # weak periodicity
  ft2 <- c(ft2,feature_vector[3])   # weak non-periodicity
  ft3 <- c(ft3,feature_vector[4])   # strong non-periodicity
  ft4 <- c(ft4,feature_vector[5])   # short duration
  ft5 <- c(ft5,feature_vector[6])   # medium duration
  ft6 <- c(ft6,feature_vector[7])   # long duration
  ft7 <- c(ft7,feature_vector[8])   # small size
  ft8 <- c(ft8,feature_vector[9])   # medium size
  ft9 <- c(ft9,feature_vector[10])  # large size
}
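For reference, the same fractions can be computed without growing vectors inside a loop; a minimal vectorized sketch using stringr (an assumption: stringr is not loaded above) and the same lookup tables:
library(stringr)
class_frac <- function(states, symbols) {
  # fraction of each State's non-timing symbols that fall in `symbols`
  str_count(states, paste0("[", paste(symbols, collapse = ""), "]")) /
    str_count(states, "[^.,+*0]")
}
#class_frac(ctu19$State, strong_per) reproduces ft0 (before rounding)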
We attach them to ctu19.
ctu19 <- cbind(ctu19, ft0, ft1, ft2, ft3, ft4, ft5, ft6, ft7, ft8, ft9)
We drop every remaining column, keeping only LabelName and the feature vectors.
ctu19 <- ctu19 %>% select(-InitialIp,-EndIP,-Proto,-State,-Port)
ctu19$LabelName <- as.factor(ctu19$LabelName)
ctu19 %>% head(100)
Train 70% - Test 30%
train_data_ind <- createDataPartition(ctu19$LabelName, p = 0.7,list = FALSE)
train_data <- ctu19[train_data_ind,]
test_data <- ctu19[-train_data_ind,]
nrow(train_data)
[1] 19191
nrow(test_data)
[1] 8223
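createDataPartition does stratified sampling on the factor you pass it, so both partitions should keep roughly the same Botnet/Normal proportions; a quick check with base R:
prop.table(table(train_data$LabelName))
prop.table(table(test_data$LabelName))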
We see a large class imbalance in ctu19: many Botnet flows and very few Normal ones.
train_data %>% group_by(LabelName) %>% summarise(total=n())
Model creation
set.seed(123)
modelo_randomforest <- randomForest(LabelName~., data=train_data,na.action = na.omit)
The confusion matrix below clearly shows that we have a problem with the Normal class.
prediction <-predict(modelo_randomforest,test_data)
confusionMatrix(prediction, test_data$LabelName)
Confusion Matrix and Statistics

          Reference
Prediction Botnet Normal
    Botnet   5570    316
    Normal     69    162

               Accuracy : 0.9371
                 95% CI : (0.9307, 0.943)
    No Information Rate : 0.9219
    P-Value [Acc > NIR] : 2.79e-06
                  Kappa : 0.4278
 Mcnemar's Test P-Value : < 2.2e-16

            Sensitivity : 0.9878
            Specificity : 0.3389
         Pos Pred Value : 0.9463
         Neg Pred Value : 0.7013
             Prevalence : 0.9219
         Detection Rate : 0.9106
   Detection Prevalence : 0.9622
      Balanced Accuracy : 0.6633

       'Positive' Class : Botnet
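The per-class statistics can also be pulled out of the caret object programmatically, which makes comparing the models below easier (byClass is part of confusionMatrix's return value):
cm <- confusionMatrix(prediction, test_data$LabelName)
cm$byClass[c("Sensitivity", "Specificity", "Balanced Accuracy")]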
The most important predictors are ft3 (periodicity), ft5 and ft6 (duration), and ft9 (size).
varImpPlot(modelo_randomforest,main = "Importancia de predictores")
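The numbers behind the plot can be listed with randomForest's importance(); since the model was fit without importance = TRUE, the only measure available is MeanDecreaseGini:
imp <- importance(modelo_randomforest)
imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE]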
down_train_data <- downSample(train_data,train_data$LabelName, list=FALSE)
down_train_data %>% group_by(LabelName) %>% summarise(total=n())
down_train_data <- down_train_data %>% select(-Class)
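A note on the two steps above: downSample appends the labels as a new Class column, which is why we drop it. Its yname argument yields the same balanced frame in one step; a sketch of the equivalent call:
#down_train_data <- downSample(x = train_data %>% select(-LabelName),
#                              y = train_data$LabelName,
#                              yname = "LabelName")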
set.seed(123)
modelo_randomforest <- randomForest(LabelName~., data=down_train_data,na.action = na.omit)
We see that the down-sampled model now has serious problems with the Normal class: it labeled 1136 flows as Normal that were actually botnets.
prediction <-predict(modelo_randomforest,test_data)
confusionMatrix(prediction, test_data$LabelName)
Confusion Matrix and Statistics

          Reference
Prediction Botnet Normal
    Botnet   4503     30
    Normal   1136    448

               Accuracy : 0.8094
                 95% CI : (0.7993, 0.8192)
    No Information Rate : 0.9219
    P-Value [Acc > NIR] : 1
                  Kappa : 0.3574
 Mcnemar's Test P-Value : <2e-16

            Sensitivity : 0.7985
            Specificity : 0.9372
         Pos Pred Value : 0.9934
         Neg Pred Value : 0.2828
             Prevalence : 0.9219
         Detection Rate : 0.7361
   Detection Prevalence : 0.7410
      Balanced Accuracy : 0.8679

       'Positive' Class : Botnet
We see that the most important predictors changed: they are now ft7 (size), ft9 (size) and ft0 (periodicity).
varImpPlot(modelo_randomforest,main = "Importancia de predictores")
up_train_data <- upSample(train_data,train_data$LabelName, list=FALSE)
up_train_data %>% group_by(LabelName) %>% summarise(total=n())
up_train_data <- up_train_data %>% select(-Class)
set.seed(123)
modelo_randomforest <- randomForest(LabelName~., data=up_train_data,na.action = na.omit)
We still have problems with the Normal class. There is some improvement, but even though we now train on the same number of Botnet and Normal flows, the model keeps labeling botnets as Normal. Without sampling the model made fewer mistakes on the Botnet class, but only because we were training and testing with mostly Botnet traffic.
prediction <-predict(modelo_randomforest,test_data)
confusionMatrix(prediction, test_data$LabelName)
Confusion Matrix and Statistics

          Reference
Prediction Botnet Normal
    Botnet   4790     79
    Normal    849    399

               Accuracy : 0.8483
                 95% CI : (0.8391, 0.8572)
    No Information Rate : 0.9219
    P-Value [Acc > NIR] : 1
                  Kappa : 0.3938
 Mcnemar's Test P-Value : <2e-16

            Sensitivity : 0.8494
            Specificity : 0.8347
         Pos Pred Value : 0.9838
         Neg Pred Value : 0.3197
             Prevalence : 0.9219
         Detection Rate : 0.7831
   Detection Prevalence : 0.7960
      Balanced Accuracy : 0.8421

       'Positive' Class : Botnet
varImpPlot(modelo_randomforest,main = "Importancia de predictores")