# Start a 4-worker cluster and register it as the backend used by %dopar%.
# NOTE(review): no stopCluster(cl) is visible in this section -- confirm the
# cluster is released once the experiments finish.
cl <- makeCluster(spec = 4)
registerDoParallel(cl = cl)

# Packages passed to foreach(.packages = ...) so each worker loads them.
reqpkgs <- c(
  "randomForest",
  "caret",
  "tidyr",
  "dplyr"
)
Primera Parte
Sin normalizar y haciendo la segunda parte con secuencias de N caracteres.
Importamos CTU19
# Drop the X1 column, then preview the first 50 rows.
ctu19 <- select(ctu19, -X1)
head(ctu19, 50)
Sacamos los símbolos de tiempo.
# Regex matching runs of ".", ",", "+", "*" and "0"; every match is removed
# from State.
time_simbol_pattern <- '(\\.)*(\\,)*(\\+)*(\\*)*(0)*'
ctu19$State <- str_replace_all(ctu19$State, time_simbol_pattern, "")
Warning in for (name in names(hooks)) { :
cerrando la conenexion 7 (<-DESKTOP-D5ND1BA:11161) que no esta siendo utilizada
Warning in for (name in names(hooks)) { :
cerrando la conenexion 6 (<-DESKTOP-D5ND1BA:11161) que no esta siendo utilizada
Warning in for (name in names(hooks)) { :
cerrando la conenexion 5 (<-DESKTOP-D5ND1BA:11161) que no esta siendo utilizada
Warning in for (name in names(hooks)) { :
cerrando la conenexion 4 (<-DESKTOP-D5ND1BA:11161) que no esta siendo utilizada
Eliminamos los primeros 4 símbolos. Por lo tanto, las secuencias de 4 símbolos o menos son eliminadas.
# Keep only sequences longer than 4 symbols, then drop their first 4 symbols.
ctu19 <- ctu19 %>% filter(nchar(State) > 4)
# substr() positions are 1-based and inclusive: starting at 4 would keep the
# 4th symbol and remove only three, so the remainder starts at position 5.
ctu19$State <- substr(ctu19$State, 5, nchar(ctu19$State))
ctu19 %>% head(50)
Feature Vector
#' Add symbol-count features computed from the `State` column.
#'
#' Counts how many characters of each `State` string fall into each symbol
#' class and appends the counts as new columns:
#' size (`small_s`, `medium_s`, `large_s`), periodicity (`strong_p`,
#' `weak_p`, `weak_np`, `strong_np`) and duration (`short_d`, `medium_d`,
#' `long_d`).
#'
#' @param dataset A data frame with a `State` column holding symbol strings.
#' @return `dataset` with the ten count columns appended.
feature_vector <- function(dataset) {
  dataset <- dataset %>%
    mutate(
      # Size: the ranges are disjoint, so one character class per feature
      # gives the same total as summing one count per range.
      small_s = str_count(State, "[a-cA-Cr-tR-T1-3]"),
      medium_s = str_count(State, "[d-fD-Fu-wU-W4-6]"),
      large_s = str_count(State, "[g-iG-Ix-zX-Z7-9]"),
      # Periodicity
      strong_p = str_count(State, "[a-i]"),
      weak_p = str_count(State, "[A-I]"),
      weak_np = str_count(State, "[r-z]"),
      strong_np = str_count(State, "[R-Z]"),
      # Duration: inside [...] a "|" is a literal pipe, not alternation, so
      # the classes list the symbols directly to avoid counting "|" itself.
      short_d = str_count(State, "[aArR1dDuU4gGxX7]"),
      medium_d = str_count(State, "[bBsS2eEvV5hHyY8]"),
      long_d = str_count(State, "[cCtT3fFwW6iIzZ9]")
    )
  dataset
}
# Append the count features and preview the first 50 rows.
ctu19 <- ctu19 %>% feature_vector()
head(ctu19, 50)
Dejamos solo las columnas que nos interesan
# Drop the identifier columns and the raw State string, and turn the label
# into a factor.
ctu19 <- ctu19 %>%
  select(-c(InitialIp, EndIP, Proto, State, Port)) %>%
  mutate(LabelName = as.factor(LabelName))
head(ctu19, 50)
Random Forest
#' Train and evaluate a random forest on a single 70/30 split.
#'
#' @param ctu19 Data frame with a factor column `LabelName` plus predictors.
#' @return The `byClass` element of the caret confusion matrix computed on
#'   the held-out 30%.
test_model <- function(ctu19) {
  # The partition is drawn before set.seed(), so it is not fixed by the seed.
  in_train <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  training <- ctu19[in_train, ]
  testing <- ctu19[-in_train, ]

  set.seed(123)
  rf_fit <- randomForest(LabelName ~ ., data = training, na.action = na.omit)
  predicted <- predict(rf_fit, testing)
  confusionMatrix(predicted, testing$LabelName)[["byClass"]]
}
CTU19 puro
# Run the plain model 10 times in parallel; each run contributes its byClass
# result as column(s) of the combined matrix.
ctu19Puro_CM <- foreach::foreach(
  i = 1:10,
  .packages = reqpkgs,
  .combine = cbind
) %dopar% {
  test_model(ctu19)
}

# Per-row mean and standard deviation across the runs.
prom <- rowMeans(ctu19Puro_CM)
standard_deviation <- rowSds(ctu19Puro_CM)
promedio_Puro <- cbind(prom, standard_deviation)
as.data.frame(promedio_Puro)
CTU19 con DownSampling
#' Train on a down-sampled 70% split and evaluate on the untouched 30%.
#'
#' @param ctu19 Data frame with a factor column `LabelName` plus predictors.
#' @return The `byClass` element of the caret confusion matrix computed on
#'   the held-out 30%.
test_model_down <- function(ctu19) {
  # The partition and the down-sampling are drawn before set.seed().
  in_train <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  training <- ctu19[in_train, ]
  testing <- ctu19[-in_train, ]

  # downSample() is given the full training frame (LabelName included), so
  # the extra `Class` column it appends is dropped again.
  balanced <- downSample(training, training$LabelName, list = FALSE) %>%
    select(-Class)

  set.seed(123)
  rf_fit <- randomForest(LabelName ~ ., data = balanced, na.action = na.omit)
  predicted <- predict(rf_fit, testing)
  confusionMatrix(predicted, testing$LabelName)[["byClass"]]
}
# Run the down-sampled model 10 times in parallel and bind the results
# column-wise.
ctu19Down_CM <- foreach::foreach(
  i = 1:10,
  .packages = reqpkgs,
  .combine = cbind
) %dopar% {
  test_model_down(ctu19)
}

# Per-row mean and standard deviation across the runs.
prom <- rowMeans(ctu19Down_CM)
standard_deviation <- rowSds(ctu19Down_CM)
promedio_Down <- cbind(prom, standard_deviation)
as.data.frame(promedio_Down)
CTU19 con UpSampling
#' Train on an up-sampled 70% split and evaluate on the untouched 30%.
#'
#' @param ctu19 Data frame with a factor column `LabelName` plus predictors.
#' @return The `byClass` element of the caret confusion matrix computed on
#'   the held-out 30%.
test_model_up <- function(ctu19) {
  # The partition and the up-sampling are drawn before set.seed().
  in_train <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  training <- ctu19[in_train, ]
  testing <- ctu19[-in_train, ]

  # upSample() is given the full training frame (LabelName included), so
  # the extra `Class` column it appends is dropped again.
  balanced <- upSample(training, training$LabelName, list = FALSE) %>%
    select(-Class)

  set.seed(123)
  rf_fit <- randomForest(LabelName ~ ., data = balanced, na.action = na.omit)
  predicted <- predict(rf_fit, testing)
  confusionMatrix(predicted, testing$LabelName)[["byClass"]]
}
# Run the up-sampled model 10 times in parallel and bind the results
# column-wise.
ctu19Up_CM <- foreach::foreach(
  i = 1:10,
  .packages = reqpkgs,
  .combine = cbind
) %dopar% {
  test_model_up(ctu19)
}

# Per-row mean and standard deviation across the runs.
prom <- rowMeans(ctu19Up_CM)
standard_deviation <- rowSds(ctu19Up_CM)
promedio_Up <- cbind(prom, standard_deviation)
as.data.frame(promedio_Up)
Segunda Parte
# Drop the X1 column, then preview the first 50 rows.
ctu19 <- select(ctu19, -X1)
head(ctu19, 50)
Sacamos los símbolos de tiempo.
# Regex matching runs of ".", ",", "+", "*" and "0"; every match is removed
# from State.
time_simbol_pattern <- '(\\.)*(\\,)*(\\+)*(\\*)*(0)*'
ctu19$State <- str_replace_all(ctu19$State, time_simbol_pattern, "")
Eliminamos los primeros 4 símbolos. Por lo tanto, las secuencias de 4 símbolos o menos son eliminadas.
# Keep only sequences longer than 4 symbols, then drop their first 4 symbols.
ctu19 <- ctu19 %>% filter(nchar(State) > 4)
# substr() positions are 1-based and inclusive: starting at 4 would keep the
# 4th symbol and remove only three, so the remainder starts at position 5.
ctu19$State <- substr(ctu19$State, 5, nchar(ctu19$State))
ctu19 %>% head(50)
Armamos los dataset de tamaño N
Tamaño 10
# Copy of ctu19 whose State keeps at most the first 10 symbols.
ctu19_10 <- ctu19
ctu19_10[["State"]] <- substr(ctu19[["State"]], 1, 10)
Tamaño 20
# Copy of ctu19 whose State keeps at most the first 20 symbols.
ctu19_20 <- ctu19
ctu19_20[["State"]] <- substr(ctu19[["State"]], 1, 20)
Tamaño 40
# Copy of ctu19 whose State keeps at most the first 40 symbols.
ctu19_40 <- ctu19
ctu19_40[["State"]] <- substr(ctu19[["State"]], 1, 40)
Tamaño 80
# Copy of ctu19 whose State keeps at most the first 80 symbols.
ctu19_80 <- ctu19
ctu19_80[["State"]] <- substr(ctu19[["State"]], 1, 80)
Tamaño 100
# Copy of ctu19 whose State keeps at most the first 100 symbols.
ctu19_100 <- ctu19
ctu19_100[["State"]] <- substr(ctu19[["State"]], 1, 100)
Tamaño 200
# Copy of ctu19 whose State keeps at most the first 200 symbols.
ctu19_200 <- ctu19
ctu19_200[["State"]] <- substr(ctu19[["State"]], 1, 200)
Tamaño 400
# Copy of ctu19 whose State keeps at most the first 400 symbols.
ctu19_400 <- ctu19
ctu19_400[["State"]] <- substr(ctu19[["State"]], 1, 400)
Tamaño 800
# Copy of ctu19 whose State keeps at most the first 800 symbols.
ctu19_800 <- ctu19
ctu19_800[["State"]] <- substr(ctu19[["State"]], 1, 800)
Tamaño 1000
# Copy of ctu19 whose State keeps at most the first 1000 symbols.
ctu19_1000 <- ctu19
ctu19_1000[["State"]] <- substr(ctu19[["State"]], 1, 1000)
Vector de DataSets
# Truncated datasets, ordered by increasing sequence length.
dataSets <- list(
  ctu19_10,
  ctu19_20,
  ctu19_40,
  ctu19_80,
  ctu19_100,
  ctu19_200,
  ctu19_400,
  ctu19_800,
  ctu19_1000
)
Feature Vector
# Compute the count features for every truncated dataset.
dataSets <- lapply(dataSets, function(ds) {
  feature_vector(as.data.frame(ds))
})
Limpieza
Dejamos solo las columnas que nos interesan
#' Keep only the modelling columns and make the label a factor.
#'
#' @param x Data frame containing `InitialIp`, `EndIP`, `Proto`, `State`,
#'   `Port` and `LabelName`.
#' @return `x` without the five dropped columns and with `LabelName`
#'   converted to a factor.
limpieza <- function(x) {
  x %>%
    select(-c(InitialIp, EndIP, Proto, State, Port)) %>%
    mutate(LabelName = as.factor(LabelName))
}
# Clean every dataset, then preview the first 50 rows of each.
dataSets <- lapply(dataSets, limpieza)
lapply(dataSets, function(ds) head(as.data.frame(ds), 50))
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
NA
Random Forest
CTU19 puro
# For each truncated dataset, run the plain model 10 times in parallel and
# bind the byClass results column-wise.
ctu19_puro_DS <- lapply(dataSets, function(ds) {
  foreach::foreach(
    i = 1:10,
    .packages = reqpkgs,
    .combine = cbind,
    .export = c("test_model")
  ) %dopar% {
    test_model(as.data.frame(ds))
  }
})

# Per-row mean and standard deviation across the runs of each dataset.
promedio_Puro_DS <- lapply(ctu19_puro_DS, function(runs) {
  prom <- rowMeans(runs)
  standard_deviation <- rowSds(runs)
  cbind(prom, standard_deviation)
})
lapply(promedio_Puro_DS, as.data.frame)
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
NA
CTU19 Down
# For each truncated dataset, run the down-sampled model 10 times in parallel
# and bind the byClass results column-wise.
ctu19_Down_DS <- lapply(dataSets, function(ds) {
  foreach::foreach(
    i = 1:10,
    .packages = reqpkgs,
    .combine = cbind,
    .export = c("test_model_down")
  ) %dopar% {
    test_model_down(as.data.frame(ds))
  }
})

# Per-row mean and standard deviation across the runs of each dataset.
promedio_Down_DS <- lapply(ctu19_Down_DS, function(runs) {
  prom <- rowMeans(runs)
  standard_deviation <- rowSds(runs)
  cbind(prom, standard_deviation)
})
lapply(promedio_Down_DS, as.data.frame)
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
NA
CTU19 Up
# For each truncated dataset, run the up-sampled model 10 times in parallel
# and bind the byClass results column-wise.
ctu19_Up_DS <- lapply(dataSets, function(ds) {
  foreach::foreach(
    i = 1:10,
    .packages = reqpkgs,
    .combine = cbind,
    .export = c("test_model_up")
  ) %dopar% {
    test_model_up(as.data.frame(ds))
  }
})

# Per-row mean and standard deviation across the runs of each dataset.
promedio_Up_DS <- lapply(ctu19_Up_DS, function(runs) {
  prom <- rowMeans(runs)
  standard_deviation <- rowSds(runs)
  cbind(prom, standard_deviation)
})
lapply(promedio_Up_DS, as.data.frame)
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
NA
