# Drop the `X1` column when it exists. `any_of()` turns this into a no-op
# instead of aborting with "Can't subset columns that don't exist" when the
# column is absent.
ctu19 <- ctu19 %>% select(-any_of("X1"))
Error: Can't subset columns that don't exist.
x Column `X1` doesn't exist.
Sacamos los símbolos de tiempo.
# Remove the time symbols (".", ",", "+", "*" and "0") from every State
# sequence. A single character class deletes exactly the same characters as a
# chain of optional groups, without also matching the empty string at every
# position.
time_simbol_pattern <- "[.,+*0]"
ctu19$State <- str_replace_all(ctu19$State, time_simbol_pattern, "")
Eliminamos los primeros 4 símbolos. Obviamente, las secuencias de 4 símbolos o menos quedan eliminadas.
# Keep only the sequences longer than 4 symbols, then drop the first 4 symbols
# of each one. `substr()` positions are 1-based and inclusive, so the remainder
# starts at position 5 (starting at 4 would remove only 3 symbols).
ctu19 <- ctu19 %>% filter(nchar(State) > 4)
ctu19$State <- substr(ctu19$State, 5, nchar(ctu19$State))
ctu19 %>% head(50)
# Build the symbol-count features for each State sequence.
#
# Adds ten count columns to `dataset`; each one is the number of characters of
# `State` that belong to one symbol group:
#   size:        small_s, medium_s, large_s
#   periodicity: strong_p, weak_p, weak_np, strong_np
#   duration:    short_d, medium_d, long_d
#
# Args:
#   dataset: data frame with a character column `State`.
# Returns:
#   `dataset` with the ten count columns appended (returned visibly).
feature_vector <- function(dataset) {
  dataset %>%
    mutate(
      # Size: the five ranges of each group are disjoint, so one character
      # class counts the same characters as summing the per-range counts.
      small_s = str_count(State, "[a-cA-Cr-tR-T1-3]"),
      medium_s = str_count(State, "[d-fD-Fu-wU-W4-6]"),
      large_s = str_count(State, "[g-iG-Ix-zX-Z7-9]"),
      # Periodicity
      strong_p = str_count(State, "[a-i]"),
      weak_p = str_count(State, "[A-I]"),
      weak_np = str_count(State, "[r-z]"),
      strong_np = str_count(State, "[R-Z]"),
      # Duration: inside [...] a "|" is a literal character, not alternation,
      # so it must not appear in the class or pipe characters get counted too.
      short_d = str_count(State, "[aArR1dDuU4gGxX7]"),
      medium_d = str_count(State, "[bBsS2eEvV5hHyY8]"),
      long_d = str_count(State, "[cCtT3fFwW6iIzZ9]")
    )
}
# Append the symbol-count feature columns and preview the first 50 rows.
ctu19 <- ctu19 %>% feature_vector()
head(ctu19, 50)
Dejamos solo las columnas que nos interesan.
# Drop InitialIp, EndIP, Proto, State and Port, convert the label to a factor,
# and preview the first 50 rows.
ctu19 <- ctu19 %>%
  select(-c(InitialIp, EndIP, Proto, State, Port)) %>%
  mutate(LabelName = as.factor(LabelName))
head(ctu19, 50)
# Train and evaluate a random forest on one random 70/30 split.
#
# Args:
#   ctu19: data frame with a `LabelName` response column (expected to be a
#     factor); every other column is used as a predictor.
# Returns:
#   The `byClass` element of the caret confusion matrix computed on the
#   held-out 30%.
test_model <- function(ctu19) {
  # Row indices of the 70% training partition; the rest is held out.
  in_train <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  training <- ctu19[in_train, ]
  holdout <- ctu19[-in_train, ]

  # The seed is fixed only for the forest; the split above is drawn before it.
  set.seed(123)
  forest <- randomForest(LabelName ~ ., data = training, na.action = na.omit)

  predicted <- predict(forest, holdout)
  confusionMatrix(predicted, holdout$LabelName)$byClass
}
# Run the plain train/test evaluation 10 times (one column per run) and average
# every metric across the runs. Collecting the runs with lapply() avoids
# growing the matrix inside a loop and no longer prints the list that
# `foreach(...) %do%` returns at top level.
ctu19Puro_CM <- do.call(
  cbind,
  lapply(seq_len(10), function(i) test_model(ctu19))
)
promedio_Puro <- rowMeans(ctu19Puro_CM)
promedio_Puro %>% as.data.frame()
# Train a random forest on a down-sampled training set and evaluate it on one
# random 70/30 split.
#
# Args:
#   ctu19: data frame with a `LabelName` response column (expected to be a
#     factor); every other column is used as a predictor.
# Returns:
#   The `byClass` element of the caret confusion matrix computed on the
#   held-out 30% (which is not resampled).
test_model_down <- function(ctu19) {
  # Row indices of the 70% training partition; the rest is held out.
  in_train <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  training <- ctu19[in_train, ]
  holdout <- ctu19[-in_train, ]

  # downSample() appends its own `Class` column; LabelName is already part of
  # the data, so the extra column is dropped again.
  balanced <- downSample(training, training$LabelName, list = FALSE) %>%
    select(-Class)

  # The seed is fixed only for the forest; split and resampling come before it.
  set.seed(123)
  forest <- randomForest(LabelName ~ ., data = balanced, na.action = na.omit)

  predicted <- predict(forest, holdout)
  confusionMatrix(predicted, holdout$LabelName)$byClass
}
# Run the down-sampled evaluation 10 times (one column per run) and average
# every metric across the runs. Collecting the runs with lapply() avoids
# growing the matrix inside a loop and no longer prints the list that
# `foreach(...) %do%` returns at top level.
ctu19Down_CM <- do.call(
  cbind,
  lapply(seq_len(10), function(i) test_model_down(ctu19))
)
promedio_Down <- rowMeans(ctu19Down_CM)
promedio_Down %>% as.data.frame()
# Train a random forest on an up-sampled training set and evaluate it on one
# random 70/30 split.
#
# Args:
#   ctu19: data frame with a `LabelName` response column (expected to be a
#     factor); every other column is used as a predictor.
# Returns:
#   The `byClass` element of the caret confusion matrix computed on the
#   held-out 30% (which is not resampled).
test_model_up <- function(ctu19) {
  # Row indices of the 70% training partition; the rest is held out.
  in_train <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  training <- ctu19[in_train, ]
  holdout <- ctu19[-in_train, ]

  # upSample() appends its own `Class` column; LabelName is already part of
  # the data, so the extra column is dropped again.
  balanced <- upSample(training, training$LabelName, list = FALSE) %>%
    select(-Class)

  # The seed is fixed only for the forest; split and resampling come before it.
  set.seed(123)
  forest <- randomForest(LabelName ~ ., data = balanced, na.action = na.omit)

  predicted <- predict(forest, holdout)
  confusionMatrix(predicted, holdout$LabelName)$byClass
}
# Run the up-sampled evaluation 10 times (one column per run) and average every
# metric across the runs. Collecting the runs with lapply() avoids growing the
# matrix inside a loop and no longer prints the list that `foreach(...) %do%`
# returns at top level.
ctu19Up_CM <- do.call(
  cbind,
  lapply(seq_len(10), function(i) test_model_up(ctu19))
)
promedio_Up <- rowMeans(ctu19Up_CM)
promedio_Up %>% as.data.frame()
# Drop the `X1` column when it exists. `any_of()` turns this into a no-op
# instead of aborting with "Can't subset columns that don't exist" when the
# column is absent. Then preview the first 50 rows.
ctu19 <- ctu19 %>% select(-any_of("X1"))
ctu19 %>% head(50)
Sacamos los símbolos de tiempo.
# Remove the time symbols (".", ",", "+", "*" and "0") from every State
# sequence. A single character class deletes exactly the same characters as a
# chain of optional groups, without also matching the empty string at every
# position.
time_simbol_pattern <- "[.,+*0]"
ctu19$State <- str_replace_all(ctu19$State, time_simbol_pattern, "")
Eliminamos los primeros 4 símbolos. Obviamente, las secuencias de 4 símbolos o menos quedan eliminadas.
# Keep only the sequences longer than 4 symbols, then drop the first 4 symbols
# of each one. `substr()` positions are 1-based and inclusive, so the remainder
# starts at position 5 (starting at 4 would remove only 3 symbols).
ctu19 <- ctu19 %>% filter(nchar(State) > 4)
ctu19$State <- substr(ctu19$State, 5, nchar(ctu19$State))
ctu19 %>% head(50)
# Fixed-length views of the data: for each length n, keep the sequences that
# have at least n symbols and cut them down to their first n symbols.
ctu19_10 <- ctu19 %>%
  filter(nchar(State) >= 10) %>%
  mutate(State = substr(State, 1, 10))
ctu19_10
ctu19_20 <- ctu19 %>%
  filter(nchar(State) >= 20) %>%
  mutate(State = substr(State, 1, 20))
ctu19_40 <- ctu19 %>%
  filter(nchar(State) >= 40) %>%
  mutate(State = substr(State, 1, 40))
ctu19_80 <- ctu19 %>%
  filter(nchar(State) >= 80) %>%
  mutate(State = substr(State, 1, 80))
ctu19_100 <- ctu19 %>%
  filter(nchar(State) >= 100) %>%
  mutate(State = substr(State, 1, 100))
ctu19_200 <- ctu19 %>%
  filter(nchar(State) >= 200) %>%
  mutate(State = substr(State, 1, 200))
ctu19_400 <- ctu19 %>%
  filter(nchar(State) >= 400) %>%
  mutate(State = substr(State, 1, 400))
ctu19_800 <- ctu19 %>%
  filter(nchar(State) >= 800) %>%
  mutate(State = substr(State, 1, 800))
ctu19_1000 <- ctu19 %>%
  filter(nchar(State) >= 1000) %>%
  mutate(State = substr(State, 1, 1000))
# Gather the truncated data sets, shortest to longest, and append the
# symbol-count features to each one.
dataSets <- list(
  ctu19_10, ctu19_20, ctu19_40, ctu19_80, ctu19_100,
  ctu19_200, ctu19_400, ctu19_800, ctu19_1000
)
dataSets <- lapply(dataSets, function(ds) feature_vector(as.data.frame(ds)))
Dejamos solo las columnas que nos interesan.
# Keep only the modelling columns of one data set.
#
# Args:
#   x: data frame containing InitialIp, EndIP, Proto, State, Port and
#     LabelName columns.
# Returns:
#   `x` without the first five of those columns and with `LabelName`
#   converted to a factor.
limpieza <- function(x) {
  x %>%
    select(-c(InitialIp, EndIP, Proto, State, Port)) %>%
    mutate(LabelName = as.factor(LabelName))
}
# Clean every data set, then preview the first 50 rows of each.
dataSets <- lapply(dataSets, limpieza)
lapply(dataSets, function(ds) head(as.data.frame(ds), 50))
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
NA
# For every truncated data set, run the plain evaluation 10 times and bind the
# per-run results side by side (one column per run).
ctu19_puro_DS <- lapply(dataSets, function(x) {
  runs <- lapply(seq_len(10), function(i) test_model(as.data.frame(x)))
  do.call(cbind, runs)
})
# Average each metric across the 10 runs of every data set, then show each
# averaged vector as a data frame.
promedio_Puro_DS <- lapply(ctu19_puro_DS, rowMeans)
lapply(promedio_Puro_DS, function(avg) avg %>% as.data.frame())
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
NA
# For every truncated data set, run the down-sampled evaluation 10 times and
# bind the per-run results side by side (one column per run).
ctu19_Down_DS <- lapply(dataSets, function(x) {
  runs <- lapply(seq_len(10), function(i) test_model_down(as.data.frame(x)))
  do.call(cbind, runs)
})
# Average each metric across the 10 runs of every data set, then show each
# averaged vector as a data frame.
promedio_Down_DS <- lapply(ctu19_Down_DS, rowMeans)
lapply(promedio_Down_DS, function(avg) avg %>% as.data.frame())
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
NA
# For every truncated data set, run the up-sampled evaluation 10 times and bind
# the per-run results side by side (one column per run).
ctu19_Up_DS <- lapply(dataSets, function(x) {
  runs <- lapply(seq_len(10), function(i) test_model_up(as.data.frame(x)))
  do.call(cbind, runs)
})
# Average each metric across the 10 runs of every data set, then show each
# averaged vector as a data frame.
promedio_Up_DS <- lapply(ctu19_Up_DS, rowMeans)
lapply(promedio_Up_DS, function(avg) avg %>% as.data.frame())
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
NA