cl <- makeCluster(4)
registerDoParallel(cl)
reqpkgs <- c("randomForest","caret","tidyr","dplyr")
Primera Parte
Sin normalizar y haciendo la segunda parte con secuencias de N caracteres.
Importamos CTU19
ctu19 <- ctu19 %>% select(-X1)
ctu19 %>% head(50)
Sacamos los simbolos de tiempo.
time_simbol_pattern <- '(\\.)*(\\,)*(\\+)*(\\*)*(0)*'
ctu19$State <- ctu19$State %>% str_replace_all(time_simbol_pattern,"")
Eliminamos los primeros 4 simbolos. Obviamente, las secuencias de solo 4 simbolos son eliminadas.
# Drop the first four symbols of each sequence; sequences with four symbols or
# fewer are filtered out first because nothing would remain of them.
ctu19 <- ctu19 %>% filter(nchar(State) > 4)
# substr() is 1-based and inclusive, so the remainder starts at position 5
# (starting at 4 removed only three symbols).
ctu19$State <- ctu19$State %>% substr(5, nchar(ctu19$State))
ctu19 %>% head(50)
Feature Vector
#' Build the symbol-class feature vector of every row from its `State` string.
#'
#' Each feature is the number of symbols of `State` that fall in one symbol
#' class divided by the length of `State`.
#'
#' @param dataset Data frame (or tibble) with a character column `State`.
#' @return `dataset` with ten added numeric columns: small_s, medium_s,
#'   large_s, strong_p, weak_p, weak_np, strong_np, short_d, medium_d, long_d.
#'   Rows whose `State` is empty yield NaN (0 / 0).
feature_vector <- function(dataset) {
  # NOTE(review): the classes look like the Stratosphere behavioural letter
  # encoding (size / periodicity / duration) -- confirm against the dataset
  # documentation.

  # Share of the symbols of `state` that match the character class.
  symbol_share <- function(state, char_class) {
    str_count(state, char_class) / nchar(state)
  }

  dataset %>%
    mutate(
      # Size
      small_s = symbol_share(State, "[a-cA-Cr-tR-T1-3]"),
      medium_s = symbol_share(State, "[d-fD-Fu-wU-W4-6]"),
      large_s = symbol_share(State, "[g-iG-Ix-zX-Z7-9]"),
      # Periodicity (digits are not counted in any periodicity class)
      strong_p = symbol_share(State, "[a-i]"),
      weak_p = symbol_share(State, "[A-I]"),
      weak_np = symbol_share(State, "[r-z]"),
      strong_np = symbol_share(State, "[R-Z]"),
      # Duration. No "|" between the symbols: inside a character class it is a
      # literal pipe, not an alternation, and would be counted as a symbol.
      short_d = symbol_share(State, "[aArR1dDuU4gGxX7]"),
      medium_d = symbol_share(State, "[bBsS2eEvV5hHyY8]"),
      long_d = symbol_share(State, "[cCtT3fFwW6iIzZ9]")
    )
}
ctu19 <- feature_vector(ctu19)
ctu19 %>% head(50)
Dejamos solo las columnas que nos interesan
ctu19 <- ctu19 %>% select(-InitialIp,-EndIP,-Proto,-State,-Port)
ctu19$LabelName <- as.factor(ctu19$LabelName)
ctu19 %>% head(50)
Random Forest
#' Fit a random forest on one random 70/30 split and score it.
#'
#' @param ctu19 Data frame with a factor column `LabelName`; every other
#'   column is used as a predictor.
#' @return The `byClass` element of the caret confusion matrix computed on the
#'   held-out 30%.
test_model <- function(ctu19) {
  # createDataPartition() picks 70% of the rows for training.
  in_train <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  training_set <- ctu19[in_train, ]
  holdout_set <- ctu19[-in_train, ]

  # The seed is set after the split, so it only fixes the forest itself.
  set.seed(123)
  forest <- randomForest(
    LabelName ~ .,
    data = training_set,
    na.action = na.omit
  )

  predicted <- predict(forest, holdout_set)
  confusionMatrix(predicted, holdout_set$LabelName)$byClass
}
CTU19 puro
ctu19Puro_CM <- c()
ctu19Puro_CM <- foreach::foreach(i = 1:10,
.packages = reqpkgs,
.combine = cbind
) %dopar% {
test_model(ctu19)
}
promedio_Puro <- c()
prom <- rowMeans(ctu19Puro_CM)
standard_deviation <- rowSds(ctu19Puro_CM)
promedio_Puro <- promedio_Puro %>% cbind(prom) %>% cbind(standard_deviation)
promedio_Puro %>% as.data.frame()
CTU19 con DownSampling
#' Fit a random forest on a down-sampled 70% split and score it on the rest.
#'
#' @param ctu19 Data frame with a factor column `LabelName`; every other
#'   column is used as a predictor.
#' @return The `byClass` element of the caret confusion matrix computed on the
#'   held-out 30% (which is not resampled).
test_model_down <- function(ctu19) {
  in_train <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  training_set <- ctu19[in_train, ]
  holdout_set <- ctu19[-in_train, ]

  # downSample() appends the outcome as a `Class` column; `LabelName` is still
  # part of the predictors passed in, so the duplicate is dropped again.
  balanced_set <- downSample(
    training_set,
    training_set$LabelName,
    list = FALSE
  ) %>%
    select(-Class)

  # The seed is set after splitting and resampling: it only fixes the forest.
  set.seed(123)
  forest <- randomForest(
    LabelName ~ .,
    data = balanced_set,
    na.action = na.omit
  )

  predicted <- predict(forest, holdout_set)
  confusionMatrix(predicted, holdout_set$LabelName)$byClass
}
ctu19Down_CM <- c()
ctu19Down_CM <- foreach::foreach(i = 1:10,
.packages = reqpkgs,
.combine = cbind
) %dopar% {
test_model_down(ctu19)
}
promedio_Down <- c()
prom <- rowMeans(ctu19Down_CM)
standard_deviation <- rowSds(ctu19Down_CM)
promedio_Down <- promedio_Down %>% cbind(prom) %>% cbind(standard_deviation)
promedio_Down %>% as.data.frame()
CTU19 con UpSampling
#' Fit a random forest on an up-sampled 70% split and score it on the rest.
#'
#' @param ctu19 Data frame with a factor column `LabelName`; every other
#'   column is used as a predictor.
#' @return The `byClass` element of the caret confusion matrix computed on the
#'   held-out 30% (which is not resampled).
test_model_up <- function(ctu19) {
  in_train <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  training_set <- ctu19[in_train, ]
  holdout_set <- ctu19[-in_train, ]

  # upSample() appends the outcome as a `Class` column; `LabelName` is still
  # part of the predictors passed in, so the duplicate is dropped again.
  balanced_set <- upSample(
    training_set,
    training_set$LabelName,
    list = FALSE
  ) %>%
    select(-Class)

  # The seed is set after splitting and resampling: it only fixes the forest.
  set.seed(123)
  forest <- randomForest(
    LabelName ~ .,
    data = balanced_set,
    na.action = na.omit
  )

  predicted <- predict(forest, holdout_set)
  confusionMatrix(predicted, holdout_set$LabelName)$byClass
}
ctu19Up_CM <- c()
ctu19Up_CM <- foreach::foreach(i = 1:10,
.packages = reqpkgs,
.combine = cbind
) %dopar% {
test_model_up(ctu19)
}
promedio_Up <- c()
prom <- rowMeans(ctu19Up_CM)
standard_deviation <- rowSds(ctu19Up_CM)
promedio_Up <- promedio_Up %>% cbind(prom) %>% cbind(standard_deviation)
promedio_Up %>% as.data.frame()
Segunda Parte
ctu19 <- ctu19 %>% select(-X1)
ctu19 %>% head(50)
Sacamos los simbolos de tiempo.
time_simbol_pattern <- '(\\.)*(\\,)*(\\+)*(\\*)*(0)*'
ctu19$State <- ctu19$State %>% str_replace_all(time_simbol_pattern,"")
Eliminamos los primeros 4 simbolos. Obviamente, las secuencias de solo 4 simbolos son eliminadas.
# Drop the first four symbols of each sequence; sequences with four symbols or
# fewer are filtered out first because nothing would remain of them.
ctu19 <- ctu19 %>% filter(nchar(State) > 4)
# substr() is 1-based and inclusive, so the remainder starts at position 5
# (starting at 4 removed only three symbols).
ctu19$State <- ctu19$State %>% substr(5, nchar(ctu19$State))
ctu19 %>% head(50)
Armamos los dataset de tamaño N
Tamaño 10
ctu19_10 <- ctu19
ctu19_10$State <- ctu19$State %>% substr(1,10)
Tamaño 20
ctu19_20 <- ctu19
ctu19_20$State <- ctu19$State %>% substr(1,20)
Tamaño 40
ctu19_40 <- ctu19
ctu19_40$State <- ctu19$State %>% substr(1,40)
Tamaño 80
ctu19_80 <- ctu19
ctu19_80$State <- ctu19$State %>% substr(1,80)
Tamaño 100
ctu19_100 <- ctu19
ctu19_100$State <- ctu19$State %>% substr(1,100)
Tamaño 200
ctu19_200 <- ctu19
ctu19_200$State <- ctu19$State %>% substr(1,200)
Tamaño 400
ctu19_400 <- ctu19
ctu19_400$State <- ctu19$State %>% substr(1,400)
Tamaño 800
ctu19_800 <- ctu19
ctu19_800$State <- ctu19$State %>% substr(1,800)
Tamaño 1000
ctu19_1000 <- ctu19
ctu19_1000$State <- ctu19$State %>% substr(1,1000)
Vector de DataSets
dataSets <- list(ctu19_10,ctu19_20,ctu19_40,ctu19_80,ctu19_100,ctu19_200,ctu19_400,ctu19_800,ctu19_1000)
Feature Vector
dataSets <- lapply(dataSets, function(x) x <- feature_vector(as.data.frame(x)))
Limpieza
Dejamos solo las columnas que nos interesan
#' Keep only the modelling columns and turn the label into a factor.
#'
#' @param x Data frame with the columns InitialIp, EndIP, Proto, State, Port
#'   and LabelName.
#' @return `x` without InitialIp, EndIP, Proto, State and Port, and with
#'   `LabelName` converted by as.factor().
limpieza <- function(x) {
  cleaned <- x %>%
    select(-InitialIp, -EndIP, -Proto, -State, -Port)
  cleaned[["LabelName"]] <- as.factor(cleaned[["LabelName"]])
  cleaned
}
dataSets <- lapply(dataSets, function(x) x <- limpieza(x))
lapply(dataSets, function(x) as.data.frame(x) %>% head(50))
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
NA
Random Forest
CTU19 puro
ctu19_puro_DS <- lapply(dataSets, function(x) {
ctu19Puro_DS <- c()
ctu19Puro_DS <- foreach::foreach(i = 1:10,
.packages = reqpkgs,
.combine = cbind,
.export = c("test_model")
) %dopar% {
test_model(as.data.frame(x))
}
return(ctu19Puro_DS)
})
promedio_Puro_DS <- lapply(ctu19_puro_DS, function(x){
resp <- c()
prom <- rowMeans(x)
standard_deviation <- rowSds(x)
resp <- resp %>% cbind(prom) %>% cbind(standard_deviation)
return(resp)
})
lapply(promedio_Puro_DS, function(x) x %>% as.data.frame())
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
NA
CTU19 Down
ctu19_Down_DS <- lapply(dataSets, function(x) {
resp <- c()
resp <- foreach::foreach(i = 1:10,
.packages = reqpkgs,
.combine = cbind,
.export = c("test_model_down")
) %dopar% {
test_model_down(as.data.frame(x))
}
return(resp)
})
promedio_Down_DS <- lapply(ctu19_Down_DS, function(x){
resp <- c()
prom <- rowMeans(x)
standard_deviation <- rowSds(x)
resp <- resp %>% cbind(prom) %>% cbind(standard_deviation)
return(resp)
})
lapply(promedio_Down_DS, function(x) x %>% as.data.frame())
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
NA
CTU19 Up
ctu19_Up_DS <- lapply(dataSets, function(x) {
resp <- c()
resp <- foreach::foreach(i = 1:10,
.packages = reqpkgs,
.combine = cbind,
.export = c("test_model_up")
) %dopar% {
test_model_up(as.data.frame(x))
}
return(resp)
})
promedio_Up_DS <- lapply(ctu19_Up_DS, function(x){
resp <- c()
prom <- rowMeans(x)
standard_deviation <- rowSds(x)
resp <- resp %>% cbind(prom) %>% cbind(standard_deviation)
return(resp)
})
lapply(promedio_Up_DS, function(x) x %>% as.data.frame())
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
NA
---
title: "Botnets_CTU19_N/HN"
output: html_notebook
---

```{r include=FALSE}
library(dplyr)
library(ggplot2)
library(lattice)
library(tidyr)
library(caret)
library(randomForest)
library(doParallel)
library(stringr)
library(matrixStats)
```

```{r}
cl <- makeCluster(4)
registerDoParallel(cl)
```


```{r}
reqpkgs <- c("randomForest","caret","tidyr","dplyr") 
```

# Primera Parte

Sin normalizar y haciendo la segunda parte con secuencias de N caracteres. 

```{r include=FALSE}
ctu19 = readr::read_delim(file="C:/Users/DuzzLogic/Google Drive/Cosas/LABSIN/Botnets/ctu/ctu19.csv",delim = ',')
```

## Importamos CTU19

```{r}
ctu19 <- ctu19 %>% select(-X1)
ctu19 %>% head(50)
```


Sacamos los simbolos de tiempo.

```{r}
time_simbol_pattern <- '(\\.)*(\\,)*(\\+)*(\\*)*(0)*'

ctu19$State <- ctu19$State %>% str_replace_all(time_simbol_pattern,"")
```


Eliminamos los primeros 4 simbolos. Obviamente, las secuencias de solo 4 simbolos son eliminadas.

```{r echo=TRUE}
# Drop the first four symbols of each sequence; sequences with four symbols or
# fewer are filtered out first because nothing would remain of them.
ctu19 <- ctu19 %>% filter(nchar(State) > 4)
# substr() is 1-based and inclusive, so the remainder starts at position 5
# (starting at 4 removed only three symbols).
ctu19$State <- ctu19$State %>% substr(5, nchar(ctu19$State))
```

```{r}
ctu19 %>% head(50)
```

## Feature Vector

```{r echo=TRUE}
#' Build the symbol-class feature vector of every row from its `State` string.
#'
#' Each feature is the number of symbols of `State` that fall in one symbol
#' class divided by the length of `State`.
#'
#' @param dataset Data frame (or tibble) with a character column `State`.
#' @return `dataset` with ten added numeric columns: small_s, medium_s,
#'   large_s, strong_p, weak_p, weak_np, strong_np, short_d, medium_d, long_d.
#'   Rows whose `State` is empty yield NaN (0 / 0).
feature_vector <- function(dataset) {
  # NOTE(review): the classes look like the Stratosphere behavioural letter
  # encoding (size / periodicity / duration) -- confirm against the dataset
  # documentation.

  # Share of the symbols of `state` that match the character class.
  symbol_share <- function(state, char_class) {
    str_count(state, char_class) / nchar(state)
  }

  dataset %>%
    mutate(
      # Size
      small_s = symbol_share(State, "[a-cA-Cr-tR-T1-3]"),
      medium_s = symbol_share(State, "[d-fD-Fu-wU-W4-6]"),
      large_s = symbol_share(State, "[g-iG-Ix-zX-Z7-9]"),
      # Periodicity (digits are not counted in any periodicity class)
      strong_p = symbol_share(State, "[a-i]"),
      weak_p = symbol_share(State, "[A-I]"),
      weak_np = symbol_share(State, "[r-z]"),
      strong_np = symbol_share(State, "[R-Z]"),
      # Duration. No "|" between the symbols: inside a character class it is a
      # literal pipe, not an alternation, and would be counted as a symbol.
      short_d = symbol_share(State, "[aArR1dDuU4gGxX7]"),
      medium_d = symbol_share(State, "[bBsS2eEvV5hHyY8]"),
      long_d = symbol_share(State, "[cCtT3fFwW6iIzZ9]")
    )
}

```

```{r}
ctu19 <- feature_vector(ctu19)
```

```{r}
ctu19 %>% head(50)
```

Dejamos solo las columnas que nos interesan

```{r}
ctu19 <- ctu19 %>% select(-InitialIp,-EndIP,-Proto,-State,-Port)
ctu19$LabelName <- as.factor(ctu19$LabelName)
ctu19 %>% head(50)
```

## Random Forest

```{r echo=TRUE}
#' Fit a random forest on one random 70/30 split and score it.
#'
#' @param ctu19 Data frame with a factor column `LabelName`; every other
#'   column is used as a predictor.
#' @return The `byClass` element of the caret confusion matrix computed on the
#'   held-out 30%.
test_model <- function(ctu19) {
  # createDataPartition() picks 70% of the rows for training.
  in_train <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  training_set <- ctu19[in_train, ]
  holdout_set <- ctu19[-in_train, ]

  # The seed is set after the split, so it only fixes the forest itself.
  set.seed(123)
  forest <- randomForest(
    LabelName ~ .,
    data = training_set,
    na.action = na.omit
  )

  predicted <- predict(forest, holdout_set)
  confusionMatrix(predicted, holdout_set$LabelName)$byClass
}
```


### CTU19 puro

```{r echo=TRUE}
ctu19Puro_CM <- c()
ctu19Puro_CM <- foreach::foreach(i = 1:10,
                 .packages = reqpkgs,
                 .combine = cbind
) %dopar% {
                  test_model(ctu19) 
                 }
```

```{r}
# Row-wise mean and standard deviation of the combined results in
# ctu19Puro_CM, shown side by side as a data frame.
prom <- rowMeans(ctu19Puro_CM)
standard_deviation <- rowSds(ctu19Puro_CM)
promedio_Puro <- cbind(prom, standard_deviation)
as.data.frame(promedio_Puro)
```

### CTU19 con DownSampling

```{r echo=TRUE}
#' Fit a random forest on a down-sampled 70% split and score it on the rest.
#'
#' @param ctu19 Data frame with a factor column `LabelName`; every other
#'   column is used as a predictor.
#' @return The `byClass` element of the caret confusion matrix computed on the
#'   held-out 30% (which is not resampled).
test_model_down <- function(ctu19) {
  in_train <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  training_set <- ctu19[in_train, ]
  holdout_set <- ctu19[-in_train, ]

  # downSample() appends the outcome as a `Class` column; `LabelName` is still
  # part of the predictors passed in, so the duplicate is dropped again.
  balanced_set <- downSample(
    training_set,
    training_set$LabelName,
    list = FALSE
  ) %>%
    select(-Class)

  # The seed is set after splitting and resampling: it only fixes the forest.
  set.seed(123)
  forest <- randomForest(
    LabelName ~ .,
    data = balanced_set,
    na.action = na.omit
  )

  predicted <- predict(forest, holdout_set)
  confusionMatrix(predicted, holdout_set$LabelName)$byClass
}
```

```{r echo=TRUE}
ctu19Down_CM <- c()
ctu19Down_CM <- foreach::foreach(i = 1:10,
                 .packages = reqpkgs,
                 .combine = cbind
) %dopar% {
                  test_model_down(ctu19) 
                 }
```

```{r}
# Row-wise mean and standard deviation of the combined results in
# ctu19Down_CM, shown side by side as a data frame.
prom <- rowMeans(ctu19Down_CM)
standard_deviation <- rowSds(ctu19Down_CM)
promedio_Down <- cbind(prom, standard_deviation)
as.data.frame(promedio_Down)
```

### CTU19 con UpSampling

```{r echo=TRUE}
#' Fit a random forest on an up-sampled 70% split and score it on the rest.
#'
#' @param ctu19 Data frame with a factor column `LabelName`; every other
#'   column is used as a predictor.
#' @return The `byClass` element of the caret confusion matrix computed on the
#'   held-out 30% (which is not resampled).
test_model_up <- function(ctu19) {
  in_train <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  training_set <- ctu19[in_train, ]
  holdout_set <- ctu19[-in_train, ]

  # upSample() appends the outcome as a `Class` column; `LabelName` is still
  # part of the predictors passed in, so the duplicate is dropped again.
  balanced_set <- upSample(
    training_set,
    training_set$LabelName,
    list = FALSE
  ) %>%
    select(-Class)

  # The seed is set after splitting and resampling: it only fixes the forest.
  set.seed(123)
  forest <- randomForest(
    LabelName ~ .,
    data = balanced_set,
    na.action = na.omit
  )

  predicted <- predict(forest, holdout_set)
  confusionMatrix(predicted, holdout_set$LabelName)$byClass
}
```

```{r echo=TRUE}
ctu19Up_CM <- c()
ctu19Up_CM <- foreach::foreach(i = 1:10,
                 .packages = reqpkgs,
                 .combine = cbind
) %dopar% {
                  test_model_up(ctu19) 
                 }
```

```{r}
# Row-wise mean and standard deviation of the combined results in
# ctu19Up_CM, shown side by side as a data frame.
prom <- rowMeans(ctu19Up_CM)
standard_deviation <- rowSds(ctu19Up_CM)
promedio_Up <- cbind(prom, standard_deviation)
as.data.frame(promedio_Up)
```

# Segunda Parte

```{r include=FALSE}
ctu19 = readr::read_delim(file="C:/Users/DuzzLogic/Google Drive/Cosas/LABSIN/Botnets/ctu/ctu19.csv",delim = ',')
```

```{r}
ctu19 <- ctu19 %>% select(-X1)
ctu19 %>% head(50)
```


Sacamos los simbolos de tiempo.

```{r}
time_simbol_pattern <- '(\\.)*(\\,)*(\\+)*(\\*)*(0)*'

ctu19$State <- ctu19$State %>% str_replace_all(time_simbol_pattern,"")
```


Eliminamos los primeros 4 simbolos. Obviamente, las secuencias de solo 4 simbolos son eliminadas.

```{r echo=TRUE}
# Drop the first four symbols of each sequence; sequences with four symbols or
# fewer are filtered out first because nothing would remain of them.
ctu19 <- ctu19 %>% filter(nchar(State) > 4)
# substr() is 1-based and inclusive, so the remainder starts at position 5
# (starting at 4 removed only three symbols).
ctu19$State <- ctu19$State %>% substr(5, nchar(ctu19$State))
```

```{r}
ctu19 %>% head(50)
```

## Armamos los dataset de tamaño N

### Tamaño 10

```{r}
ctu19_10 <- ctu19
ctu19_10$State <- ctu19$State %>% substr(1,10)
```


### Tamaño 20

```{r}
ctu19_20 <- ctu19
ctu19_20$State <- ctu19$State %>% substr(1,20)
```


### Tamaño 40

```{r}
ctu19_40 <- ctu19
ctu19_40$State <- ctu19$State %>% substr(1,40)
```


### Tamaño 80

```{r}
ctu19_80 <- ctu19
ctu19_80$State <- ctu19$State %>% substr(1,80)
```


### Tamaño 100

```{r}
ctu19_100 <- ctu19
ctu19_100$State <- ctu19$State %>% substr(1,100)
```


### Tamaño 200

```{r}
ctu19_200 <- ctu19
ctu19_200$State <- ctu19$State %>% substr(1,200)
```


### Tamaño 400

```{r}
ctu19_400 <- ctu19
ctu19_400$State <- ctu19$State %>% substr(1,400)
```


### Tamaño 800

```{r}
ctu19_800 <- ctu19
ctu19_800$State <- ctu19$State %>% substr(1,800)
```


### Tamaño 1000

```{r}
ctu19_1000 <- ctu19
ctu19_1000$State <- ctu19$State %>% substr(1,1000)
```

## Vector de DataSets

```{r}
dataSets <- list(ctu19_10,ctu19_20,ctu19_40,ctu19_80,ctu19_100,ctu19_200,ctu19_400,ctu19_800,ctu19_1000)
```


## Feature Vector

```{r}
dataSets <- lapply(dataSets, function(x) x <- feature_vector(as.data.frame(x)))
```


## Limpieza

Dejamos solo las columnas que nos interesan

```{r}
#' Keep only the modelling columns and turn the label into a factor.
#'
#' @param x Data frame with the columns InitialIp, EndIP, Proto, State, Port
#'   and LabelName.
#' @return `x` without InitialIp, EndIP, Proto, State and Port, and with
#'   `LabelName` converted by as.factor().
limpieza <- function(x) {
  cleaned <- x %>%
    select(-InitialIp, -EndIP, -Proto, -State, -Port)
  cleaned[["LabelName"]] <- as.factor(cleaned[["LabelName"]])
  cleaned
}
```

```{r}
dataSets <- lapply(dataSets, function(x) x <- limpieza(x))
```

```{r}
lapply(dataSets, function(x) as.data.frame(x) %>% head(50))
```


## Random Forest

### CTU19 puro

```{r}
# For every dataset in dataSets, call test_model() ten times through
# %dopar% and bind the ten results column-wise.
ctu19_puro_DS <- lapply(dataSets, function(ds) {
  foreach::foreach(
    i = 1:10,
    .packages = reqpkgs,
    .combine = cbind,
    .export = c("test_model")
  ) %dopar% {
    test_model(as.data.frame(ds))
  }
})
```

```{r}
# Per dataset: row-wise mean and standard deviation of the combined results,
# as a two-column matrix (prom, standard_deviation).
promedio_Puro_DS <- lapply(ctu19_puro_DS, function(runs) {
  prom <- rowMeans(runs)
  standard_deviation <- rowSds(runs)
  cbind(prom, standard_deviation)
})
```

```{r}
lapply(promedio_Puro_DS, function(x) x %>% as.data.frame())
```


### CTU19 Down
```{r}
# For every dataset in dataSets, call test_model_down() ten times through
# %dopar% and bind the ten results column-wise.
ctu19_Down_DS <- lapply(dataSets, function(ds) {
  foreach::foreach(
    i = 1:10,
    .packages = reqpkgs,
    .combine = cbind,
    .export = c("test_model_down")
  ) %dopar% {
    test_model_down(as.data.frame(ds))
  }
})
```

```{r}
# Per dataset: row-wise mean and standard deviation of the combined results,
# as a two-column matrix (prom, standard_deviation).
promedio_Down_DS <- lapply(ctu19_Down_DS, function(runs) {
  prom <- rowMeans(runs)
  standard_deviation <- rowSds(runs)
  cbind(prom, standard_deviation)
})
```

```{r}
lapply(promedio_Down_DS, function(x) x %>% as.data.frame())
```


### CTU19 Up

```{r}
# For every dataset in dataSets, call test_model_up() ten times through
# %dopar% and bind the ten results column-wise.
ctu19_Up_DS <- lapply(dataSets, function(ds) {
  foreach::foreach(
    i = 1:10,
    .packages = reqpkgs,
    .combine = cbind,
    .export = c("test_model_up")
  ) %dopar% {
    test_model_up(as.data.frame(ds))
  }
})
```

```{r}
# Per dataset: row-wise mean and standard deviation of the combined results,
# as a two-column matrix (prom, standard_deviation).
promedio_Up_DS <- lapply(ctu19_Up_DS, function(runs) {
  prom <- rowMeans(runs)
  standard_deviation <- rowSds(runs)
  cbind(prom, standard_deviation)
})
```

```{r}
lapply(promedio_Up_DS, function(x) x %>% as.data.frame())
```







