Primera Parte

Importamos CTU19

ctu19 <- ctu19 %>% select(-X1)
Error: Can't subset columns that don't exist.
x Column `X1` doesn't exist.

Sacamos los simbolos de tiempo.

time_simbol_pattern <- '(\\.)*(\\,)*(\\+)*(\\*)*(0)*'

ctu19$State <- ctu19$State %>% str_replace_all(time_simbol_pattern,"")

Eliminamos los primeros 4 simbolos. Obviamente, las secuencias de solo 4 simbolos son eliminadas.

ctu19 <- ctu19 %>% filter(nchar(State) > 4)
ctu19$State <- ctu19$State %>% substr(4,nchar(ctu19$State))
ctu19 %>% head(50)

Feature Vector

feature_vector <- function(dataset){
  #Size
  dataset <- dataset %>% mutate(small_s = str_count(State, '[a-c]') + str_count(State, '[A-C]') + str_count(State, '[r-t]') + str_count(State, '[R-T]') + str_count(State, '[1-3]'))
  dataset <- dataset %>% mutate(medium_s = str_count(State, '[d-f]') + str_count(State, '[D-F]') + str_count(State, '[u-w]') + str_count(State, '[U-W]') + str_count(State, '[4-6]'))
  dataset <- dataset %>% mutate(large_s = str_count(State, '[g-i]') + str_count(State, '[G-I]') + str_count(State, '[x-z]') + str_count(State, '[X-Z]') + str_count(State, '[7-9]'))
  
  #Periodicity
  dataset <- dataset %>% mutate(strong_p = str_count(State, '[a-i]'))
  dataset <- dataset %>% mutate(weak_p = str_count(State, '[A-I]'))
  dataset <- dataset %>% mutate(weak_np = str_count(State, '[r-z]'))
  dataset <- dataset %>% mutate(strong_np = str_count(State, '[R-Z]'))
  
  #Duration
  dataset <- dataset %>% mutate(short_d = str_count(State, '[a|A|r|R|1|d|D|u|U|4|g|G|x|X|7]'))
  dataset <- dataset %>% mutate(medium_d = str_count(State, '[b|B|s|S|2|e|E|v|V|5|h|H|y|Y|8]'))
  dataset <- dataset %>% mutate(long_d = str_count(State, '[c|C|t|T|3|f|F|w|W|6|i|I|z|Z|9]'))
  
}
ctu19 <- feature_vector(ctu19)
ctu19 %>% head(50)

Dejamos solo las columnas que nos interesan

ctu19 <- ctu19 %>% select(-InitialIp,-EndIP,-Proto,-State,-Port)
ctu19$LabelName <- as.factor(ctu19$LabelName)
ctu19 %>% head(50)

Random Forest

test_model <- function(ctu19){
  #Train 70% y 30%
  train_data_ind <- createDataPartition(ctu19$LabelName, p = 0.7,list = FALSE)
  train_data <- ctu19[train_data_ind,]
  test_data <- ctu19[-train_data_ind,]
  
  set.seed(123)
  modelo_randomforest <- randomForest(LabelName~., data=train_data,na.action = na.omit)
  
  prediction <- predict(modelo_randomforest,test_data)
  conf_MAt <- confusionMatrix(prediction, test_data$LabelName)
  return(conf_MAt$byClass)
}

CTU19 puro

ctu19Puro_CM <- c()
foreach(i = 1:10) %do% {ctu19Puro_CM <- ctu19Puro_CM %>% cbind(test_model(ctu19))}
promedio_Puro <- c()
promedio_Puro <- rowMeans(ctu19Puro_CM)
promedio_Puro %>% as.data.frame()

CTU19 con DownSampling

test_model_down <- function(ctu19){
  #Train 70% y 30%
  train_data_ind <- createDataPartition(ctu19$LabelName, p = 0.7,list = FALSE)
  train_data <- ctu19[train_data_ind,]
  test_data <- ctu19[-train_data_ind,]
  
  down_train_data <- downSample(train_data,train_data$LabelName, list=FALSE)
  down_train_data <- down_train_data %>% select(-Class)
  
  set.seed(123)
  modelo_randomforest <- randomForest(LabelName~., data=down_train_data ,na.action = na.omit)
  
  prediction <- predict(modelo_randomforest,test_data)
  conf_MAt <- confusionMatrix(prediction, test_data$LabelName)
  return(conf_MAt$byClass)
}
ctu19Down_CM <- c()
foreach(i = 1:10) %do% {ctu19Down_CM <- ctu19Down_CM %>% cbind(test_model_down(ctu19))}
promedio_Down <- c()
promedio_Down <- rowMeans(ctu19Down_CM)
promedio_Down %>% as.data.frame()

CTU19 con UpSampling

test_model_up <- function(ctu19){
  #Train 70% y 30%
  train_data_ind <- createDataPartition(ctu19$LabelName, p = 0.7,list = FALSE)
  train_data <- ctu19[train_data_ind,]
  test_data <- ctu19[-train_data_ind,]
  
  up_train_data <- upSample(train_data,train_data$LabelName, list=FALSE)
  up_train_data <- up_train_data %>% select(-Class)
  
  set.seed(123)
  modelo_randomforest <- randomForest(LabelName~., data=up_train_data ,na.action = na.omit)
  
  prediction <- predict(modelo_randomforest,test_data)
  conf_MAt <- confusionMatrix(prediction, test_data$LabelName)
  return(conf_MAt$byClass)
}
ctu19Up_CM <- c()
foreach(i = 1:10) %do% {ctu19Up_CM <- ctu19Up_CM %>% cbind(test_model_up(ctu19))}
promedio_Up <- c()
promedio_Up <- rowMeans(ctu19Up_CM)
promedio_Up %>% as.data.frame() 

Segunda Parte

ctu19 <- ctu19 %>% select(-X1)
ctu19 %>% head(50)

Sacamos los simbolos de tiempo.

time_simbol_pattern <- '(\\.)*(\\,)*(\\+)*(\\*)*(0)*'

ctu19$State <- ctu19$State %>% str_replace_all(time_simbol_pattern,"")

Eliminamos los primeros 4 simbolos. Obviamente, las secuencias de solo 4 simbolos son eliminadas.

ctu19 <- ctu19 %>% filter(nchar(State) > 4)
ctu19$State <- ctu19$State %>% substr(4,nchar(ctu19$State))
ctu19 %>% head(50)

Armamos los dataset de tamaño N

Tamaño 10

ctu19_10 <- ctu19 %>% filter(nchar(State) > 9)
ctu19_10$State <- ctu19_10$State %>% substr(1,10)
ctu19_10

Tamaño 20

ctu19_20 <- ctu19 %>% filter(nchar(State) > 19)
ctu19_20$State <- ctu19_20$State %>% substr(1,20)

Tamaño 40

ctu19_40 <- ctu19 %>% filter(nchar(State) > 39)
ctu19_40$State <- ctu19_40$State %>% substr(1,40)

Tamaño 80

ctu19_80 <- ctu19 %>% filter(nchar(State) > 79)
ctu19_80$State <- ctu19_80$State %>% substr(1,80)

Tamaño 100

ctu19_100 <- ctu19 %>% filter(nchar(State) > 99)
ctu19_100$State <- ctu19_100$State %>% substr(1,100)

Tamaño 200

ctu19_200 <- ctu19 %>% filter(nchar(State) > 199)
ctu19_200$State <- ctu19_200$State %>% substr(1,200)

Tamaño 400

ctu19_400 <- ctu19 %>% filter(nchar(State) > 399)
ctu19_400$State <- ctu19_400$State %>% substr(1,400)

Tamaño 800

ctu19_800 <- ctu19 %>% filter(nchar(State) > 799)
ctu19_800$State <- ctu19_800$State %>% substr(1,800)

Tamaño 1000

ctu19_1000 <- ctu19 %>% filter(nchar(State) > 999)
ctu19_1000$State <- ctu19_1000$State %>% substr(1,1000)

Vector de DataSets

dataSets <- list(ctu19_10,ctu19_20,ctu19_40,ctu19_80,ctu19_100,ctu19_200,ctu19_400,ctu19_800,ctu19_1000)

Feature Vector

dataSets <- lapply(dataSets, function(x) x <- feature_vector(as.data.frame(x)))

Limpieza

Dejamos solo las columnas que nos interesan

limpieza <- function(x){
  x <- x %>% select(-InitialIp,-EndIP,-Proto,-State,-Port)
  x$LabelName <- as.factor(x$LabelName)
  return(x)
}
dataSets <- lapply(dataSets, function(x) x <- limpieza(x))
lapply(dataSets, function(x) as.data.frame(x) %>% head(50))
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]

[[6]]

[[7]]

[[8]]

[[9]]
NA

Random Forest

CTU19 puro

ctu19_puro_DS <- lapply(dataSets, function(x) {
  ctu19Puro_DS <- c()
  foreach(i = 1:10) %do% {ctu19Puro_DS <- ctu19Puro_DS %>% cbind(test_model(as.data.frame(x)))}
  return(ctu19Puro_DS)
})
promedio_Puro_DS <- lapply(ctu19_puro_DS, function(x){
  resp <- c()
  resp <- rowMeans(x)
  return(resp)
})
lapply(promedio_Puro_DS, function(x) x %>% as.data.frame())
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]

[[6]]

[[7]]

[[8]]

[[9]]
NA

CTU19 Down

ctu19_Down_DS <- lapply(dataSets, function(x) {
  resp <- c()
  foreach(i = 1:10) %do% {resp <- resp %>% cbind(test_model_down(as.data.frame(x)))}
  return(resp)
})
promedio_Down_DS <- lapply(ctu19_Down_DS, function(x){
  resp <- c()
  resp <- rowMeans(x)
  return(resp)
})
lapply(promedio_Down_DS, function(x) x %>% as.data.frame())
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]

[[6]]

[[7]]

[[8]]

[[9]]
NA

CTU19 Up

ctu19_Up_DS <- lapply(dataSets, function(x) {
  resp <- c()
  foreach(i = 1:10) %do% {resp <- resp %>% cbind(test_model_up(as.data.frame(x)))}
  return(resp)
})
promedio_Up_DS <- lapply(ctu19_Up_DS, function(x){
  resp <- c()
  resp <- rowMeans(x)
  return(resp)
})
lapply(promedio_Up_DS, function(x) x %>% as.data.frame())
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]

[[6]]

[[7]]

[[8]]

[[9]]
NA
---
title: "Botnets_CTU19_1"
output: html_notebook
---

```{r include=FALSE}
library(dplyr)
library(ggplot2)
library(lattice)
library(tidyr)
library(caret)
library(randomForest)
library(doParallel)
library(stringr)
```

# Primera Parte

```{r include=FALSE}
# NOTE(review): absolute, machine-specific path - as written the notebook can
# only be knitted where this exact file location exists.
ctu19 <- readr::read_delim(
  file = "C:/Users/DuzzLogic/Google Drive/Cosas/LABSIN/Botnets/ctu/ctu19.csv",
  delim = ','
)
```

## Importamos CTU19

```{r}
# Drop the unnamed leading column (presumably a row index written by the CSV
# export). Older readr imports it as `X1`, readr >= 2.0 as `...1`; any_of()
# accepts either and does not fail if the column is already gone, so the
# chunk can be re-run safely.
ctu19 <- ctu19 %>% select(-any_of(c("X1", "...1")))
ctu19 %>% head(50)
```


Sacamos los símbolos de tiempo.

```{r}
# Time symbols to strip from the State string: '.', ',', '+', '*' and '0'.
# A character class removes every occurrence directly; inside brackets these
# characters are literals, so no escaping is needed and the pattern can no
# longer match the empty string.
time_simbol_pattern <- "[.,+*0]+"

ctu19$State <- str_replace_all(ctu19$State, time_simbol_pattern, "")
```


Eliminamos los primeros 4 símbolos. Obviamente, las secuencias de 4 símbolos o menos son eliminadas.

```{r echo=TRUE}
# Discard sequences that would be empty after removing their first four
# symbols, then remove those four symbols. substr() is 1-based and inclusive,
# so the remainder starts at position 5 (starting at 4 would drop only three).
ctu19 <- ctu19 %>%
  filter(nchar(State) > 4) %>%
  mutate(State = substr(State, 5, nchar(State)))
```

```{r}
ctu19 %>% head(50)
```

## Feature Vector

```{r echo=TRUE}
# Build the count-based feature vector from the `State` symbol string.
#
# Each feature is the number of symbols in `State` that belong to one group
# of the encoding. The groups are written as plain character classes: inside
# brackets `|` is a literal character, not an alternation, so it must not be
# used as a separator between the symbols.
#
# Args:
#   dataset: data frame / tibble with a character column `State`.
#
# Returns:
#   `dataset` with ten added count columns: small_s, medium_s, large_s
#   (size); strong_p, weak_p, weak_np, strong_np (periodicity); short_d,
#   medium_d, long_d (duration).
feature_vector <- function(dataset) {
  dataset %>%
    mutate(
      # Size
      small_s = str_count(State, "[a-cA-Cr-tR-T1-3]"),
      medium_s = str_count(State, "[d-fD-Fu-wU-W4-6]"),
      large_s = str_count(State, "[g-iG-Ix-zX-Z7-9]"),
      # Periodicity
      strong_p = str_count(State, "[a-i]"),
      weak_p = str_count(State, "[A-I]"),
      weak_np = str_count(State, "[r-z]"),
      strong_np = str_count(State, "[R-Z]"),
      # Duration
      short_d = str_count(State, "[aArR1dDuU4gGxX7]"),
      medium_d = str_count(State, "[bBsS2eEvV5hHyY8]"),
      long_d = str_count(State, "[cCtT3fFwW6iIzZ9]")
    )
}

```

```{r}
ctu19 <- feature_vector(ctu19)
```

```{r}
ctu19 %>% head(50)
```

Dejamos solo las columnas que nos interesan

```{r}
# Drop the address/protocol/port columns and the raw State string, and encode
# the label as a factor.
ctu19 <- ctu19 %>%
  select(-InitialIp, -EndIP, -Proto, -State, -Port) %>%
  mutate(LabelName = as.factor(LabelName))
head(ctu19, 50)
```

## Random Forest

```{r echo=TRUE}
# Train a random forest on a 70/30 split and report the test-set metrics.
#
# Args:
#   ctu19: data frame with a factor column `LabelName` (the class label);
#          every other column is used as a predictor.
#   seed:  optional RNG seed applied before the split, which makes the whole
#          run (partition and forest) reproducible. The default, NULL, leaves
#          the RNG state untouched.
#
# Returns:
#   The `byClass` element of caret::confusionMatrix() evaluated on the
#   held-out 30%.
test_model <- function(ctu19, seed = NULL) {
  # Do not reseed with a fixed value after the split: that leaves the
  # partition unseeded, gives every repetition the same forest seed and
  # resets the caller's RNG. Seed once in the caller, or pass `seed`.
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # Train 70% / test 30%
  train_idx <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  train_data <- ctu19[train_idx, ]
  test_data <- ctu19[-train_idx, ]

  modelo_randomforest <- randomForest(
    LabelName ~ .,
    data = train_data,
    na.action = na.omit
  )

  prediction <- predict(modelo_randomforest, test_data)
  conf_mat <- confusionMatrix(prediction, test_data$LabelName)
  conf_mat$byClass
}
```


### CTU19 puro

```{r echo=TRUE}
# Seed before the first split so the sequence of partitions is reproducible.
set.seed(123)
# Run the evaluation ten times and bind the results column-wise in one step
# (one column per repetition) instead of growing the matrix inside a loop.
ctu19Puro_CM <- do.call(
  cbind,
  lapply(seq_len(10), function(i) test_model(ctu19))
)
```

```{r}
# Mean of every row across the repetitions (one column per repetition).
# NOTE(review): this is a per-metric mean only if each run returned a vector,
# i.e. LabelName has two classes - confirm.
promedio_Puro <- rowMeans(ctu19Puro_CM)
promedio_Puro %>% as.data.frame()
```

### CTU19 con DownSampling

```{r echo=TRUE}
# Same evaluation as a plain 70/30 random-forest run, but the training part
# is down-sampled with caret::downSample() before fitting.
#
# Args:
#   ctu19: data frame with a factor column `LabelName` (the class label);
#          every other column is used as a predictor.
#   seed:  optional RNG seed applied before the split, which makes the whole
#          run (partition, down-sampling and forest) reproducible. The
#          default, NULL, leaves the RNG state untouched.
#
# Returns:
#   The `byClass` element of caret::confusionMatrix() evaluated on the
#   held-out (not resampled) 30%.
test_model_down <- function(ctu19, seed = NULL) {
  # Do not reseed with a fixed value after the split: that leaves the
  # partition and the down-sampling unseeded, gives every repetition the same
  # forest seed and resets the caller's RNG. Seed once in the caller, or pass
  # `seed`.
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # Train 70% / test 30%
  train_idx <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  train_data <- ctu19[train_idx, ]
  test_data <- ctu19[-train_idx, ]

  # downSample() appends the outcome as `Class`; LabelName is already part of
  # the data passed as `x`, so the extra copy is dropped.
  down_train_data <- downSample(train_data, train_data$LabelName, list = FALSE) %>%
    select(-Class)

  modelo_randomforest <- randomForest(
    LabelName ~ .,
    data = down_train_data,
    na.action = na.omit
  )

  prediction <- predict(modelo_randomforest, test_data)
  conf_mat <- confusionMatrix(prediction, test_data$LabelName)
  conf_mat$byClass
}
```

```{r echo=TRUE}
# Seed before the first split so the sequence of partitions is reproducible.
set.seed(123)
# Run the down-sampled evaluation ten times and bind the results column-wise
# in one step (one column per repetition) instead of growing the matrix
# inside a loop.
ctu19Down_CM <- do.call(
  cbind,
  lapply(seq_len(10), function(i) test_model_down(ctu19))
)
```

```{r}
# Mean of every row across the repetitions (one column per repetition).
promedio_Down <- rowMeans(ctu19Down_CM)
promedio_Down %>% as.data.frame()
```

### CTU19 con UpSampling

```{r echo=TRUE}
# Same evaluation as a plain 70/30 random-forest run, but the training part
# is up-sampled with caret::upSample() before fitting.
#
# Args:
#   ctu19: data frame with a factor column `LabelName` (the class label);
#          every other column is used as a predictor.
#   seed:  optional RNG seed applied before the split, which makes the whole
#          run (partition, up-sampling and forest) reproducible. The default,
#          NULL, leaves the RNG state untouched.
#
# Returns:
#   The `byClass` element of caret::confusionMatrix() evaluated on the
#   held-out (not resampled) 30%.
test_model_up <- function(ctu19, seed = NULL) {
  # Do not reseed with a fixed value after the split: that leaves the
  # partition and the up-sampling unseeded, gives every repetition the same
  # forest seed and resets the caller's RNG. Seed once in the caller, or pass
  # `seed`.
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # Train 70% / test 30%
  train_idx <- createDataPartition(ctu19$LabelName, p = 0.7, list = FALSE)
  train_data <- ctu19[train_idx, ]
  test_data <- ctu19[-train_idx, ]

  # upSample() appends the outcome as `Class`; LabelName is already part of
  # the data passed as `x`, so the extra copy is dropped.
  up_train_data <- upSample(train_data, train_data$LabelName, list = FALSE) %>%
    select(-Class)

  modelo_randomforest <- randomForest(
    LabelName ~ .,
    data = up_train_data,
    na.action = na.omit
  )

  prediction <- predict(modelo_randomforest, test_data)
  conf_mat <- confusionMatrix(prediction, test_data$LabelName)
  conf_mat$byClass
}
```

```{r echo=TRUE}
# Seed before the first split so the sequence of partitions is reproducible.
set.seed(123)
# Run the up-sampled evaluation ten times and bind the results column-wise in
# one step (one column per repetition) instead of growing the matrix inside
# a loop.
ctu19Up_CM <- do.call(
  cbind,
  lapply(seq_len(10), function(i) test_model_up(ctu19))
)
```

```{r}
# Mean of every row across the repetitions (one column per repetition).
promedio_Up <- rowMeans(ctu19Up_CM)
promedio_Up %>% as.data.frame()
```

# Segunda Parte

```{r include=FALSE}
# Reload the raw data: the first part removed the State column, which the
# steps below need again.
# NOTE(review): absolute, machine-specific path - as written the notebook can
# only be knitted where this exact file location exists.
ctu19 <- readr::read_delim(
  file = "C:/Users/DuzzLogic/Google Drive/Cosas/LABSIN/Botnets/ctu/ctu19.csv",
  delim = ','
)
```

```{r}
# Drop the unnamed leading column (presumably a row index written by the CSV
# export). Older readr imports it as `X1`, readr >= 2.0 as `...1`; any_of()
# accepts either and does not fail if the column is already gone, so the
# chunk can be re-run safely.
ctu19 <- ctu19 %>% select(-any_of(c("X1", "...1")))
ctu19 %>% head(50)
```


Sacamos los símbolos de tiempo.

```{r}
# Time symbols to strip from the State string: '.', ',', '+', '*' and '0'.
# A character class removes every occurrence directly; inside brackets these
# characters are literals, so no escaping is needed and the pattern can no
# longer match the empty string.
time_simbol_pattern <- "[.,+*0]+"

ctu19$State <- str_replace_all(ctu19$State, time_simbol_pattern, "")
```


Eliminamos los primeros 4 símbolos. Obviamente, las secuencias de 4 símbolos o menos son eliminadas.

```{r echo=TRUE}
# Discard sequences that would be empty after removing their first four
# symbols, then remove those four symbols. substr() is 1-based and inclusive,
# so the remainder starts at position 5 (starting at 4 would drop only three).
ctu19 <- ctu19 %>%
  filter(nchar(State) > 4) %>%
  mutate(State = substr(State, 5, nchar(State)))
```

```{r}
ctu19 %>% head(50)
```

## Armamos los datasets de tamaño N

### Tamaño 10

```{r}
# Keep sequences with at least 10 symbols, truncated to their first 10.
ctu19_10 <- ctu19 %>%
  filter(nchar(State) >= 10) %>%
  mutate(State = substr(State, 1, 10))
```


### Tamaño 20

```{r}
# Keep sequences with at least 20 symbols, truncated to their first 20.
ctu19_20 <- ctu19 %>%
  filter(nchar(State) >= 20) %>%
  mutate(State = substr(State, 1, 20))
```


### Tamaño 40

```{r}
# Keep sequences with at least 40 symbols, truncated to their first 40.
ctu19_40 <- ctu19 %>%
  filter(nchar(State) >= 40) %>%
  mutate(State = substr(State, 1, 40))
```


### Tamaño 80

```{r}
# Keep sequences with at least 80 symbols, truncated to their first 80.
ctu19_80 <- ctu19 %>%
  filter(nchar(State) >= 80) %>%
  mutate(State = substr(State, 1, 80))
```


### Tamaño 100

```{r}
# Keep sequences with at least 100 symbols, truncated to their first 100.
ctu19_100 <- ctu19 %>%
  filter(nchar(State) >= 100) %>%
  mutate(State = substr(State, 1, 100))
```


### Tamaño 200

```{r}
# Keep sequences with at least 200 symbols, truncated to their first 200.
ctu19_200 <- ctu19 %>%
  filter(nchar(State) >= 200) %>%
  mutate(State = substr(State, 1, 200))
```


### Tamaño 400

```{r}
# Keep sequences with at least 400 symbols, truncated to their first 400.
ctu19_400 <- ctu19 %>%
  filter(nchar(State) >= 400) %>%
  mutate(State = substr(State, 1, 400))
```


### Tamaño 800

```{r}
# Keep sequences with at least 800 symbols, truncated to their first 800.
ctu19_800 <- ctu19 %>%
  filter(nchar(State) >= 800) %>%
  mutate(State = substr(State, 1, 800))
```


### Tamaño 1000

```{r}
# Keep sequences with at least 1000 symbols, truncated to their first 1000.
ctu19_1000 <- ctu19 %>%
  filter(nchar(State) >= 1000) %>%
  mutate(State = substr(State, 1, 1000))
```

## Vector de DataSets

```{r}
# Name each dataset after its sequence length so the per-dataset results
# produced with lapply() can be told apart when printed; positional access
# (dataSets[[1]], ...) keeps working.
dataSets <- list(
  n_10 = ctu19_10,
  n_20 = ctu19_20,
  n_40 = ctu19_40,
  n_80 = ctu19_80,
  n_100 = ctu19_100,
  n_200 = ctu19_200,
  n_400 = ctu19_400,
  n_800 = ctu19_800,
  n_1000 = ctu19_1000
)
```


## Feature Vector

```{r}
# Convert every dataset to a plain data frame and add the count features.
dataSets <- dataSets %>%
  lapply(as.data.frame) %>%
  lapply(feature_vector)
```


## Limpieza

Dejamos solo las columnas que nos interesan

```{r}
# Drop the address/protocol/port columns and the raw State string, and encode
# `LabelName` as a factor. Returns the cleaned data frame.
limpieza <- function(x) {
  x %>%
    select(-InitialIp, -EndIP, -Proto, -State, -Port) %>%
    mutate(LabelName = as.factor(LabelName))
}
```

```{r}
# Apply the column cleanup to every dataset.
dataSets <- lapply(dataSets, limpieza)
```

```{r}
lapply(dataSets, function(x) as.data.frame(x) %>% head(50))
```


## Random Forest

### CTU19 puro

```{r}
# Seed before the first split so the sequence of partitions is reproducible.
set.seed(123)
# For every dataset, run the evaluation ten times and bind the results
# column-wise in one step (one column per repetition) instead of growing the
# matrix inside a loop.
ctu19_puro_DS <- lapply(dataSets, function(x) {
  runs <- lapply(seq_len(10), function(i) test_model(as.data.frame(x)))
  do.call(cbind, runs)
})
```

```{r}
# Row means of each dataset's result matrix (one column per repetition).
# NOTE(review): this is a per-metric mean only if each run returned a vector,
# i.e. LabelName has two classes - confirm.
promedio_Puro_DS <- lapply(ctu19_puro_DS, rowMeans)
```

```{r}
lapply(promedio_Puro_DS, function(x) x %>% as.data.frame())
```


### CTU19 Down
```{r}
# Seed before the first split so the sequence of partitions is reproducible.
set.seed(123)
# For every dataset, run the down-sampled evaluation ten times and bind the
# results column-wise in one step (one column per repetition) instead of
# growing the matrix inside a loop.
ctu19_Down_DS <- lapply(dataSets, function(x) {
  runs <- lapply(seq_len(10), function(i) test_model_down(as.data.frame(x)))
  do.call(cbind, runs)
})
```

```{r}
# Row means of each dataset's result matrix (one column per repetition).
promedio_Down_DS <- lapply(ctu19_Down_DS, rowMeans)
```

```{r}
lapply(promedio_Down_DS, function(x) x %>% as.data.frame())
```


### CTU19 Up

```{r}
# Seed before the first split so the sequence of partitions is reproducible.
set.seed(123)
# For every dataset, run the up-sampled evaluation ten times and bind the
# results column-wise in one step (one column per repetition) instead of
# growing the matrix inside a loop.
ctu19_Up_DS <- lapply(dataSets, function(x) {
  runs <- lapply(seq_len(10), function(i) test_model_up(as.data.frame(x)))
  do.call(cbind, runs)
})
```

```{r}
# Row means of each dataset's result matrix (one column per repetition).
promedio_Up_DS <- lapply(ctu19_Up_DS, rowMeans)
```

```{r}
lapply(promedio_Up_DS, function(x) x %>% as.data.frame())
```

