La finalidad de este proyecto es comparar los distintos algoritmos y como predicen la afección de cotonet en kaki.

1. Carga de Datos

Se cargan los 4 años de estudio en distintos objetos

library(readxl)
kaki_19 <- read_excel("C:/Users/enric/Escritorio/BIG DATA/Trabajo/Bandas_kaki_10m_19_20_21_22.xlsx", sheet = "2019")

kaki_20 <- read_excel("C:/Users/enric/Escritorio/BIG DATA/Trabajo/Bandas_kaki_10m_19_20_21_22.xlsx", sheet = "2020")

kaki_21 <- read_excel("C:/Users/enric/Escritorio/BIG DATA/Trabajo/Bandas_kaki_10m_19_20_21_22.xlsx", sheet = "2021")

kaki_22 <- read_excel("C:/Users/enric/Escritorio/BIG DATA/Trabajo/Bandas_kaki_10m_19_20_21_22.xlsx", sheet = "2022")

2. Filtrado de los datos

Se procede a eliminar y organizar los datos de los data frames para un análisis correcto

2.1. Acotado de las fechas deseadas

Se decide por motivos agronómicos que las fechas de estudio se estableceran entre abril y septiembre

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(Matrix)
library(scatterplot3d)

dim(kaki_19)
## [1] 3903  163
#AÑO 2019


Split <- str_split(names(kaki_19), "_",simplify = FALSE)

Fecha <- sapply(Split, function(x) ifelse(length(x) > 1, x[[2]], NA))

Lista_19 <- as.list(setNames(Fecha, names(kaki_19)))

kaki_19_1 <- rbind(kaki_19, Lista_19)


kaki_19_Int <- kaki_19_1 %>%
  select(which(kaki_19_1[3904, ] >= 190401 & kaki_19_1[3904, ] <= 190930))


Kaki_19_INTERES <- kaki_19_Int[1:3903,]

kaki_19_Muestreo <- kaki_19_1 [1:3903,1:3]

kaki_19_TOT <- bind_cols( kaki_19_Muestreo,Kaki_19_INTERES)


kaki_19_TOT <- mutate_all(kaki_19_TOT, as.numeric)

kaki_19_TOT$GC<-as.factor(kaki_19_TOT$GC)
kaki_19_TOT$FieD<-as.factor(kaki_19_TOT$FieD)
kaki_19_TOT$PID<-as.factor(kaki_19_TOT$PID)

dim(kaki_19_TOT)
## [1] 3903   88
#AÑO 2020


Split_20 <- str_split(names(kaki_20), "_",simplify = FALSE)

Fecha_20 <- sapply(Split_20, function(x) ifelse(length(x) > 1, x[[2]], NA))

Lista_20 <- as.list(setNames(Fecha_20, names(kaki_20)))

kaki_20_1 <- rbind(kaki_20, Lista_20)

kaki_20_Int <- kaki_20_1 %>%
  select(which(kaki_20_1[3904, ] >= 200401 & kaki_20_1[3904, ] <= 200930))

Kaki_20_INTERES <- kaki_20_Int[1:3903,]

kaki_20_Muestreo <- kaki_20_1 [1:3903,1:3]

kaki_20_TOT <- bind_cols( kaki_20_Muestreo, Kaki_20_INTERES)

kaki_20_TOT <- mutate_all(kaki_20_TOT, as.numeric)

kaki_20_TOT$GC<-as.factor(kaki_20_TOT$GC)
kaki_20_TOT$FieD<-as.factor(kaki_20_TOT$FieD)
kaki_20_TOT$PID<-as.factor(kaki_20_TOT$PID)


#AÑO 2021


Split_21 <- str_split(names(kaki_21), "_",simplify = FALSE)

Fecha_21 <- sapply(Split_21, function(x) ifelse(length(x) > 1, x[[2]], NA))

Lista_21 <- as.list(setNames(Fecha_21, names(kaki_21)))

kaki_21_1 <- rbind(kaki_21, Lista_21)

kaki_21_Int <- kaki_21_1 %>%
  select(which(kaki_21_1[3904, ] >= 210401 & kaki_21_1[3904, ] <= 210930))

Kaki_21_INTERES <- kaki_21_Int[1:3903,]

kaki_21_Muestreo <- kaki_21_1 [1:3903,1:3]

kaki_21_TOT <- bind_cols( kaki_21_Muestreo, Kaki_21_INTERES)

kaki_21_TOT <- mutate_all(kaki_21_TOT, as.numeric)

kaki_21_TOT$GC<-as.factor(kaki_21_TOT$GC)
kaki_21_TOT$FieD<-as.factor(kaki_21_TOT$FieD)
kaki_21_TOT$PID<-as.factor(kaki_21_TOT$PID)


#AÑO 2022


Split_22 <- str_split(names(kaki_22), "_",simplify = FALSE)

Fecha_22 <- sapply(Split_22, function(x) ifelse(length(x) > 1, x[[2]], NA))

Lista_22 <- as.list(setNames(Fecha_22, names(kaki_22)))

kaki_22_1 <- rbind(kaki_22, Lista_22)

kaki_22_Int <- kaki_22_1 %>%
  select(which(kaki_22_1[3904, ] >= 220401 & kaki_22_1[3904, ] <= 220930))

Kaki_22_INTERES <- kaki_22_Int[1:3903,]

kaki_22_Muestreo <- kaki_22_1 [1:3903,1:3]

kaki_22_TOT <- bind_cols( kaki_22_Muestreo, Kaki_22_INTERES)

kaki_22_TOT <- mutate_all(kaki_22_TOT, as.numeric)

kaki_22_TOT$GC<-as.factor(kaki_22_TOT$GC)
kaki_22_TOT$FieD<-as.factor(kaki_22_TOT$FieD)
kaki_22_TOT$PID<-as.factor(kaki_22_TOT$PID)

2.2. Derretido de la matriz

Se necesita la fecha como variable por lo que debemos derretir la matriz con el comando melt del paquete reshape2

library(reshape2)

#MELT AÑO 2019


kaki_19_B2 <- kaki_19_TOT [,grep("^B02",colnames(kaki_19_TOT))]

kaki_19_B3 <- kaki_19_TOT [,grep("^B03",colnames(kaki_19_TOT))]

kaki_19_B4 <- kaki_19_TOT [,grep("^B04",colnames(kaki_19_TOT))]

kaki_19_B8 <- kaki_19_TOT [,grep("^B08",colnames(kaki_19_TOT))]

kaki_19_NDVI <- kaki_19_TOT [,grep("^NDVI",colnames(kaki_19_TOT))]


kaki_19_B2_2 <- bind_cols(kaki_19_Muestreo,kaki_19_B2)

kaki_19_B3_2 <- bind_cols(kaki_19_Muestreo,kaki_19_B3)

kaki_19_B4_2 <- bind_cols(kaki_19_Muestreo,kaki_19_B4)

kaki_19_B8_2 <- bind_cols(kaki_19_Muestreo,kaki_19_B8)

kaki_19_NDVI_2 <- bind_cols(kaki_19_Muestreo,kaki_19_NDVI)



MeltB2 <- melt(kaki_19_B2_2)
## Using GC, FieD, PID as id variables
colnames(MeltB2)[5] <- "B02"

colnames(MeltB2)[4] <- "Fecha"

MeltB2$Fecha <- sub("B02_", "", MeltB2$Fecha)


MeltB3 <- melt(kaki_19_B3_2)
## Using GC, FieD, PID as id variables
colnames(MeltB3)[5] <- "B03"

colnames(MeltB3)[4] <- "Fecha"

MeltB3$Fecha <- sub("B03_", "", MeltB3$Fecha)


MeltB4 <- melt(kaki_19_B4_2)
## Using GC, FieD, PID as id variables
colnames(MeltB4)[5] <- "B04"

colnames(MeltB4)[4] <- "Fecha"

MeltB4$Fecha <- sub("B04_", "", MeltB4$Fecha)


MeltB8 <- melt(kaki_19_B8_2)
## Using GC, FieD, PID as id variables
colnames(MeltB8)[5] <- "B08"

colnames(MeltB8)[4] <- "Fecha"

MeltB8$Fecha <- sub("B08_", "", MeltB8$Fecha)


MeltNDVI <- melt(kaki_19_NDVI_2)
## Using GC, FieD, PID as id variables
colnames(MeltNDVI)[5] <- "NDVI"

colnames(MeltNDVI)[4] <- "Fecha"

MeltNDVI$Fecha <- sub("NDVI_", "", MeltNDVI$Fecha)


kaki_def19 <- dplyr::bind_cols(MeltB2, MeltB3[, -c(1:4)], MeltB4[, -c(1:4)], MeltB8[, -c(1:4)], MeltNDVI[, -c(1:4)])
## New names:
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
colnames(kaki_def19)[6] <- "B03"

colnames(kaki_def19)[7] <- "B04"

colnames(kaki_def19)[8] <- "B08"

colnames(kaki_def19)[9] <- "NDVI"

kaki_def19$GC<-as.factor(kaki_def19$GC)
kaki_def19$FieD<-as.factor(kaki_def19$FieD)
kaki_def19$PID<-as.factor(kaki_def19$PID)

dim(kaki_def19)
## [1] 66351     9
#MELT AÑO 2020


kaki_20_B2 <- kaki_20_TOT [,grep("^B02",colnames(kaki_20_TOT))]

kaki_20_B3 <- kaki_20_TOT [,grep("^B03",colnames(kaki_20_TOT))]

kaki_20_B4 <- kaki_20_TOT [,grep("^B04",colnames(kaki_20_TOT))]

kaki_20_B8 <- kaki_20_TOT [,grep("^B08",colnames(kaki_20_TOT))]

kaki_20_NDVI <- kaki_20_TOT [,grep("^NDVI",colnames(kaki_20_TOT))]


kaki_20_B2_2 <- bind_cols(kaki_20_Muestreo,kaki_20_B2)

kaki_20_B3_2 <- bind_cols(kaki_20_Muestreo,kaki_20_B3)

kaki_20_B4_2 <- bind_cols(kaki_20_Muestreo,kaki_20_B4)

kaki_20_B8_2 <- bind_cols(kaki_20_Muestreo,kaki_20_B8)

kaki_20_NDVI_2 <- bind_cols(kaki_20_Muestreo,kaki_20_NDVI)




MeltB2_20 <- melt(kaki_20_B2_2,) 
## Using GC, FieD, PID as id variables
colnames(MeltB2_20)[5] <- "B02"

colnames(MeltB2_20)[4] <- "Fecha"

MeltB2_20$Fecha <- sub("B02_", "", MeltB2_20$Fecha)


MeltB3_20 <- melt(kaki_20_B3_2)
## Using GC, FieD, PID as id variables
colnames(MeltB3_20)[5] <- "B03"

colnames(MeltB3_20)[4] <- "Fecha"

MeltB3_20$Fecha <- sub("B03_", "", MeltB3_20$Fecha)


MeltB4_20 <- melt(kaki_20_B4_2)
## Using GC, FieD, PID as id variables
colnames(MeltB4_20)[5] <- "B04"

colnames(MeltB4_20)[4] <- "Fecha"

MeltB4_20$Fecha <- sub("B04_", "", MeltB4_20$Fecha)


MeltB8_20 <- melt(kaki_20_B8_2)
## Using GC, FieD, PID as id variables
colnames(MeltB8_20)[5] <- "B08"

colnames(MeltB8_20)[4] <- "Fecha"

MeltB8_20$Fecha <- sub("B08_", "", MeltB8_20$Fecha)


MeltNDVI_20 <- melt(kaki_20_NDVI_2)
## Using GC, FieD, PID as id variables
colnames(MeltNDVI_20)[5] <- "NDVI"

colnames(MeltNDVI_20)[4] <- "Fecha"

MeltNDVI_20$Fecha <- sub("NDVI_", "", MeltNDVI_20$Fecha)


kaki_def20 <- dplyr::bind_cols(MeltB2_20, MeltB3_20[, -c(1:4)], MeltB4_20[, -c(1:4)], MeltB8_20[, -c(1:4)], MeltNDVI_20[, -c(1:4)])
## New names:
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
colnames(kaki_def20)[6] <- "B03"

colnames(kaki_def20)[7] <- "B04"

colnames(kaki_def20)[8] <- "B08"

colnames(kaki_def20)[9] <- "NDVI"

kaki_def20$GC<-as.factor(kaki_def20$GC)
kaki_def20$FieD<-as.factor(kaki_def20$FieD)
kaki_def20$PID<-as.factor(kaki_def20$PID)


#MELT AÑO 2021


kaki_21_B2 <- kaki_21_TOT [,grep("^B02",colnames(kaki_21_TOT))]

kaki_21_B3 <- kaki_21_TOT [,grep("^B03",colnames(kaki_21_TOT))]

kaki_21_B4 <- kaki_21_TOT [,grep("^B04",colnames(kaki_21_TOT))]

kaki_21_B8 <- kaki_21_TOT [,grep("^B08",colnames(kaki_21_TOT))]

kaki_21_NDVI <- kaki_21_TOT [,grep("^NDVI",colnames(kaki_21_TOT))]


kaki_21_B2_2 <- bind_cols(kaki_21_Muestreo,kaki_21_B2)

kaki_21_B3_2 <- bind_cols(kaki_21_Muestreo,kaki_21_B3)

kaki_21_B4_2 <- bind_cols(kaki_21_Muestreo,kaki_21_B4)

kaki_21_B8_2 <- bind_cols(kaki_21_Muestreo,kaki_21_B8)

kaki_21_NDVI_2 <- bind_cols(kaki_21_Muestreo,kaki_21_NDVI)


MeltB2_21 <- melt(kaki_21_B2_2)
## Using GC, FieD, PID as id variables
colnames(MeltB2_21)[5] <- "B02"

colnames(MeltB2_21)[4] <- "Fecha"

MeltB2_21$Fecha <- sub("B02_", "", MeltB2_21$Fecha)


MeltB3_21 <- melt(kaki_21_B3_2)
## Using GC, FieD, PID as id variables
colnames(MeltB3_21)[5] <- "B03"

colnames(MeltB3_21)[4] <- "Fecha"

MeltB3_21$Fecha <- sub("B03_", "", MeltB3_21$Fecha)


MeltB4_21 <- melt(kaki_21_B4_2)
## Using GC, FieD, PID as id variables
colnames(MeltB4_21)[5] <- "B04"

colnames(MeltB4_21)[4] <- "Fecha"

MeltB4_21$Fecha <- sub("B04_", "", MeltB4_21$Fecha)


MeltB8_21 <- melt(kaki_21_B8_2)
## Using GC, FieD, PID as id variables
colnames(MeltB8_21)[5] <- "B08"

colnames(MeltB8_21)[4] <- "Fecha"

MeltB8_21$Fecha <- sub("B08_", "", MeltB8_21$Fecha)


MeltNDVI_21 <- melt(kaki_21_NDVI_2)
## Using GC, FieD, PID as id variables
colnames(MeltNDVI_21)[5] <- "NDVI"

colnames(MeltNDVI_21)[4] <- "Fecha"

MeltNDVI_21$Fecha <- sub("NDVI_", "", MeltNDVI_21$Fecha)


kaki_def21 <- dplyr::bind_cols(MeltB2_21, MeltB3_21[, -c(1:4)], MeltB4_21[, -c(1:4)], MeltB8_21[, -c(1:4)], MeltNDVI_21[, -c(1:4)])
## New names:
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
colnames(kaki_def21)[6] <- "B03"

colnames(kaki_def21)[7] <- "B04"

colnames(kaki_def21)[8] <- "B08"

colnames(kaki_def21)[9] <- "NDVI"

kaki_def21$GC<-as.factor(kaki_def21$GC)
kaki_def21$FieD<-as.factor(kaki_def21$FieD)
kaki_def21$PID<-as.factor(kaki_def21$PID)


#MELT AÑO 2022


kaki_22_B2 <- kaki_22_TOT [,grep("^B02",colnames(kaki_22_TOT))]

kaki_22_B3 <- kaki_22_TOT [,grep("^B03",colnames(kaki_22_TOT))]

kaki_22_B4 <- kaki_22_TOT [,grep("^B04",colnames(kaki_22_TOT))]

kaki_22_B8 <- kaki_22_TOT [,grep("^B08",colnames(kaki_22_TOT))]

kaki_22_NDVI <- kaki_22_TOT [,grep("^NDVI",colnames(kaki_22_TOT))]


kaki_22_B2_2 <- bind_cols(kaki_22_Muestreo,kaki_22_B2)

kaki_22_B3_2 <- bind_cols(kaki_22_Muestreo,kaki_22_B3)

kaki_22_B4_2 <- bind_cols(kaki_22_Muestreo,kaki_22_B4)

kaki_22_B8_2 <- bind_cols(kaki_22_Muestreo,kaki_22_B8)

kaki_22_NDVI_2 <- bind_cols(kaki_22_Muestreo,kaki_22_NDVI)


MeltB2_22 <- melt(kaki_22_B2_2)
## Using GC, FieD, PID as id variables
colnames(MeltB2_22)[5] <- "B02"

colnames(MeltB2_22)[4] <- "Fecha"

MeltB2_22$Fecha <- sub("B02_", "", MeltB2_22$Fecha)


MeltB3_22 <- melt(kaki_22_B3_2)
## Using GC, FieD, PID as id variables
colnames(MeltB3_22)[5] <- "B03"

colnames(MeltB3_22)[4] <- "Fecha"

MeltB3_22$Fecha <- sub("B03_", "", MeltB3_22$Fecha)


MeltB4_22 <- melt(kaki_22_B4_2)
## Using GC, FieD, PID as id variables
colnames(MeltB4_22)[5] <- "B04"

colnames(MeltB4_22)[4] <- "Fecha"

MeltB4_22$Fecha <- sub("B04_", "", MeltB4_22$Fecha)


MeltB8_22 <- melt(kaki_22_B8_2)
## Using GC, FieD, PID as id variables
colnames(MeltB8_22)[5] <- "B08"

colnames(MeltB8_22)[4] <- "Fecha"

MeltB8_22$Fecha <- sub("B08_", "", MeltB8_22$Fecha)


MeltNDVI_22 <- melt(kaki_22_NDVI_2)
## Using GC, FieD, PID as id variables
colnames(MeltNDVI_22)[5] <- "NDVI"

colnames(MeltNDVI_22)[4] <- "Fecha"

MeltNDVI_22$Fecha <- sub("NDVI_", "", MeltNDVI_22$Fecha)


kaki_def22 <- dplyr::bind_cols(MeltB2_22, MeltB3_22[, -c(1:4)], MeltB4_22[, -c(1:4)], MeltB8_22[, -c(1:4)], MeltNDVI_22[, -c(1:4)])
## New names:
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
colnames(kaki_def22)[6] <- "B03"

colnames(kaki_def22)[7] <- "B04"

colnames(kaki_def22)[8] <- "B08"

colnames(kaki_def22)[9] <- "NDVI"

kaki_def22$GC<-as.factor(kaki_def22$GC)
kaki_def22$FieD<-as.factor(kaki_def22$FieD)
kaki_def22$PID<-as.factor(kaki_def22$PID)

2.3. Formación final de la base

En este paso se obtiene la base final con todos los años y cada observación con el mes

kakiDEF <- rbind(kaki_def19, kaki_def20, kaki_def21, kaki_def22)

kakiDEF$MES <- str_sub(kakiDEF$Fecha, start = 3, end = 4)

kakiDEF$MES<-as.factor(kakiDEF$MES)
kakiDEF$PID<-as.character(kakiDEF$PID)
kakiDEF$FieD<-as.character(kakiDEF$FieD)

kakiDEF_2 <- kakiDEF[,-c(2,3)]
str(kakiDEF_2)
## 'data.frame':    171732 obs. of  8 variables:
##  $ GC   : Factor w/ 2 levels "1","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Fecha: chr  "190411" "190411" "190411" "190411" ...
##  $ B02  : num  655 658 630 640 590 616 727 712 680 648 ...
##  $ B03  : num  1062 1050 995 1024 991 ...
##  $ B04  : num  1288 1324 1276 1266 1120 ...
##  $ B08  : num  3130 3024 2958 2890 2996 ...
##  $ NDVI : num  0.417 0.391 0.397 0.391 0.456 ...
##  $ MES  : Factor w/ 6 levels "04","05","06",..: 1 1 1 1 1 1 1 1 1 1 ...
table(kakiDEF$GC, kakiDEF$MES)
##    
##        04    05    06    07    08    09
##   1 11100 13875 22200 36075 27750 11100
##   3  4512  5640  9024 14664 11280  4512

Como en la base kakiDEF hay muchos casos de GC = 1, los algoritmos no van a clasificar bien por lo que debemos balancear la base con los mismos casos de GC por MES

Comprobación y eliminación de datos duplicados

Mes4_1 <- kakiDEF[which(kakiDEF$MES=="04" & kakiDEF$GC == "1"),]
index4 <- sample(1:nrow(Mes4_1), size=4512, replace = F)
Mes4_1Red <- Mes4_1[index4,]


Mes5_1 <- kakiDEF[which(kakiDEF$MES=="05" & kakiDEF$GC == "1"),]
index5 <- as.numeric(sample(1:nrow(Mes5_1), size=5640, replace = F))
Mes5_1Red <- Mes5_1[index5,]


Mes6_1 <- kakiDEF[which(kakiDEF$MES=="06" & kakiDEF$GC == "1"),]
index6 <- as.numeric(sample(1:nrow(Mes6_1), size=9024, replace = F))
Mes6_1Red <- Mes6_1[index6,]


Mes7_1 <- kakiDEF[which(kakiDEF$MES=="07" & kakiDEF$GC == "1"),]
index7 <- as.numeric(sample(1:nrow(Mes7_1), size=14664, replace = F))
Mes7_1Red <- Mes7_1[index7,]


Mes8_1 <- kakiDEF[which(kakiDEF$MES=="08" & kakiDEF$GC == "1"),]
index8 <- as.numeric(sample(1:nrow(Mes8_1), size=11280, replace = F))
Mes8_1Red <- Mes8_1[index8,]

Mes9_1 <- kakiDEF[which(kakiDEF$MES=="09" & kakiDEF$GC == "1"),]
index9 <- as.numeric(sample(1:nrow(Mes9_1), size=4512, replace = F))
Mes9_1Red <- Mes9_1[index9,]

gc3 <- kakiDEF[which(kakiDEF$GC == "3"),]

kakiDEFbalanceV2 <- rbind(gc3,Mes4_1Red, Mes5_1Red, Mes6_1Red, Mes7_1Red, Mes8_1Red, Mes9_1Red)

kakiDEFbalance <- kakiDEFbalanceV2[,-c(2,3,4)]

dim(kakiDEFbalance)
## [1] 99264     7
table(kakiDEFbalance$GC, kakiDEFbalance$MES)
##    
##        04    05    06    07    08    09
##   1  4512  5640  9024 14664 11280  4512
##   3  4512  5640  9024 14664 11280  4512
duplicated_rows <- kakiDEFbalance[duplicated(kakiDEFbalance), ]
duplicated_rows_sin_balance <- kakiDEF[duplicated(kakiDEF), ]

3. Clasificación no supervisada

El algoritmo elegido para la clasificación no supervisada es un PCA

3.1. PCA

library(Matrix)
library(scatterplot3d)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(dplyr)
library(factoextra)
library(cluster)
library(simstudy)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ tidyr::expand()       masks Matrix::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ data.table::first()   masks dplyr::first()
## ✖ lubridate::hour()     masks data.table::hour()
## ✖ lubridate::isoweek()  masks data.table::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ data.table::last()    masks dplyr::last()
## ✖ lubridate::mday()     masks data.table::mday()
## ✖ lubridate::minute()   masks data.table::minute()
## ✖ lubridate::month()    masks data.table::month()
## ✖ tidyr::pack()         masks Matrix::pack()
## ✖ lubridate::quarter()  masks data.table::quarter()
## ✖ car::recode()         masks dplyr::recode()
## ✖ lubridate::second()   masks data.table::second()
## ✖ purrr::some()         masks car::some()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ tidyr::unpack()       masks Matrix::unpack()
## ✖ lubridate::wday()     masks data.table::wday()
## ✖ lubridate::week()     masks data.table::week()
## ✖ lubridate::yday()     masks data.table::yday()
## ✖ lubridate::year()     masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:data.table':
## 
##     first, last
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
library(corrr)
library(dbscan)
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(FactoMineR)

A=kakiDEFbalance[,-c(1,7)]
table(kakiDEFbalance$GC)
## 
##     1     3 
## 49632 49632
A <- mutate_all(A, as.numeric)
cor(A)
##             B02        B03        B04       B08       NDVI
## B02   1.0000000  0.9777877  0.9050390 0.4520237 -0.6975540
## B03   0.9777877  1.0000000  0.9384456 0.4989298 -0.7196764
## B04   0.9050390  0.9384456  1.0000000 0.3761287 -0.8651939
## B08   0.4520237  0.4989298  0.3761287 1.0000000  0.1027268
## NDVI -0.6975540 -0.7196764 -0.8651939 0.1027268  1.0000000
duplicated_rows_A <- A[duplicated(A), ]
pca<-prcomp(A,scale=T)  

eigenvalues<-get_eig(pca) 
eigenvalues
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1 3.71114035       74.2228069                    74.22281
## Dim.2 1.11002152       22.2004305                    96.42324
## Dim.3 0.15280323        3.0560647                    99.47930
## Dim.4 0.01484578        0.2969155                    99.77622
## Dim.5 0.01118912        0.2237824                   100.00000
fviz_eig(pca,choice = "variance", addlabels = TRUE)

Autovectores=pca$rotation
head(Autovectores)
##             PC1         PC2        PC3         PC4          PC5
## B02   0.5006346 -0.06309185 -0.6278023 -0.59265380 -0.003191517
## B03   0.5107112 -0.08545273 -0.3096415  0.76964238 -0.208912933
## B04   0.5095898  0.09064235  0.3660000  0.02894989  0.772857129
## B08   0.2323077 -0.83451204  0.4101063 -0.14803723 -0.243968428
## NDVI -0.4181990 -0.53300175 -0.4558983  0.18346379  0.547280974
result.var <- get_pca_var(pca)
Loadings=result.var$coord 
result.var$cor 
##           Dim.1       Dim.2      Dim.3        Dim.4         Dim.5
## B02   0.9644386 -0.06647203 -0.2454082 -0.072210861 -0.0003375943
## B03   0.9838505 -0.09003091 -0.1210390  0.093775723 -0.0220985251
## B04   0.9816900  0.09549857  0.1430696  0.003527349  0.0817517730
## B08   0.4475250 -0.87922152  0.1603108 -0.018037336 -0.0258066475
## NDVI -0.8056320 -0.56155763 -0.1782109  0.022353823  0.0578906350
fviz_pca_var(pca,geom=c("arrow", "text"),col.var="contrib",gradient.cols=c("#00AFBB","#E7B800","#FC4E07"),repel = TRUE)

Contribuciones=result.var$contrib        
#Visualizar la contribución (%) de cada variable a las componentes obtenidas
a=fviz_contrib(pca, choice = "var", axes = 1, top = 18)+
  theme(axis.text.x = element_text(angle=90))
b=fviz_contrib(pca, choice = "var", axes = 2, top = 18)+
  theme(axis.text.x = element_text(angle=90))

grid.arrange(a,b, ncol=1, top='Contribution of the variables to the first two PCs')

table(kakiDEFbalance$GC)
## 
##     1     3 
## 49632 49632
fviz_pca_ind(pca,geom=c("point"),alpha.ind = 0.1, xlab="PC 1", ylab ="PC 2", col.ind=factor(kakiDEFbalance$GC))

En conclusión, el PCA no nos sirve ya que el estado de GC 1 y 3 se ven influidas por las mismas variables, como se puede observar en el gráfico anterior

4. Algoritmos de clasificación supervisada

4.1. Árbol de clasificación

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(modelr)
library(data.table)
library(dplyr)
library(ggplot2)
library(rpart.plot)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(lattice)
str(kakiDEF)
## 'data.frame':    171732 obs. of  10 variables:
##  $ GC   : Factor w/ 2 levels "1","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ FieD : chr  "55" "55" "55" "55" ...
##  $ PID  : chr  "1239" "1240" "1233" "1234" ...
##  $ Fecha: chr  "190411" "190411" "190411" "190411" ...
##  $ B02  : num  655 658 630 640 590 616 727 712 680 648 ...
##  $ B03  : num  1062 1050 995 1024 991 ...
##  $ B04  : num  1288 1324 1276 1266 1120 ...
##  $ B08  : num  3130 3024 2958 2890 2996 ...
##  $ NDVI : num  0.417 0.391 0.397 0.391 0.456 ...
##  $ MES  : Factor w/ 6 levels "04","05","06",..: 1 1 1 1 1 1 1 1 1 1 ...
set.seed(30)
index <- createDataPartition(kakiDEFbalance$GC, p = 0.7, list = FALSE)
train_kakiDEFbalance <- kakiDEFbalance[index, ]
test_kakiDEFbalance<- kakiDEFbalance[-index, ]
nrow(train_kakiDEFbalance)
## [1] 69486
head(train_kakiDEFbalance)
##      GC B02  B03  B04  B08      NDVI MES
## 2776  3 561 1008 1288 3392 0.4495726  04
## 2777  3 774 1278 1582 3588 0.3880077  04
## 2778  3 741 1220 1406 3544 0.4319192  04
## 2779  3 597 1064 1198 3474 0.4871575  04
## 2780  3 471 1036 1210 3398 0.4748264  04
## 2781  3 499  949 1168 3326 0.4801958  04
nrow(test_kakiDEFbalance)
## [1] 29778

Optimización de CP, grado de complejidad del árbol

CTree <- rpart(formula = GC ~ .,
               data = train_kakiDEFbalance,
               method = "class",
               control = rpart.control(minsplit = 200,
                                       minbucket = 100,
                                       cp = 0.005,
                                       xval = 10),
               parms = list(split = "gini"))
printcp(CTree)
## 
## Classification tree:
## rpart(formula = GC ~ ., data = train_kakiDEFbalance, method = "class", 
##     parms = list(split = "gini"), control = rpart.control(minsplit = 200, 
##         minbucket = 100, cp = 0.005, xval = 10))
## 
## Variables actually used in tree construction:
## [1] B02  B03  B04  B08  MES  NDVI
## 
## Root node error: 34743/69486 = 0.5
## 
## n= 69486 
## 
##          CP nsplit rel error  xerror      xstd
## 1 0.1651268      0   1.00000 1.00662 0.0037935
## 2 0.1044527      1   0.83487 0.83683 0.0037428
## 3 0.0105345      2   0.73042 0.73405 0.0036570
## 4 0.0092393      3   0.71989 0.69623 0.0036143
## 5 0.0056702      8   0.66718 0.68022 0.0035944
## 6 0.0050000      9   0.66151 0.67697 0.0035902
plotcp(CTree)

rpart.plot(CTree) 

CTree_2 <- rpart(formula = GC ~ .,
                 data = train_kakiDEFbalance,
                 method = "class",
                 control = rpart.control(minsplit = 200,
                                         minbucket = 100,
                                         cp = 0.005,
                                         xval = 0),
                 parms = list(split = "gini"))



rpart.plot(CTree_2)

Representación del peso de las variables

varImp(CTree_2)
##        Overall
## B02  4018.3774
## B03  4117.1170
## B04  3649.3177
## B08  3763.1234
## MES   204.6096
## NDVI 2551.6051
str(train_kakiDEFbalance)
## 'data.frame':    69486 obs. of  7 variables:
##  $ GC  : Factor w/ 2 levels "1","3": 2 2 2 2 2 2 2 2 2 2 ...
##  $ B02 : num  561 774 741 597 471 499 453 907 728 898 ...
##  $ B03 : num  1008 1278 1220 1064 1036 ...
##  $ B04 : num  1288 1582 1406 1198 1210 ...
##  $ B08 : num  3392 3588 3544 3474 3398 ...
##  $ NDVI: num  0.45 0.388 0.432 0.487 0.475 ...
##  $ MES : Factor w/ 6 levels "04","05","06",..: 1 1 1 1 1 1 1 1 1 1 ...
classes_train <- predict(CTree_2, train_kakiDEFbalance) 
str(classes_train)
##  num [1:69486, 1:2] 0.367 0.367 0.367 0.367 0.367 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:69486] "2776" "2777" "2778" "2779" ...
##   ..$ : chr [1:2] "1" "3"
classes_train<-as.data.frame(classes_train)
colnames(classes_train) <- c("NOAFECTADO","AFECTADO")


classes_test <- predict(CTree_2, test_kakiDEFbalance) 
str(classes_test)
##  num [1:29778, 1:2] 0.367 0.44 0.44 0.367 0.367 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:29778] "2782" "2785" "2786" "2796" ...
##   ..$ : chr [1:2] "1" "3"
classes_test<-as.data.frame(classes_test)
colnames(classes_test) <- c("NOAFECTADO","AFECTADO")

Resultados

GCpred <- as.factor(ifelse(classes_train$NOAFECTADO >= 0.5, "1", "3"))
confusionMatrix(GCpred, train_kakiDEFbalance$GC)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     1     3
##          1 18242  6482
##          3 16501 28261
##                                           
##                Accuracy : 0.6692          
##                  95% CI : (0.6657, 0.6727)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3385          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.5251          
##             Specificity : 0.8134          
##          Pos Pred Value : 0.7378          
##          Neg Pred Value : 0.6314          
##              Prevalence : 0.5000          
##          Detection Rate : 0.2625          
##    Detection Prevalence : 0.3558          
##       Balanced Accuracy : 0.6692          
##                                           
##        'Positive' Class : 1               
## 
GCpredTEST<- as.factor(ifelse(classes_test$NOAFECTADO >= 0.5, "1", "3"))
confusionMatrix(GCpredTEST, test_kakiDEFbalance$GC)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     1     3
##          1  7851  2748
##          3  7038 12141
##                                          
##                Accuracy : 0.6714         
##                  95% CI : (0.666, 0.6767)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.3427         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.5273         
##             Specificity : 0.8154         
##          Pos Pred Value : 0.7407         
##          Neg Pred Value : 0.6330         
##              Prevalence : 0.5000         
##          Detection Rate : 0.2637         
##    Detection Prevalence : 0.3559         
##       Balanced Accuracy : 0.6714         
##                                          
##        'Positive' Class : 1              
## 

4.2. AdaBoost

library(adabag)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(caret)
library(tree)
library(ada)
library(rpart.plot)
library(rpart.plot)
library(ggplot2)
library(lattice)
library(doParallel)
library(iterators)
library(parallel)
library(rpart)


str(kakiDEF)
## 'data.frame':    171732 obs. of  10 variables:
##  $ GC   : Factor w/ 2 levels "1","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ FieD : chr  "55" "55" "55" "55" ...
##  $ PID  : chr  "1239" "1240" "1233" "1234" ...
##  $ Fecha: chr  "190411" "190411" "190411" "190411" ...
##  $ B02  : num  655 658 630 640 590 616 727 712 680 648 ...
##  $ B03  : num  1062 1050 995 1024 991 ...
##  $ B04  : num  1288 1324 1276 1266 1120 ...
##  $ B08  : num  3130 3024 2958 2890 2996 ...
##  $ NDVI : num  0.417 0.391 0.397 0.391 0.456 ...
##  $ MES  : Factor w/ 6 levels "04","05","06",..: 1 1 1 1 1 1 1 1 1 1 ...
kakiDEFsolo19 <- kakiDEF[substr(kakiDEF$Fecha,1,2)=="19",]

kakiDEFbalance$GC <- as.factor(kakiDEFbalance$GC)
kakiDEFbalance$B02 <- as.numeric(kakiDEFbalance$B02)
kakiDEFbalance$B03 <- as.numeric(kakiDEFbalance$B03)
kakiDEFbalance$B04 <- as.numeric(kakiDEFbalance$B04)
kakiDEFbalance$B08 <- as.numeric(kakiDEFbalance$B08)
kakiDEFbalance$NDVI <- as.numeric(kakiDEFbalance$NDVI)
kakiDEFbalance$MES <- as.factor(kakiDEFbalance$MES)

set.seed(14)
index <- createDataPartition(kakiDEFbalance$GC, p = 0.70, list = FALSE)
train <- kakiDEFbalance[index, ]
test <- kakiDEFbalance[-index, ]
nrow(train)
## [1] 69486
nrow(test)
## [1] 29778
tuneGrid <- expand.grid(
  iter = c(5, 10, 15),  
  maxdepth = c(1, 2),    
  nu = c(0.01, 0.001) )

ctrl <- trainControl(method = "cv", number = 5) # Define las caracteristicasd de la validación cruzada


adaboost_model <- train(GC~ ., data = train, method = "ada", tuneGrid = tuneGrid, trControl = ctrl)
adaboost_model
## Boosted Classification Trees 
## 
## 69486 samples
##     6 predictor
##     2 classes: '1', '3' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 55588, 55588, 55589, 55590, 55589 
## Resampling results across tuning parameters:
## 
##   nu     maxdepth  iter  Accuracy   Kappa    
##   0.001  1          5    0.5808794  0.1617582
##   0.001  1         10    0.5662005  0.1324044
##   0.001  1         15    0.5726911  0.1453820
##   0.001  2          5    0.6355811  0.2711618
##   0.001  2         10    0.6364446  0.2728885
##   0.001  2         15    0.6367180  0.2734355
##   0.010  1          5    0.5673663  0.1347288
##   0.010  1         10    0.5608905  0.1217773
##   0.010  1         15    0.5594081  0.1188123
##   0.010  2          5    0.6356098  0.2712190
##   0.010  2         10    0.6349046  0.2698086
##   0.010  2         15    0.6341994  0.2683981
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were iter = 15, maxdepth = 2 and nu = 0.001.
optimal_params<-adaboost_model[["bestTune"]]
optimal_params # Se extraen los hiperparametros óptimos
##   iter maxdepth    nu
## 6   15        2 0.001

Resultados:

final_adaboost_model <- ada(
  GC~ ., 
  data = train, 
  iter = 5, 
  nu = 0.001, #Learning rate (Cantidad de errores cometidos por el modelo que se deben corregir)
  control = rpart.control(maxdepth = 2))


predictions_train <- predict(final_adaboost_model, newdata = train)
confusion_matrix_train <- confusionMatrix(predictions_train, train$GC)
print(confusion_matrix_train)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     1     3
##          1 12556  3471
##          3 22187 31272
##                                           
##                Accuracy : 0.6307          
##                  95% CI : (0.6271, 0.6343)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2615          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3614          
##             Specificity : 0.9001          
##          Pos Pred Value : 0.7834          
##          Neg Pred Value : 0.5850          
##              Prevalence : 0.5000          
##          Detection Rate : 0.1807          
##    Detection Prevalence : 0.2307          
##       Balanced Accuracy : 0.6307          
##                                           
##        'Positive' Class : 1               
## 
predictions <- predict(final_adaboost_model, newdata = test)
confusion_matrix <- confusionMatrix(predictions, test$GC)
print(confusion_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     1     3
##          1  5512  1500
##          3  9377 13389
##                                           
##                Accuracy : 0.6347          
##                  95% CI : (0.6292, 0.6402)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2695          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3702          
##             Specificity : 0.8993          
##          Pos Pred Value : 0.7861          
##          Neg Pred Value : 0.5881          
##              Prevalence : 0.5000          
##          Detection Rate : 0.1851          
##    Detection Prevalence : 0.2355          
##       Balanced Accuracy : 0.6347          
##                                           
##        'Positive' Class : 1               
## 

4.3. Random Forest

library(randomForest)
library (caret)
library(adabag)
library(tree)


set.seed(33)
index <- createDataPartition(kakiDEFbalance$GC, p = 0.70, list = FALSE)

train <- kakiDEFbalance[index, ]
str(train)
## 'data.frame':    69486 obs. of  7 variables:
##  $ GC  : Factor w/ 2 levels "1","3": 2 2 2 2 2 2 2 2 2 2 ...
##  $ B02 : num  741 597 471 499 464 453 907 846 802 898 ...
##  $ B03 : num  1220 1064 1036 949 925 ...
##  $ B04 : num  1406 1198 1210 1168 1064 ...
##  $ B08 : num  3544 3474 3398 3326 3242 ...
##  $ NDVI: num  0.432 0.487 0.475 0.48 0.506 ...
##  $ MES : Factor w/ 6 levels "04","05","06",..: 1 1 1 1 1 1 1 1 1 1 ...
table(train$GC)
## 
##     1     3 
## 34743 34743
test <- kakiDEFbalance[-index, ]
str(test)
## 'data.frame':    29778 obs. of  7 variables:
##  $ GC  : Factor w/ 2 levels "1","3": 2 2 2 2 2 2 2 2 2 2 ...
##  $ B02 : num  561 774 728 756 613 546 529 425 753 711 ...
##  $ B03 : num  1008 1278 1236 1246 1110 ...
##  $ B04 : num  1288 1582 1460 1858 1516 ...
##  $ B08 : num  3392 3588 3602 3520 3494 ...
##  $ NDVI: num  0.45 0.388 0.423 0.309 0.395 ...
##  $ MES : Factor w/ 6 levels "04","05","06",..: 1 1 1 1 1 1 1 1 1 1 ...
table(test$GC)
## 
##     1     3 
## 14889 14889
nrow(train)
## [1] 69486
rfT <- randomForest(GC ~ ., data = train, ntree=500, norm.votes=FALSE, do.trace=10,importance=TRUE)
## ntree      OOB      1      2
##    10:  28.45% 27.39% 29.50%
##    20:  26.32% 26.46% 26.19%
##    30:  25.28% 26.19% 24.37%
##    40:  24.82% 26.09% 23.54%
##    50:  24.44% 25.94% 22.94%
##    60:  24.27% 25.88% 22.66%
##    70:  24.13% 25.90% 22.36%
##    80:  24.01% 25.84% 22.18%
##    90:  23.92% 25.71% 22.12%
##   100:  23.82% 25.71% 21.94%
##   110:  23.80% 25.73% 21.87%
##   120:  23.76% 25.65% 21.87%
##   130:  23.69% 25.62% 21.76%
##   140:  23.65% 25.64% 21.66%
##   150:  23.57% 25.59% 21.54%
##   160:  23.55% 25.57% 21.52%
##   170:  23.52% 25.56% 21.49%
##   180:  23.47% 25.60% 21.34%
##   190:  23.51% 25.68% 21.33%
##   200:  23.45% 25.66% 21.25%
##   210:  23.45% 25.64% 21.26%
##   220:  23.45% 25.64% 21.26%
##   230:  23.37% 25.64% 21.11%
##   240:  23.37% 25.62% 21.13%
##   250:  23.33% 25.52% 21.13%
##   260:  23.33% 25.53% 21.13%
##   270:  23.32% 25.54% 21.11%
##   280:  23.32% 25.57% 21.07%
##   290:  23.36% 25.58% 21.13%
##   300:  23.34% 25.62% 21.05%
##   310:  23.31% 25.57% 21.06%
##   320:  23.32% 25.61% 21.03%
##   330:  23.29% 25.56% 21.01%
##   340:  23.33% 25.61% 21.06%
##   350:  23.33% 25.62% 21.04%
##   360:  23.32% 25.56% 21.07%
##   370:  23.28% 25.53% 21.03%
##   380:  23.31% 25.53% 21.09%
##   390:  23.25% 25.47% 21.04%
##   400:  23.29% 25.52% 21.06%
##   410:  23.31% 25.53% 21.09%
##   420:  23.28% 25.51% 21.06%
##   430:  23.27% 25.57% 20.96%
##   440:  23.28% 25.52% 21.04%
##   450:  23.26% 25.52% 21.00%
##   460:  23.25% 25.52% 20.99%
##   470:  23.25% 25.48% 21.02%
##   480:  23.26% 25.52% 21.00%
##   490:  23.26% 25.49% 21.02%
##   500:  23.24% 25.52% 20.97%
rfT
## 
## Call:
##  randomForest(formula = GC ~ ., data = train, ntree = 500, norm.votes = FALSE,      do.trace = 10, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 23.24%
## Confusion matrix:
##       1     3 class.error
## 1 25878  8865   0.2551593
## 3  7284 27459   0.2096537
oob.error.data <- data.frame(
  Trees=rep(1:nrow(rfT$err.rate), times=3),
  Type=rep(c("OOB", "1", "3"), each=nrow(rfT$err.rate)),
  Error=c(rfT$err.rate[,"OOB"], 
          rfT$err.rate[,"1"], 
          rfT$err.rate[,"3"]))

ggplot(data=oob.error.data, aes(x=Trees, y=Error)) +
  geom_line(aes(color=Type))+theme_minimal()

Obtención de el número de árboles con el mínimo OOB-error

optimal_ntree <- which.min(rfT$err.rate[, "OOB"])
cat("El numero de arboles optimo es:", optimal_ntree, "\n")
## El numero de arboles optimo es: 462
set.seed(123)
MtryTune<-tuneRF(x=train[,-1],
                 y=train$GC, 
                 mtryStart=3,
                 ntreeTry=optimal_ntree, 
                 stepFactor=2,  
                 improve=0.001,
                 trace=TRUE, 
                 plot=TRUE, 
                 doBest=FALSE)
## mtry = 3  OOB error = 23.27% 
## Searching left ...
## mtry = 2     OOB error = 23.25% 
## 0.0008041569 0.001 
## Searching right ...
## mtry = 6     OOB error = 23.44% 
## -0.007422987 0.001

Mejor valor de mtry

best_mtry <- MtryTune[which.min(MtryTune[,2]),1]
cat("El mejor mtry es:", best_mtry, "\n")
## El mejor mtry es: 2

Resultados:

FinalRF <- randomForest(GC ~ ., 
                         data=train,
                         ntree=optimal_ntree, 
                         importance=TRUE,
                         mtry=best_mtry,
                         norm.votes=FALSE,
                         do.trace=10
                         ) 
## ntree      OOB      1      2
##    10:  28.42% 27.10% 29.74%
##    20:  26.09% 26.32% 25.85%
##    30:  25.23% 26.07% 24.39%
##    40:  24.61% 25.75% 23.47%
##    50:  24.51% 25.86% 23.16%
##    60:  24.29% 25.74% 22.84%
##    70:  24.11% 25.91% 22.30%
##    80:  23.97% 25.83% 22.12%
##    90:  23.85% 25.77% 21.93%
##   100:  23.79% 25.76% 21.81%
##   110:  23.73% 25.72% 21.74%
##   120:  23.68% 25.76% 21.59%
##   130:  23.60% 25.65% 21.54%
##   140:  23.65% 25.67% 21.62%
##   150:  23.66% 25.76% 21.55%
##   160:  23.54% 25.65% 21.43%
##   170:  23.52% 25.65% 21.38%
##   180:  23.50% 25.65% 21.35%
##   190:  23.46% 25.65% 21.26%
##   200:  23.46% 25.69% 21.22%
##   210:  23.42% 25.68% 21.16%
##   220:  23.44% 25.68% 21.19%
##   230:  23.44% 25.69% 21.18%
##   240:  23.39% 25.65% 21.14%
##   250:  23.36% 25.64% 21.07%
##   260:  23.36% 25.71% 21.01%
##   270:  23.31% 25.64% 20.99%
##   280:  23.37% 25.74% 21.01%
##   290:  23.39% 25.76% 21.02%
##   300:  23.36% 25.68% 21.05%
##   310:  23.36% 25.69% 21.04%
##   320:  23.36% 25.72% 20.99%
##   330:  23.38% 25.75% 21.01%
##   340:  23.37% 25.73% 21.02%
##   350:  23.41% 25.75% 21.06%
##   360:  23.37% 25.75% 20.98%
##   370:  23.38% 25.79% 20.97%
##   380:  23.35% 25.75% 20.95%
##   390:  23.32% 25.68% 20.95%
##   400:  23.29% 25.66% 20.92%
##   410:  23.35% 25.75% 20.94%
##   420:  23.30% 25.71% 20.88%
##   430:  23.35% 25.77% 20.93%
##   440:  23.33% 25.78% 20.87%
##   450:  23.31% 25.69% 20.94%
##   460:  23.37% 25.77% 20.98%
classes_test<-predict(FinalRF, test)

confusionMatrix(classes_test, test$GC)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     1     3
##          1 10989  3068
##          3  3900 11821
##                                           
##                Accuracy : 0.766           
##                  95% CI : (0.7612, 0.7708)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.532           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7381          
##             Specificity : 0.7939          
##          Pos Pred Value : 0.7817          
##          Neg Pred Value : 0.7519          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3690          
##    Detection Prevalence : 0.4721          
##       Balanced Accuracy : 0.7660          
##                                           
##        'Positive' Class : 1               
## 

5. Conclusiones

Algoritmo Precisión
Árbol de Clasificación (Train) 0.6749
Árbol de Clasificación (Test) 0.6708
AdaBoost (Train) 0.6451
AdaBoost (Test) 0.6454
Random Forest (Train) 1
Random Forest (Test) 0,7678
¡Muchas gracias por su atención!