The purpose of this project is to compare different algorithms in terms of how well they predict cotonet infestation in persimmon (kaki).
The four study years are loaded into separate objects:
library(readxl)
kaki_19 <- read_excel("C:/Users/enric/Escritorio/BIG DATA/Trabajo/Bandas_kaki_10m_19_20_21_22.xlsx", sheet = "2019")
kaki_20 <- read_excel("C:/Users/enric/Escritorio/BIG DATA/Trabajo/Bandas_kaki_10m_19_20_21_22.xlsx", sheet = "2020")
kaki_21 <- read_excel("C:/Users/enric/Escritorio/BIG DATA/Trabajo/Bandas_kaki_10m_19_20_21_22.xlsx", sheet = "2021")
kaki_22 <- read_excel("C:/Users/enric/Escritorio/BIG DATA/Trabajo/Bandas_kaki_10m_19_20_21_22.xlsx", sheet = "2022")
The data frames are now cleaned and reorganized so that the analysis can proceed correctly.
For agronomic reasons, the study period is restricted to the window between April and September.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(Matrix)
library(scatterplot3d)
dim(kaki_19)
## [1] 3903 163
# YEAR 2019
# Extract the date token (second "_"-separated part) from each column name
Split <- str_split(names(kaki_19), "_", simplify = FALSE)
Fecha <- sapply(Split, function(x) ifelse(length(x) > 1, x[[2]], NA))
Lista_19 <- as.list(setNames(Fecha, names(kaki_19)))
# Append the dates as an extra row (row 3904) so columns can be filtered by date
kaki_19_1 <- rbind(kaki_19, Lista_19)
kaki_19_Int <- kaki_19_1 %>%
  select(which(kaki_19_1[3904, ] >= 190401 & kaki_19_1[3904, ] <= 190930))
Kaki_19_INTERES <- kaki_19_Int[1:3903, ]      # drop the helper date row
kaki_19_Muestreo <- kaki_19_1[1:3903, 1:3]    # sampling columns: GC, FieD, PID
kaki_19_TOT <- bind_cols(kaki_19_Muestreo, Kaki_19_INTERES)
kaki_19_TOT <- mutate_all(kaki_19_TOT, as.numeric)
kaki_19_TOT$GC<-as.factor(kaki_19_TOT$GC)
kaki_19_TOT$FieD<-as.factor(kaki_19_TOT$FieD)
kaki_19_TOT$PID<-as.factor(kaki_19_TOT$PID)
dim(kaki_19_TOT)
## [1] 3903 88
# YEAR 2020 (same steps as 2019)
Split_20 <- str_split(names(kaki_20), "_",simplify = FALSE)
Fecha_20 <- sapply(Split_20, function(x) ifelse(length(x) > 1, x[[2]], NA))
Lista_20 <- as.list(setNames(Fecha_20, names(kaki_20)))
kaki_20_1 <- rbind(kaki_20, Lista_20)
kaki_20_Int <- kaki_20_1 %>%
select(which(kaki_20_1[3904, ] >= 200401 & kaki_20_1[3904, ] <= 200930))
Kaki_20_INTERES <- kaki_20_Int[1:3903,]
kaki_20_Muestreo <- kaki_20_1 [1:3903,1:3]
kaki_20_TOT <- bind_cols( kaki_20_Muestreo, Kaki_20_INTERES)
kaki_20_TOT <- mutate_all(kaki_20_TOT, as.numeric)
kaki_20_TOT$GC<-as.factor(kaki_20_TOT$GC)
kaki_20_TOT$FieD<-as.factor(kaki_20_TOT$FieD)
kaki_20_TOT$PID<-as.factor(kaki_20_TOT$PID)
# YEAR 2021 (same steps as 2019)
Split_21 <- str_split(names(kaki_21), "_",simplify = FALSE)
Fecha_21 <- sapply(Split_21, function(x) ifelse(length(x) > 1, x[[2]], NA))
Lista_21 <- as.list(setNames(Fecha_21, names(kaki_21)))
kaki_21_1 <- rbind(kaki_21, Lista_21)
kaki_21_Int <- kaki_21_1 %>%
select(which(kaki_21_1[3904, ] >= 210401 & kaki_21_1[3904, ] <= 210930))
Kaki_21_INTERES <- kaki_21_Int[1:3903,]
kaki_21_Muestreo <- kaki_21_1 [1:3903,1:3]
kaki_21_TOT <- bind_cols( kaki_21_Muestreo, Kaki_21_INTERES)
kaki_21_TOT <- mutate_all(kaki_21_TOT, as.numeric)
kaki_21_TOT$GC<-as.factor(kaki_21_TOT$GC)
kaki_21_TOT$FieD<-as.factor(kaki_21_TOT$FieD)
kaki_21_TOT$PID<-as.factor(kaki_21_TOT$PID)
# YEAR 2022 (same steps as 2019)
Split_22 <- str_split(names(kaki_22), "_",simplify = FALSE)
Fecha_22 <- sapply(Split_22, function(x) ifelse(length(x) > 1, x[[2]], NA))
Lista_22 <- as.list(setNames(Fecha_22, names(kaki_22)))
kaki_22_1 <- rbind(kaki_22, Lista_22)
kaki_22_Int <- kaki_22_1 %>%
select(which(kaki_22_1[3904, ] >= 220401 & kaki_22_1[3904, ] <= 220930))
Kaki_22_INTERES <- kaki_22_Int[1:3903,]
kaki_22_Muestreo <- kaki_22_1 [1:3903,1:3]
kaki_22_TOT <- bind_cols( kaki_22_Muestreo, Kaki_22_INTERES)
kaki_22_TOT <- mutate_all(kaki_22_TOT, as.numeric)
kaki_22_TOT$GC<-as.factor(kaki_22_TOT$GC)
kaki_22_TOT$FieD<-as.factor(kaki_22_TOT$FieD)
kaki_22_TOT$PID<-as.factor(kaki_22_TOT$PID)
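As an aside, the four per-year blocks above repeat the same pattern, so the date filtering could be wrapped in a reusable function. A minimal sketch, assuming (as above) that value columns embed the date as the second "_"-separated token (e.g. "B02_190411") and that the first three columns are GC, FieD and PID; select_date_window is a hypothetical helper, not part of the original code:
select_date_window <- function(df, from, to) {
  # Date token of each column name; NA where a column name has no "_" part
  dates <- suppressWarnings(as.numeric(sapply(strsplit(names(df), "_"), `[`, 2)))
  # Keep the three sampling columns plus every column dated inside the window
  df[, c(1:3, which(!is.na(dates) & dates >= from & dates <= to))]
}
# kaki_19_TOT_alt <- select_date_window(kaki_19, 190401, 190930)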
The date is needed as a variable, so the wide tables must be melted into long format with melt() from the reshape2 package.
library(reshape2)
# MELT, YEAR 2019
# One wide data frame per band (plus NDVI), each still with one column per date
kaki_19_B2 <- kaki_19_TOT[, grep("^B02", colnames(kaki_19_TOT))]
kaki_19_B3 <- kaki_19_TOT[, grep("^B03", colnames(kaki_19_TOT))]
kaki_19_B4 <- kaki_19_TOT[, grep("^B04", colnames(kaki_19_TOT))]
kaki_19_B8 <- kaki_19_TOT[, grep("^B08", colnames(kaki_19_TOT))]
kaki_19_NDVI <- kaki_19_TOT[, grep("^NDVI", colnames(kaki_19_TOT))]
kaki_19_B2_2 <- bind_cols(kaki_19_Muestreo,kaki_19_B2)
kaki_19_B3_2 <- bind_cols(kaki_19_Muestreo,kaki_19_B3)
kaki_19_B4_2 <- bind_cols(kaki_19_Muestreo,kaki_19_B4)
kaki_19_B8_2 <- bind_cols(kaki_19_Muestreo,kaki_19_B8)
kaki_19_NDVI_2 <- bind_cols(kaki_19_Muestreo,kaki_19_NDVI)
MeltB2 <- melt(kaki_19_B2_2)
## Using GC, FieD, PID as id variables
colnames(MeltB2)[5] <- "B02"
colnames(MeltB2)[4] <- "Fecha"
MeltB2$Fecha <- sub("B02_", "", MeltB2$Fecha)
MeltB3 <- melt(kaki_19_B3_2)
## Using GC, FieD, PID as id variables
colnames(MeltB3)[5] <- "B03"
colnames(MeltB3)[4] <- "Fecha"
MeltB3$Fecha <- sub("B03_", "", MeltB3$Fecha)
MeltB4 <- melt(kaki_19_B4_2)
## Using GC, FieD, PID as id variables
colnames(MeltB4)[5] <- "B04"
colnames(MeltB4)[4] <- "Fecha"
MeltB4$Fecha <- sub("B04_", "", MeltB4$Fecha)
MeltB8 <- melt(kaki_19_B8_2)
## Using GC, FieD, PID as id variables
colnames(MeltB8)[5] <- "B08"
colnames(MeltB8)[4] <- "Fecha"
MeltB8$Fecha <- sub("B08_", "", MeltB8$Fecha)
MeltNDVI <- melt(kaki_19_NDVI_2)
## Using GC, FieD, PID as id variables
colnames(MeltNDVI)[5] <- "NDVI"
colnames(MeltNDVI)[4] <- "Fecha"
MeltNDVI$Fecha <- sub("NDVI_", "", MeltNDVI$Fecha)
kaki_def19 <- dplyr::bind_cols(MeltB2, MeltB3[, -c(1:4)], MeltB4[, -c(1:4)], MeltB8[, -c(1:4)], MeltNDVI[, -c(1:4)])
## New names:
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
colnames(kaki_def19)[6] <- "B03"
colnames(kaki_def19)[7] <- "B04"
colnames(kaki_def19)[8] <- "B08"
colnames(kaki_def19)[9] <- "NDVI"
kaki_def19$GC<-as.factor(kaki_def19$GC)
kaki_def19$FieD<-as.factor(kaki_def19$FieD)
kaki_def19$PID<-as.factor(kaki_def19$PID)
dim(kaki_def19)
## [1] 66351 9
# MELT, YEAR 2020
kaki_20_B2 <- kaki_20_TOT [,grep("^B02",colnames(kaki_20_TOT))]
kaki_20_B3 <- kaki_20_TOT [,grep("^B03",colnames(kaki_20_TOT))]
kaki_20_B4 <- kaki_20_TOT [,grep("^B04",colnames(kaki_20_TOT))]
kaki_20_B8 <- kaki_20_TOT [,grep("^B08",colnames(kaki_20_TOT))]
kaki_20_NDVI <- kaki_20_TOT [,grep("^NDVI",colnames(kaki_20_TOT))]
kaki_20_B2_2 <- bind_cols(kaki_20_Muestreo,kaki_20_B2)
kaki_20_B3_2 <- bind_cols(kaki_20_Muestreo,kaki_20_B3)
kaki_20_B4_2 <- bind_cols(kaki_20_Muestreo,kaki_20_B4)
kaki_20_B8_2 <- bind_cols(kaki_20_Muestreo,kaki_20_B8)
kaki_20_NDVI_2 <- bind_cols(kaki_20_Muestreo,kaki_20_NDVI)
MeltB2_20 <- melt(kaki_20_B2_2)
## Using GC, FieD, PID as id variables
colnames(MeltB2_20)[5] <- "B02"
colnames(MeltB2_20)[4] <- "Fecha"
MeltB2_20$Fecha <- sub("B02_", "", MeltB2_20$Fecha)
MeltB3_20 <- melt(kaki_20_B3_2)
## Using GC, FieD, PID as id variables
colnames(MeltB3_20)[5] <- "B03"
colnames(MeltB3_20)[4] <- "Fecha"
MeltB3_20$Fecha <- sub("B03_", "", MeltB3_20$Fecha)
MeltB4_20 <- melt(kaki_20_B4_2)
## Using GC, FieD, PID as id variables
colnames(MeltB4_20)[5] <- "B04"
colnames(MeltB4_20)[4] <- "Fecha"
MeltB4_20$Fecha <- sub("B04_", "", MeltB4_20$Fecha)
MeltB8_20 <- melt(kaki_20_B8_2)
## Using GC, FieD, PID as id variables
colnames(MeltB8_20)[5] <- "B08"
colnames(MeltB8_20)[4] <- "Fecha"
MeltB8_20$Fecha <- sub("B08_", "", MeltB8_20$Fecha)
MeltNDVI_20 <- melt(kaki_20_NDVI_2)
## Using GC, FieD, PID as id variables
colnames(MeltNDVI_20)[5] <- "NDVI"
colnames(MeltNDVI_20)[4] <- "Fecha"
MeltNDVI_20$Fecha <- sub("NDVI_", "", MeltNDVI_20$Fecha)
kaki_def20 <- dplyr::bind_cols(MeltB2_20, MeltB3_20[, -c(1:4)], MeltB4_20[, -c(1:4)], MeltB8_20[, -c(1:4)], MeltNDVI_20[, -c(1:4)])
## New names:
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
colnames(kaki_def20)[6] <- "B03"
colnames(kaki_def20)[7] <- "B04"
colnames(kaki_def20)[8] <- "B08"
colnames(kaki_def20)[9] <- "NDVI"
kaki_def20$GC<-as.factor(kaki_def20$GC)
kaki_def20$FieD<-as.factor(kaki_def20$FieD)
kaki_def20$PID<-as.factor(kaki_def20$PID)
# MELT, YEAR 2021
kaki_21_B2 <- kaki_21_TOT [,grep("^B02",colnames(kaki_21_TOT))]
kaki_21_B3 <- kaki_21_TOT [,grep("^B03",colnames(kaki_21_TOT))]
kaki_21_B4 <- kaki_21_TOT [,grep("^B04",colnames(kaki_21_TOT))]
kaki_21_B8 <- kaki_21_TOT [,grep("^B08",colnames(kaki_21_TOT))]
kaki_21_NDVI <- kaki_21_TOT [,grep("^NDVI",colnames(kaki_21_TOT))]
kaki_21_B2_2 <- bind_cols(kaki_21_Muestreo,kaki_21_B2)
kaki_21_B3_2 <- bind_cols(kaki_21_Muestreo,kaki_21_B3)
kaki_21_B4_2 <- bind_cols(kaki_21_Muestreo,kaki_21_B4)
kaki_21_B8_2 <- bind_cols(kaki_21_Muestreo,kaki_21_B8)
kaki_21_NDVI_2 <- bind_cols(kaki_21_Muestreo,kaki_21_NDVI)
MeltB2_21 <- melt(kaki_21_B2_2)
## Using GC, FieD, PID as id variables
colnames(MeltB2_21)[5] <- "B02"
colnames(MeltB2_21)[4] <- "Fecha"
MeltB2_21$Fecha <- sub("B02_", "", MeltB2_21$Fecha)
MeltB3_21 <- melt(kaki_21_B3_2)
## Using GC, FieD, PID as id variables
colnames(MeltB3_21)[5] <- "B03"
colnames(MeltB3_21)[4] <- "Fecha"
MeltB3_21$Fecha <- sub("B03_", "", MeltB3_21$Fecha)
MeltB4_21 <- melt(kaki_21_B4_2)
## Using GC, FieD, PID as id variables
colnames(MeltB4_21)[5] <- "B04"
colnames(MeltB4_21)[4] <- "Fecha"
MeltB4_21$Fecha <- sub("B04_", "", MeltB4_21$Fecha)
MeltB8_21 <- melt(kaki_21_B8_2)
## Using GC, FieD, PID as id variables
colnames(MeltB8_21)[5] <- "B08"
colnames(MeltB8_21)[4] <- "Fecha"
MeltB8_21$Fecha <- sub("B08_", "", MeltB8_21$Fecha)
MeltNDVI_21 <- melt(kaki_21_NDVI_2)
## Using GC, FieD, PID as id variables
colnames(MeltNDVI_21)[5] <- "NDVI"
colnames(MeltNDVI_21)[4] <- "Fecha"
MeltNDVI_21$Fecha <- sub("NDVI_", "", MeltNDVI_21$Fecha)
kaki_def21 <- dplyr::bind_cols(MeltB2_21, MeltB3_21[, -c(1:4)], MeltB4_21[, -c(1:4)], MeltB8_21[, -c(1:4)], MeltNDVI_21[, -c(1:4)])
## New names:
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
colnames(kaki_def21)[6] <- "B03"
colnames(kaki_def21)[7] <- "B04"
colnames(kaki_def21)[8] <- "B08"
colnames(kaki_def21)[9] <- "NDVI"
kaki_def21$GC<-as.factor(kaki_def21$GC)
kaki_def21$FieD<-as.factor(kaki_def21$FieD)
kaki_def21$PID<-as.factor(kaki_def21$PID)
# MELT, YEAR 2022
kaki_22_B2 <- kaki_22_TOT [,grep("^B02",colnames(kaki_22_TOT))]
kaki_22_B3 <- kaki_22_TOT [,grep("^B03",colnames(kaki_22_TOT))]
kaki_22_B4 <- kaki_22_TOT [,grep("^B04",colnames(kaki_22_TOT))]
kaki_22_B8 <- kaki_22_TOT [,grep("^B08",colnames(kaki_22_TOT))]
kaki_22_NDVI <- kaki_22_TOT [,grep("^NDVI",colnames(kaki_22_TOT))]
kaki_22_B2_2 <- bind_cols(kaki_22_Muestreo,kaki_22_B2)
kaki_22_B3_2 <- bind_cols(kaki_22_Muestreo,kaki_22_B3)
kaki_22_B4_2 <- bind_cols(kaki_22_Muestreo,kaki_22_B4)
kaki_22_B8_2 <- bind_cols(kaki_22_Muestreo,kaki_22_B8)
kaki_22_NDVI_2 <- bind_cols(kaki_22_Muestreo,kaki_22_NDVI)
MeltB2_22 <- melt(kaki_22_B2_2)
## Using GC, FieD, PID as id variables
colnames(MeltB2_22)[5] <- "B02"
colnames(MeltB2_22)[4] <- "Fecha"
MeltB2_22$Fecha <- sub("B02_", "", MeltB2_22$Fecha)
MeltB3_22 <- melt(kaki_22_B3_2)
## Using GC, FieD, PID as id variables
colnames(MeltB3_22)[5] <- "B03"
colnames(MeltB3_22)[4] <- "Fecha"
MeltB3_22$Fecha <- sub("B03_", "", MeltB3_22$Fecha)
MeltB4_22 <- melt(kaki_22_B4_2)
## Using GC, FieD, PID as id variables
colnames(MeltB4_22)[5] <- "B04"
colnames(MeltB4_22)[4] <- "Fecha"
MeltB4_22$Fecha <- sub("B04_", "", MeltB4_22$Fecha)
MeltB8_22 <- melt(kaki_22_B8_2)
## Using GC, FieD, PID as id variables
colnames(MeltB8_22)[5] <- "B08"
colnames(MeltB8_22)[4] <- "Fecha"
MeltB8_22$Fecha <- sub("B08_", "", MeltB8_22$Fecha)
MeltNDVI_22 <- melt(kaki_22_NDVI_2)
## Using GC, FieD, PID as id variables
colnames(MeltNDVI_22)[5] <- "NDVI"
colnames(MeltNDVI_22)[4] <- "Fecha"
MeltNDVI_22$Fecha <- sub("NDVI_", "", MeltNDVI_22$Fecha)
kaki_def22 <- dplyr::bind_cols(MeltB2_22, MeltB3_22[, -c(1:4)], MeltB4_22[, -c(1:4)], MeltB8_22[, -c(1:4)], MeltNDVI_22[, -c(1:4)])
## New names:
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
colnames(kaki_def22)[6] <- "B03"
colnames(kaki_def22)[7] <- "B04"
colnames(kaki_def22)[8] <- "B08"
colnames(kaki_def22)[9] <- "NDVI"
kaki_def22$GC<-as.factor(kaki_def22$GC)
kaki_def22$FieD<-as.factor(kaki_def22$FieD)
kaki_def22$PID<-as.factor(kaki_def22$PID)
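For reference, the five melt-and-bind steps per year could also be collapsed into one long-format pass. A sketch under the same BAND_YYMMDD naming assumption; melt_year is a hypothetical helper, and it assumes each (GC, FieD, PID) combination occurs once per date so that dcast() needs no aggregation:
melt_year <- function(df_tot, bands = c("B02", "B03", "B04", "B08", "NDVI"),
                      id_cols = c("GC", "FieD", "PID")) {
  keep <- names(df_tot) %in% id_cols |
    grepl(paste0("^(", paste(bands, collapse = "|"), ")_"), names(df_tot))
  long <- reshape2::melt(df_tot[, keep], id.vars = id_cols, variable.name = "VarFecha")
  parts <- str_split_fixed(as.character(long$VarFecha), "_", 2)  # "B02_190411" -> band, date
  long$Banda <- parts[, 1]
  long$Fecha <- parts[, 2]
  # Back to one row per plot and date, one column per band
  reshape2::dcast(long, GC + FieD + PID + Fecha ~ Banda, value.var = "value")
}
# kaki_def19_alt <- melt_year(kaki_19_TOT)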
In this step the final dataset is obtained, covering all years, with the month attached to each observation.
kakiDEF <- rbind(kaki_def19, kaki_def20, kaki_def21, kaki_def22)
kakiDEF$MES <- str_sub(kakiDEF$Fecha, start = 3, end = 4)  # characters 3-4 of "YYMMDD" = month
kakiDEF$MES<-as.factor(kakiDEF$MES)
kakiDEF$PID<-as.character(kakiDEF$PID)
kakiDEF$FieD<-as.character(kakiDEF$FieD)
kakiDEF_2 <- kakiDEF[, -c(2, 3)]  # drop FieD and PID
str(kakiDEF_2)
## 'data.frame': 171732 obs. of 8 variables:
## $ GC : Factor w/ 2 levels "1","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ Fecha: chr "190411" "190411" "190411" "190411" ...
## $ B02 : num 655 658 630 640 590 616 727 712 680 648 ...
## $ B03 : num 1062 1050 995 1024 991 ...
## $ B04 : num 1288 1324 1276 1266 1120 ...
## $ B08 : num 3130 3024 2958 2890 2996 ...
## $ NDVI : num 0.417 0.391 0.397 0.391 0.456 ...
## $ MES : Factor w/ 6 levels "04","05","06",..: 1 1 1 1 1 1 1 1 1 1 ...
table(kakiDEF$GC, kakiDEF$MES)
##
## 04 05 06 07 08 09
## 1 11100 13875 22200 36075 27750 11100
## 3 4512 5640 9024 14664 11280 4512
Because the kakiDEF dataset contains far more GC = 1 cases than GC = 3, the algorithms would not classify well, so the dataset must be balanced to the same number of GC cases per MES.
Duplicated rows are also checked for.
# Undersample GC = 1 down to the GC = 3 count for each month (note: unseeded sampling)
Mes4_1 <- kakiDEF[which(kakiDEF$MES=="04" & kakiDEF$GC == "1"),]
index4 <- sample(1:nrow(Mes4_1), size=4512, replace = F)
Mes4_1Red <- Mes4_1[index4,]
Mes5_1 <- kakiDEF[which(kakiDEF$MES=="05" & kakiDEF$GC == "1"),]
index5 <- as.numeric(sample(1:nrow(Mes5_1), size=5640, replace = F))
Mes5_1Red <- Mes5_1[index5,]
Mes6_1 <- kakiDEF[which(kakiDEF$MES=="06" & kakiDEF$GC == "1"),]
index6 <- as.numeric(sample(1:nrow(Mes6_1), size=9024, replace = F))
Mes6_1Red <- Mes6_1[index6,]
Mes7_1 <- kakiDEF[which(kakiDEF$MES=="07" & kakiDEF$GC == "1"),]
index7 <- as.numeric(sample(1:nrow(Mes7_1), size=14664, replace = F))
Mes7_1Red <- Mes7_1[index7,]
Mes8_1 <- kakiDEF[which(kakiDEF$MES=="08" & kakiDEF$GC == "1"),]
index8 <- as.numeric(sample(1:nrow(Mes8_1), size=11280, replace = F))
Mes8_1Red <- Mes8_1[index8,]
Mes9_1 <- kakiDEF[which(kakiDEF$MES=="09" & kakiDEF$GC == "1"),]
index9 <- as.numeric(sample(1:nrow(Mes9_1), size=4512, replace = F))
Mes9_1Red <- Mes9_1[index9,]
gc3 <- kakiDEF[which(kakiDEF$GC == "3"),]
kakiDEFbalanceV2 <- rbind(gc3,Mes4_1Red, Mes5_1Red, Mes6_1Red, Mes7_1Red, Mes8_1Red, Mes9_1Red)
kakiDEFbalance <- kakiDEFbalanceV2[, -c(2, 3, 4)]  # drop FieD, PID and Fecha
dim(kakiDEFbalance)
## [1] 99264 7
table(kakiDEFbalance$GC, kakiDEFbalance$MES)
##
## 04 05 06 07 08 09
## 1 4512 5640 9024 14664 11280 4512
## 3 4512 5640 9024 14664 11280 4512
duplicated_rows <- kakiDEFbalance[duplicated(kakiDEFbalance), ]
duplicated_rows_sin_balance <- kakiDEF[duplicated(kakiDEF), ]
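The same per-month undersampling can be written more compactly with grouped dplyr verbs. A sketch; the seed is a hypothetical addition (the sampling above is unseeded, so its exact rows vary between runs):
set.seed(1)  # hypothetical seed, added only for reproducibility
kakiDEFbalance_alt <- kakiDEF %>%
  group_by(MES) %>%
  group_modify(~ {
    n_min <- min(table(.x$GC))  # size of the minority class in this month
    .x %>% group_by(GC) %>% slice_sample(n = n_min) %>% ungroup()
  }) %>%
  ungroup()
# table(kakiDEFbalance_alt$GC, kakiDEFbalance_alt$MES) should match the balanced counts above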
The method chosen for the unsupervised analysis is PCA (principal component analysis).
library(Matrix)
library(scatterplot3d)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(dplyr)
library(factoextra)
library(cluster)
library(simstudy)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ data.table::first() masks dplyr::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ data.table::last() masks dplyr::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ car::recode() masks dplyr::recode()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::some() masks car::some()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:data.table':
##
## first, last
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
library(corrr)
library(dbscan)
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(FactoMineR)
A <- kakiDEFbalance[, -c(1, 7)]  # drop GC and MES, keeping the five numeric predictors
table(kakiDEFbalance$GC)
##
## 1 3
## 49632 49632
A <- mutate_all(A, as.numeric)
cor(A)
## B02 B03 B04 B08 NDVI
## B02 1.0000000 0.9777877 0.9050390 0.4520237 -0.6975540
## B03 0.9777877 1.0000000 0.9384456 0.4989298 -0.7196764
## B04 0.9050390 0.9384456 1.0000000 0.3761287 -0.8651939
## B08 0.4520237 0.4989298 0.3761287 1.0000000 0.1027268
## NDVI -0.6975540 -0.7196764 -0.8651939 0.1027268 1.0000000
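Since PerformanceAnalytics is already loaded, the strong collinearity among the bands can also be inspected graphically. A quick sketch; the subsample size of 2000 is an arbitrary choice to keep the plot fast:
idx_plot <- sample(nrow(A), 2000)  # subsample rows so the pairs plot renders quickly
chart.Correlation(A[idx_plot, ], histogram = TRUE)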
duplicated_rows_A <- A[duplicated(A), ]
pca <- prcomp(A, scale = TRUE)  # PCA on standardized variables
eigenvalues<-get_eig(pca)
eigenvalues
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 3.71114035 74.2228069 74.22281
## Dim.2 1.11002152 22.2004305 96.42324
## Dim.3 0.15280323 3.0560647 99.47930
## Dim.4 0.01484578 0.2969155 99.77622
## Dim.5 0.01118912 0.2237824 100.00000
fviz_eig(pca,choice = "variance", addlabels = TRUE)
Autovectores <- pca$rotation  # eigenvectors (variable loadings on each PC)
head(Autovectores)
## PC1 PC2 PC3 PC4 PC5
## B02 0.5006346 -0.06309185 -0.6278023 -0.59265380 -0.003191517
## B03 0.5107112 -0.08545273 -0.3096415 0.76964238 -0.208912933
## B04 0.5095898 0.09064235 0.3660000 0.02894989 0.772857129
## B08 0.2323077 -0.83451204 0.4101063 -0.14803723 -0.243968428
## NDVI -0.4181990 -0.53300175 -0.4558983 0.18346379 0.547280974
result.var <- get_pca_var(pca)
Loadings=result.var$coord
result.var$cor
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## B02 0.9644386 -0.06647203 -0.2454082 -0.072210861 -0.0003375943
## B03 0.9838505 -0.09003091 -0.1210390 0.093775723 -0.0220985251
## B04 0.9816900 0.09549857 0.1430696 0.003527349 0.0817517730
## B08 0.4475250 -0.87922152 0.1603108 -0.018037336 -0.0258066475
## NDVI -0.8056320 -0.56155763 -0.1782109 0.022353823 0.0578906350
fviz_pca_var(pca,geom=c("arrow", "text"),col.var="contrib",gradient.cols=c("#00AFBB","#E7B800","#FC4E07"),repel = TRUE)
Contribuciones=result.var$contrib
# Visualize the contribution (%) of each variable to the extracted components
a=fviz_contrib(pca, choice = "var", axes = 1, top = 18)+
theme(axis.text.x = element_text(angle=90))
b=fviz_contrib(pca, choice = "var", axes = 2, top = 18)+
theme(axis.text.x = element_text(angle=90))
grid.arrange(a,b, ncol=1, top='Contribution of the variables to the first two PCs')
table(kakiDEFbalance$GC)
##
## 1 3
## 49632 49632
fviz_pca_ind(pca,geom=c("point"),alpha.ind = 0.1, xlab="PC 1", ylab ="PC 2", col.ind=factor(kakiDEFbalance$GC))
In conclusion, PCA is not useful here: as the plot above shows, GC levels 1 and 3 are influenced by the same variables.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(modelr)
library(data.table)
library(dplyr)
library(ggplot2)
library(rpart.plot)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(lattice)
str(kakiDEF)
## 'data.frame': 171732 obs. of 10 variables:
## $ GC : Factor w/ 2 levels "1","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ FieD : chr "55" "55" "55" "55" ...
## $ PID : chr "1239" "1240" "1233" "1234" ...
## $ Fecha: chr "190411" "190411" "190411" "190411" ...
## $ B02 : num 655 658 630 640 590 616 727 712 680 648 ...
## $ B03 : num 1062 1050 995 1024 991 ...
## $ B04 : num 1288 1324 1276 1266 1120 ...
## $ B08 : num 3130 3024 2958 2890 2996 ...
## $ NDVI : num 0.417 0.391 0.397 0.391 0.456 ...
## $ MES : Factor w/ 6 levels "04","05","06",..: 1 1 1 1 1 1 1 1 1 1 ...
set.seed(30)
index <- createDataPartition(kakiDEFbalance$GC, p = 0.7, list = FALSE)
train_kakiDEFbalance <- kakiDEFbalance[index, ]
test_kakiDEFbalance<- kakiDEFbalance[-index, ]
nrow(train_kakiDEFbalance)
## [1] 69486
head(train_kakiDEFbalance)
## GC B02 B03 B04 B08 NDVI MES
## 2776 3 561 1008 1288 3392 0.4495726 04
## 2777 3 774 1278 1582 3588 0.3880077 04
## 2778 3 741 1220 1406 3544 0.4319192 04
## 2779 3 597 1064 1198 3474 0.4871575 04
## 2780 3 471 1036 1210 3398 0.4748264 04
## 2781 3 499 949 1168 3326 0.4801958 04
nrow(test_kakiDEFbalance)
## [1] 29778
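createDataPartition() stratifies the split on GC, which can be verified directly; a quick check:
prop.table(table(train_kakiDEFbalance$GC))  # ~0.5 / 0.5, matching the balanced design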
Optimization of CP, the complexity parameter of the tree:
CTree <- rpart(formula = GC ~ .,
data = train_kakiDEFbalance,
method = "class",
control = rpart.control(minsplit = 200,
minbucket = 100,
cp = 0.005,
xval = 10),
parms = list(split = "gini"))
printcp(CTree)
##
## Classification tree:
## rpart(formula = GC ~ ., data = train_kakiDEFbalance, method = "class",
## parms = list(split = "gini"), control = rpart.control(minsplit = 200,
## minbucket = 100, cp = 0.005, xval = 10))
##
## Variables actually used in tree construction:
## [1] B02 B03 B04 B08 MES NDVI
##
## Root node error: 34743/69486 = 0.5
##
## n= 69486
##
## CP nsplit rel error xerror xstd
## 1 0.1651268 0 1.00000 1.00662 0.0037935
## 2 0.1044527 1 0.83487 0.83683 0.0037428
## 3 0.0105345 2 0.73042 0.73405 0.0036570
## 4 0.0092393 3 0.71989 0.69623 0.0036143
## 5 0.0056702 8 0.66718 0.68022 0.0035944
## 6 0.0050000 9 0.66151 0.67697 0.0035902
plotcp(CTree)
rpart.plot(CTree)
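A standard follow-up to printcp()/plotcp() is to prune the tree at the cp value with the lowest cross-validated error. A sketch; in the cptable above the minimum xerror sits at the last row, so pruning changes little here:
best_cp <- CTree$cptable[which.min(CTree$cptable[, "xerror"]), "CP"]
CTree_pruned <- prune(CTree, cp = best_cp)
rpart.plot(CTree_pruned)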
CTree_2 <- rpart(formula = GC ~ .,
                 data = train_kakiDEFbalance,
                 method = "class",
                 control = rpart.control(minsplit = 200,
                                         minbucket = 100,
                                         cp = 0.005,
                                         xval = 0),   # no cross-validation this time
                 parms = list(split = "gini"))
rpart.plot(CTree_2)
Variable importance:
varImp(CTree_2)
## Overall
## B02 4018.3774
## B03 4117.1170
## B04 3649.3177
## B08 3763.1234
## MES 204.6096
## NDVI 2551.6051
str(train_kakiDEFbalance)
## 'data.frame': 69486 obs. of 7 variables:
## $ GC : Factor w/ 2 levels "1","3": 2 2 2 2 2 2 2 2 2 2 ...
## $ B02 : num 561 774 741 597 471 499 453 907 728 898 ...
## $ B03 : num 1008 1278 1220 1064 1036 ...
## $ B04 : num 1288 1582 1406 1198 1210 ...
## $ B08 : num 3392 3588 3544 3474 3398 ...
## $ NDVI: num 0.45 0.388 0.432 0.487 0.475 ...
## $ MES : Factor w/ 6 levels "04","05","06",..: 1 1 1 1 1 1 1 1 1 1 ...
classes_train <- predict(CTree_2, train_kakiDEFbalance)
str(classes_train)
## num [1:69486, 1:2] 0.367 0.367 0.367 0.367 0.367 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:69486] "2776" "2777" "2778" "2779" ...
## ..$ : chr [1:2] "1" "3"
classes_train<-as.data.frame(classes_train)
colnames(classes_train) <- c("NOAFECTADO","AFECTADO")
classes_test <- predict(CTree_2, test_kakiDEFbalance)
str(classes_test)
## num [1:29778, 1:2] 0.367 0.44 0.44 0.367 0.367 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:29778] "2782" "2785" "2786" "2796" ...
## ..$ : chr [1:2] "1" "3"
classes_test<-as.data.frame(classes_test)
colnames(classes_test) <- c("NOAFECTADO","AFECTADO")
Results:
GCpred <- as.factor(ifelse(classes_train$NOAFECTADO >= 0.5, "1", "3"))  # 0.5 probability threshold
confusionMatrix(GCpred, train_kakiDEFbalance$GC)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 3
## 1 18242 6482
## 3 16501 28261
##
## Accuracy : 0.6692
## 95% CI : (0.6657, 0.6727)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3385
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5251
## Specificity : 0.8134
## Pos Pred Value : 0.7378
## Neg Pred Value : 0.6314
## Prevalence : 0.5000
## Detection Rate : 0.2625
## Detection Prevalence : 0.3558
## Balanced Accuracy : 0.6692
##
## 'Positive' Class : 1
##
GCpredTEST<- as.factor(ifelse(classes_test$NOAFECTADO >= 0.5, "1", "3"))
confusionMatrix(GCpredTEST, test_kakiDEFbalance$GC)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 3
## 1 7851 2748
## 3 7038 12141
##
## Accuracy : 0.6714
## 95% CI : (0.666, 0.6767)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3427
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5273
## Specificity : 0.8154
## Pos Pred Value : 0.7407
## Neg Pred Value : 0.6330
## Prevalence : 0.5000
## Detection Rate : 0.2637
## Detection Prevalence : 0.3559
## Balanced Accuracy : 0.6714
##
## 'Positive' Class : 1
##
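Since pROC is loaded, the probability output also supports a threshold-independent evaluation on the test set. A sketch using the predicted probability of class 3 (the AFECTADO column built above); roc_test is a hypothetical name:
roc_test <- roc(response = test_kakiDEFbalance$GC,
                predictor = classes_test$AFECTADO,
                levels = c("1", "3"))
auc(roc_test)   # area under the ROC curve
plot(roc_test)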
library(adabag)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(caret)
library(tree)
library(ada)
library(rpart.plot)
library(ggplot2)
library(lattice)
library(doParallel)
library(iterators)
library(parallel)
library(rpart)
str(kakiDEF)
## 'data.frame': 171732 obs. of 10 variables:
## $ GC : Factor w/ 2 levels "1","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ FieD : chr "55" "55" "55" "55" ...
## $ PID : chr "1239" "1240" "1233" "1234" ...
## $ Fecha: chr "190411" "190411" "190411" "190411" ...
## $ B02 : num 655 658 630 640 590 616 727 712 680 648 ...
## $ B03 : num 1062 1050 995 1024 991 ...
## $ B04 : num 1288 1324 1276 1266 1120 ...
## $ B08 : num 3130 3024 2958 2890 2996 ...
## $ NDVI : num 0.417 0.391 0.397 0.391 0.456 ...
## $ MES : Factor w/ 6 levels "04","05","06",..: 1 1 1 1 1 1 1 1 1 1 ...
kakiDEFsolo19 <- kakiDEF[substr(kakiDEF$Fecha, 1, 2) == "19", ]  # subset with 2019 observations only
kakiDEFbalance$GC <- as.factor(kakiDEFbalance$GC)
kakiDEFbalance$B02 <- as.numeric(kakiDEFbalance$B02)
kakiDEFbalance$B03 <- as.numeric(kakiDEFbalance$B03)
kakiDEFbalance$B04 <- as.numeric(kakiDEFbalance$B04)
kakiDEFbalance$B08 <- as.numeric(kakiDEFbalance$B08)
kakiDEFbalance$NDVI <- as.numeric(kakiDEFbalance$NDVI)
kakiDEFbalance$MES <- as.factor(kakiDEFbalance$MES)
set.seed(14)
index <- createDataPartition(kakiDEFbalance$GC, p = 0.70, list = FALSE)
train <- kakiDEFbalance[index, ]
test <- kakiDEFbalance[-index, ]
nrow(train)
## [1] 69486
nrow(test)
## [1] 29778
tuneGrid <- expand.grid(
iter = c(5, 10, 15),
maxdepth = c(1, 2),
nu = c(0.01, 0.001) )
ctrl <- trainControl(method = "cv", number = 5)  # defines the 5-fold cross-validation setup
adaboost_model <- train(GC~ ., data = train, method = "ada", tuneGrid = tuneGrid, trControl = ctrl)
adaboost_model
## Boosted Classification Trees
##
## 69486 samples
## 6 predictor
## 2 classes: '1', '3'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 55588, 55588, 55589, 55590, 55589
## Resampling results across tuning parameters:
##
## nu maxdepth iter Accuracy Kappa
## 0.001 1 5 0.5808794 0.1617582
## 0.001 1 10 0.5662005 0.1324044
## 0.001 1 15 0.5726911 0.1453820
## 0.001 2 5 0.6355811 0.2711618
## 0.001 2 10 0.6364446 0.2728885
## 0.001 2 15 0.6367180 0.2734355
## 0.010 1 5 0.5673663 0.1347288
## 0.010 1 10 0.5608905 0.1217773
## 0.010 1 15 0.5594081 0.1188123
## 0.010 2 5 0.6356098 0.2712190
## 0.010 2 10 0.6349046 0.2698086
## 0.010 2 15 0.6341994 0.2683981
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were iter = 15, maxdepth = 2 and nu = 0.001.
optimal_params<-adaboost_model[["bestTune"]]
optimal_params  # extract the optimal hyperparameters
## iter maxdepth nu
## 6 15 2 0.001
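The tuned values can be fed back into ada() programmatically instead of being retyped. A sketch (note that the final model below is fitted with iter = 5 even though the tuned optimum was iter = 15):
final_from_tune <- ada(GC ~ .,
                       data = train,
                       iter = optimal_params$iter,  # 15
                       nu = optimal_params$nu,      # 0.001
                       control = rpart.control(maxdepth = optimal_params$maxdepth))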
Results:
final_adaboost_model <- ada(
  GC~ .,
  data = train,
  iter = 5,   # note: the tuned optimum above was iter = 15
  nu = 0.001, # learning rate: shrinkage applied to each boosting iteration
  control = rpart.control(maxdepth = 2))
predictions_train <- predict(final_adaboost_model, newdata = train)
confusion_matrix_train <- confusionMatrix(predictions_train, train$GC)
print(confusion_matrix_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 3
## 1 12556 3471
## 3 22187 31272
##
## Accuracy : 0.6307
## 95% CI : (0.6271, 0.6343)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2615
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3614
## Specificity : 0.9001
## Pos Pred Value : 0.7834
## Neg Pred Value : 0.5850
## Prevalence : 0.5000
## Detection Rate : 0.1807
## Detection Prevalence : 0.2307
## Balanced Accuracy : 0.6307
##
## 'Positive' Class : 1
##
predictions <- predict(final_adaboost_model, newdata = test)
confusion_matrix <- confusionMatrix(predictions, test$GC)
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 3
## 1 5512 1500
## 3 9377 13389
##
## Accuracy : 0.6347
## 95% CI : (0.6292, 0.6402)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2695
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3702
## Specificity : 0.8993
## Pos Pred Value : 0.7861
## Neg Pred Value : 0.5881
## Prevalence : 0.5000
## Detection Rate : 0.1851
## Detection Prevalence : 0.2355
## Balanced Accuracy : 0.6347
##
## 'Positive' Class : 1
##
library(randomForest)
library (caret)
library(adabag)
library(tree)
set.seed(33)
index <- createDataPartition(kakiDEFbalance$GC, p = 0.70, list = FALSE)
train <- kakiDEFbalance[index, ]
str(train)
## 'data.frame': 69486 obs. of 7 variables:
## $ GC : Factor w/ 2 levels "1","3": 2 2 2 2 2 2 2 2 2 2 ...
## $ B02 : num 741 597 471 499 464 453 907 846 802 898 ...
## $ B03 : num 1220 1064 1036 949 925 ...
## $ B04 : num 1406 1198 1210 1168 1064 ...
## $ B08 : num 3544 3474 3398 3326 3242 ...
## $ NDVI: num 0.432 0.487 0.475 0.48 0.506 ...
## $ MES : Factor w/ 6 levels "04","05","06",..: 1 1 1 1 1 1 1 1 1 1 ...
table(train$GC)
##
## 1 3
## 34743 34743
test <- kakiDEFbalance[-index, ]
str(test)
## 'data.frame': 29778 obs. of 7 variables:
## $ GC : Factor w/ 2 levels "1","3": 2 2 2 2 2 2 2 2 2 2 ...
## $ B02 : num 561 774 728 756 613 546 529 425 753 711 ...
## $ B03 : num 1008 1278 1236 1246 1110 ...
## $ B04 : num 1288 1582 1460 1858 1516 ...
## $ B08 : num 3392 3588 3602 3520 3494 ...
## $ NDVI: num 0.45 0.388 0.423 0.309 0.395 ...
## $ MES : Factor w/ 6 levels "04","05","06",..: 1 1 1 1 1 1 1 1 1 1 ...
table(test$GC)
##
## 1 3
## 14889 14889
nrow(train)
## [1] 69486
rfT <- randomForest(GC ~ ., data = train, ntree=500, norm.votes=FALSE, do.trace=10,importance=TRUE)
## ntree OOB 1 2
## 10: 28.45% 27.39% 29.50%
## 20: 26.32% 26.46% 26.19%
## 30: 25.28% 26.19% 24.37%
## 40: 24.82% 26.09% 23.54%
## 50: 24.44% 25.94% 22.94%
## 60: 24.27% 25.88% 22.66%
## 70: 24.13% 25.90% 22.36%
## 80: 24.01% 25.84% 22.18%
## 90: 23.92% 25.71% 22.12%
## 100: 23.82% 25.71% 21.94%
## 110: 23.80% 25.73% 21.87%
## 120: 23.76% 25.65% 21.87%
## 130: 23.69% 25.62% 21.76%
## 140: 23.65% 25.64% 21.66%
## 150: 23.57% 25.59% 21.54%
## 160: 23.55% 25.57% 21.52%
## 170: 23.52% 25.56% 21.49%
## 180: 23.47% 25.60% 21.34%
## 190: 23.51% 25.68% 21.33%
## 200: 23.45% 25.66% 21.25%
## 210: 23.45% 25.64% 21.26%
## 220: 23.45% 25.64% 21.26%
## 230: 23.37% 25.64% 21.11%
## 240: 23.37% 25.62% 21.13%
## 250: 23.33% 25.52% 21.13%
## 260: 23.33% 25.53% 21.13%
## 270: 23.32% 25.54% 21.11%
## 280: 23.32% 25.57% 21.07%
## 290: 23.36% 25.58% 21.13%
## 300: 23.34% 25.62% 21.05%
## 310: 23.31% 25.57% 21.06%
## 320: 23.32% 25.61% 21.03%
## 330: 23.29% 25.56% 21.01%
## 340: 23.33% 25.61% 21.06%
## 350: 23.33% 25.62% 21.04%
## 360: 23.32% 25.56% 21.07%
## 370: 23.28% 25.53% 21.03%
## 380: 23.31% 25.53% 21.09%
## 390: 23.25% 25.47% 21.04%
## 400: 23.29% 25.52% 21.06%
## 410: 23.31% 25.53% 21.09%
## 420: 23.28% 25.51% 21.06%
## 430: 23.27% 25.57% 20.96%
## 440: 23.28% 25.52% 21.04%
## 450: 23.26% 25.52% 21.00%
## 460: 23.25% 25.52% 20.99%
## 470: 23.25% 25.48% 21.02%
## 480: 23.26% 25.52% 21.00%
## 490: 23.26% 25.49% 21.02%
## 500: 23.24% 25.52% 20.97%
rfT
##
## Call:
## randomForest(formula = GC ~ ., data = train, ntree = 500, norm.votes = FALSE, do.trace = 10, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 23.24%
## Confusion matrix:
## 1 3 class.error
## 1 25878 8865 0.2551593
## 3 7284 27459 0.2096537
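Because the forest was grown with importance = TRUE, per-variable importance measures are directly available; a quick sketch:
importance(rfT)  # mean decrease in accuracy and in Gini per predictor
varImpPlot(rfT, main = "Variable importance (rfT)")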
oob.error.data <- data.frame(
Trees=rep(1:nrow(rfT$err.rate), times=3),
Type=rep(c("OOB", "1", "3"), each=nrow(rfT$err.rate)),
Error=c(rfT$err.rate[,"OOB"],
rfT$err.rate[,"1"],
rfT$err.rate[,"3"]))
ggplot(data=oob.error.data, aes(x=Trees, y=Error)) +
geom_line(aes(color=Type))+theme_minimal()
Obtaining the number of trees with the minimum OOB error:
optimal_ntree <- which.min(rfT$err.rate[, "OOB"])
cat("El numero de arboles optimo es:", optimal_ntree, "\n")
## El numero de arboles optimo es: 462
set.seed(123)
MtryTune<-tuneRF(x=train[,-1],
y=train$GC,
mtryStart=3,
ntreeTry=optimal_ntree,
stepFactor=2,
improve=0.001,
trace=TRUE,
plot=TRUE,
doBest=FALSE)
## mtry = 3 OOB error = 23.27%
## Searching left ...
## mtry = 2 OOB error = 23.25%
## 0.0008041569 0.001
## Searching right ...
## mtry = 6 OOB error = 23.44%
## -0.007422987 0.001
Best value of mtry:
best_mtry <- MtryTune[which.min(MtryTune[,2]),1]
cat("El mejor mtry es:", best_mtry, "\n")
## El mejor mtry es: 2
Results:
FinalRF <- randomForest(GC ~ .,
data=train,
ntree=optimal_ntree,
importance=TRUE,
mtry=best_mtry,
norm.votes=FALSE,
do.trace=10
)
## ntree OOB 1 2
## 10: 28.42% 27.10% 29.74%
## 20: 26.09% 26.32% 25.85%
## 30: 25.23% 26.07% 24.39%
## 40: 24.61% 25.75% 23.47%
## 50: 24.51% 25.86% 23.16%
## 60: 24.29% 25.74% 22.84%
## 70: 24.11% 25.91% 22.30%
## 80: 23.97% 25.83% 22.12%
## 90: 23.85% 25.77% 21.93%
## 100: 23.79% 25.76% 21.81%
## 110: 23.73% 25.72% 21.74%
## 120: 23.68% 25.76% 21.59%
## 130: 23.60% 25.65% 21.54%
## 140: 23.65% 25.67% 21.62%
## 150: 23.66% 25.76% 21.55%
## 160: 23.54% 25.65% 21.43%
## 170: 23.52% 25.65% 21.38%
## 180: 23.50% 25.65% 21.35%
## 190: 23.46% 25.65% 21.26%
## 200: 23.46% 25.69% 21.22%
## 210: 23.42% 25.68% 21.16%
## 220: 23.44% 25.68% 21.19%
## 230: 23.44% 25.69% 21.18%
## 240: 23.39% 25.65% 21.14%
## 250: 23.36% 25.64% 21.07%
## 260: 23.36% 25.71% 21.01%
## 270: 23.31% 25.64% 20.99%
## 280: 23.37% 25.74% 21.01%
## 290: 23.39% 25.76% 21.02%
## 300: 23.36% 25.68% 21.05%
## 310: 23.36% 25.69% 21.04%
## 320: 23.36% 25.72% 20.99%
## 330: 23.38% 25.75% 21.01%
## 340: 23.37% 25.73% 21.02%
## 350: 23.41% 25.75% 21.06%
## 360: 23.37% 25.75% 20.98%
## 370: 23.38% 25.79% 20.97%
## 380: 23.35% 25.75% 20.95%
## 390: 23.32% 25.68% 20.95%
## 400: 23.29% 25.66% 20.92%
## 410: 23.35% 25.75% 20.94%
## 420: 23.30% 25.71% 20.88%
## 430: 23.35% 25.77% 20.93%
## 440: 23.33% 25.78% 20.87%
## 450: 23.31% 25.69% 20.94%
## 460: 23.37% 25.77% 20.98%
classes_test<-predict(FinalRF, test)
confusionMatrix(classes_test, test$GC)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 3
## 1 10989 3068
## 3 3900 11821
##
## Accuracy : 0.766
## 95% CI : (0.7612, 0.7708)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.532
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7381
## Specificity : 0.7939
## Pos Pred Value : 0.7817
## Neg Pred Value : 0.7519
## Prevalence : 0.5000
## Detection Rate : 0.3690
## Detection Prevalence : 0.4721
## Balanced Accuracy : 0.7660
##
## 'Positive' Class : 1
##
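For completeness, the Random Forest training accuracy quoted in the summary table below can be reproduced by predicting on the training data itself (not to be confused with the OOB estimate); a sketch:
classes_train_rf <- predict(FinalRF, newdata = train)  # in-sample predictions, not OOB
mean(classes_train_rf == train$GC)                     # close to 1, unlike the ~77 % test accuracy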
| Algorithm | Accuracy |
|---|---|
| Classification tree (train) | 0.6749 |
| Classification tree (test) | 0.6708 |
| AdaBoost (train) | 0.6451 |
| AdaBoost (test) | 0.6454 |
| Random Forest (train) | 1.0000 |
| Random Forest (test) | 0.7678 |

Note that the Random Forest training accuracy of 1 comes from predicting on the training data itself; the OOB error estimate (around 23 %) is the more realistic measure of its generalization.