Nonlinear regression analysis

Libraries and working directory

setwd("~/R Scripts/Proyecto") # Working directory.

library("pacman") # Load "pacman", used for easier package management.

p_load("base64enc", "htmltools", "mime", "xfun", "prettydoc", "readr", "knitr", "DT", "dplyr", "ggplot2", "plotly", "gganimate", "gifski", "scales", "readxl", "tidyverse", "cluster", "factoextra", "NbClust", "tidyr", "hpackedbubble", "gridExtra", "psych", "GGally", "lmtest", "corrplot", "car", "boot") # Packages required for this analysis.

Download this code

xfun::embed_file("U2A5.Rmd")


Download the data

xfun::embed_file("confirmadosVShogar.csv")


regresion <- read.csv("confirmadosVShogar.csv") # Confirmed cases and residential percent change from baseline.

# poly() builds the polynomial basis; by default it is orthogonal, while raw = TRUE would use raw powers of the predictor.

modelo_poli4 <- lm(Confirmados ~ poly(residential_percent_change_from_baseline, 4), data = regresion)
summary(modelo_poli4)
## 
## Call:
## lm(formula = Confirmados ~ poly(residential_percent_change_from_baseline, 
##     4), data = regresion)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -240.73  -77.13   -9.25   62.44  385.55 
## 
## Coefficients:
##                                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                                           176.221      7.029  25.072  < 2e-16 ***
## poly(residential_percent_change_from_baseline, 4)1   706.136    125.928   5.607 4.48e-08 ***
## poly(residential_percent_change_from_baseline, 4)2  -1101.411   125.928  -8.746  < 2e-16 ***
## poly(residential_percent_change_from_baseline, 4)3   -379.155   125.928  -3.011  0.00281 ** 
## poly(residential_percent_change_from_baseline, 4)4    229.091    125.928   1.819  0.06982 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 125.9 on 316 degrees of freedom
## Multiple R-squared:  0.2758, Adjusted R-squared:  0.2666 
## F-statistic: 30.08 on 4 and 316 DF,  p-value: < 2.2e-16
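As noted in the comment before the fit, poly() defaults to an orthogonal basis: refitting with raw = TRUE changes the individual coefficients but not the fitted curve. A minimal sketch to verify this (the object name modelo_poli4_raw is only illustrative):

modelo_poli4_raw <- lm(Confirmados ~ poly(residential_percent_change_from_baseline, 4, raw = TRUE),
                       data = regresion)
all.equal(fitted(modelo_poli4), fitted(modelo_poli4_raw)) # expected TRUE: identical fitted values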
# INTERPOLATION OF POINTS WITHIN THE RANGE OF THE PREDICTOR
# -----------------------------------------------------------------------------

limites <- range(regresion$residential_percent_change_from_baseline)
nuevos_puntos <- seq(from = limites[1], to = limites[2], by = 1)
nuevos_puntos <- data.frame(residential_percent_change_from_baseline = nuevos_puntos)

# PREDICTION OF THE RESPONSE VARIABLE AND ITS STANDARD ERROR
# -----------------------------------------------------------------------------

predicciones <- predict(modelo_poli4, newdata = nuevos_puntos, se.fit = TRUE)
# 95% LOWER AND UPPER CONFIDENCE BOUNDS
# -----------------------------------------------------------------------------

intervalo_conf <- data.frame(inferior = predicciones$fit -
                                        1.96*predicciones$se.fit,
                             superior = predicciones$fit +
                                        1.96*predicciones$se.fit)
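The interval above is a normal approximation (fit ± 1.96 standard errors). As a cross-check, predict() for lm objects can also return a t-based 95% confidence interval directly; a minimal sketch (the object name intervalo_conf_t is only illustrative):

intervalo_conf_t <- predict(modelo_poli4, newdata = nuevos_puntos,
                            interval = "confidence", level = 0.95)
head(intervalo_conf_t) # matrix with columns fit, lwr and upr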


plot(x = regresion$residential_percent_change_from_baseline, y = regresion$Confirmados, pch = 20, col = "darkgrey")
title("Degree-4 polynomial: Confirmados ~ residential mobility")
points(x = nuevos_puntos$residential_percent_change_from_baseline, y = predicciones$fit, col = "red", pch = 20)
points(x = nuevos_puntos$residential_percent_change_from_baseline, y = intervalo_conf$inferior, col = "blue", pch = 4)
points(x = nuevos_puntos$residential_percent_change_from_baseline, y = intervalo_conf$superior, col = "blue", pch = 4)

plot(x = regresion$residential_percent_change_from_baseline, y = regresion$Confirmados, pch = 20, col = "darkgrey")
title("Degree-4 polynomial: Confirmados ~ residential mobility")
lines(x = nuevos_puntos$residential_percent_change_from_baseline, y = predicciones$fit, col = "red", lwd = 2)
lines(x = nuevos_puntos$residential_percent_change_from_baseline, y = intervalo_conf$inferior, col = "blue", lwd = 2)
lines(x = nuevos_puntos$residential_percent_change_from_baseline, y = intervalo_conf$superior, col = "blue", lwd = 2)

ggplot(data = regresion, aes(x = residential_percent_change_from_baseline, y = Confirmados)) +
  geom_point(color = "grey30", alpha = 0.3) + 
  geom_smooth(method = "lm", formula = y ~ poly(x, 4), color = "red") +
  labs(title = "Degree-4 polynomial: Confirmados ~ residential mobility") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

# Fit polynomial models of degree 1 through 5
# ----------------------------------------------------------------------------
modelo_1 <- lm(Confirmados ~ residential_percent_change_from_baseline, data = regresion)
modelo_2 <- lm(Confirmados ~ poly(residential_percent_change_from_baseline, 2), data = regresion)
modelo_3 <- lm(Confirmados ~ poly(residential_percent_change_from_baseline, 3), data = regresion)
modelo_4 <- lm(Confirmados ~ poly(residential_percent_change_from_baseline, 4), data = regresion)
modelo_5 <- lm(Confirmados ~ poly(residential_percent_change_from_baseline, 5), data = regresion)

anova(modelo_1, modelo_2, modelo_3, modelo_4, modelo_5)
## Analysis of Variance Table
## 
## Model 1: Confirmados ~ residential_percent_change_from_baseline
## Model 2: Confirmados ~ poly(residential_percent_change_from_baseline, 
##     2)
## Model 3: Confirmados ~ poly(residential_percent_change_from_baseline, 
##     3)
## Model 4: Confirmados ~ poly(residential_percent_change_from_baseline, 
##     4)
## Model 5: Confirmados ~ poly(residential_percent_change_from_baseline, 
##     5)
##   Res.Df     RSS Df Sum of Sq       F    Pr(>F)    
## 1    319 6420434                                   
## 2    318 5207328  1   1213105 76.9038 < 2.2e-16 ***
## 3    317 5063570  1    143758  9.1134  0.002745 ** 
## 4    316 5011088  1     52483  3.3271  0.069095 .  
## 5    315 4968908  1     42180  2.6739  0.103002    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

According to the sequential ANOVA, the cubic (degree-3) model is the best: the quadratic and cubic terms are clearly significant, while the degree-4 term is only marginal (p ≈ 0.069) and the degree-5 term is not significant (p ≈ 0.103).
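As a sanity check, each F statistic in the table can be recomputed by hand: the extra sum of squares gained by adding one degree, divided by the residual mean square of the largest model (degree 5, 315 residual df). A short sketch for the degree-4 row, using the rounded RSS values printed above:

# Extra sum of squares of the degree-4 model over the degree-3 model,
# scaled by the residual mean square of the degree-5 model.
(5063570 - 5011088) / (4968908 / 315) # approx. 3.33, matching the F column up to rounding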

Model comparison via cross-validation

cv_MSE_k10 <- rep(NA,10)
for (i in 1:10) {
  modelo <- glm(Confirmados ~ poly(residential_percent_change_from_baseline, i), data = regresion)
  set.seed(17)
  cv_MSE_k10[i] <- cv.glm(data = regresion, glmfit = modelo, K = 10)$delta[1]
}
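To read the result off programmatically rather than only from the plot below, the degree with the smallest estimated test MSE can be extracted directly; a minimal sketch:

data.frame(polinomio = 1:10, cv_MSE = round(cv_MSE_k10, 1)) # CV error per degree
which.min(cv_MSE_k10)                                       # degree with the lowest estimated test MSE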

p4 <- ggplot(data = data.frame(polinomio = 1:10, cv_MSE = cv_MSE_k10),
             aes(x = polinomio, y = cv_MSE)) +
      geom_point(colour = "firebrick3") +
      geom_path() +
      scale_x_continuous(breaks = 1:10) +
      labs(title = "Test error ~ polynomial degree") +
      theme(panel.background = element_rect(fill = "gray98"),
            panel.grid.major = element_line(colour = "gray90"),
            plot.title = element_text(face = "bold"))
p4

10-fold cross-validation also points to the degree-3 polynomial as the best model.