Análisis de regresión no lineal
Librerias y Directorio de trabajo
setwd("~/R Scripts/Proyecto") # Directorio de trabajo.
library("pacman") # Importa biblioteca "pacman". Se utiliza para hacer una mejor gestión de paquetes.
p_load("base64enc", "htmltools", "mime", "xfun", "prettydoc","readr", "knitr","DT","dplyr", "ggplot2","plotly", "gganimate","gifski","scales", "readxl", "tidyverse","cluster", "factoextra","NbClust","tidyr", "hpackedbubble", "gridExtra", "psych", "GGally", "gridExtra", "lmtest", "corrplot", "car", "boot") # Paquetes necesarios para la elaboración.Descarga datos
xfun::embed_file("confirmadosVShogar.csv")Download confirmadosVShogar.csv
regresion <- read.csv("confirmadosVShogar.csv")
#poly(regresion$Confirmados, degree = 4, raw = TRUE)
modelo_poli4 <- lm(Confirmados ~ poly(regresion$residential_percent_change_from_baseline, 4), data = regresion)
summary(modelo_poli4)##
## Call:
## lm(formula = Confirmados ~ poly(regresion$residential_percent_change_from_baseline,
## 4), data = regresion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -240.73 -77.13 -9.25 62.44 385.55
##
## Coefficients:
## Estimate
## (Intercept) 176.221
## poly(regresion$residential_percent_change_from_baseline, 4)1 706.136
## poly(regresion$residential_percent_change_from_baseline, 4)2 -1101.411
## poly(regresion$residential_percent_change_from_baseline, 4)3 -379.155
## poly(regresion$residential_percent_change_from_baseline, 4)4 229.091
## Std. Error t value
## (Intercept) 7.029 25.072
## poly(regresion$residential_percent_change_from_baseline, 4)1 125.928 5.607
## poly(regresion$residential_percent_change_from_baseline, 4)2 125.928 -8.746
## poly(regresion$residential_percent_change_from_baseline, 4)3 125.928 -3.011
## poly(regresion$residential_percent_change_from_baseline, 4)4 125.928 1.819
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## poly(regresion$residential_percent_change_from_baseline, 4)1 4.48e-08 ***
## poly(regresion$residential_percent_change_from_baseline, 4)2 < 2e-16 ***
## poly(regresion$residential_percent_change_from_baseline, 4)3 0.00281 **
## poly(regresion$residential_percent_change_from_baseline, 4)4 0.06982 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 125.9 on 316 degrees of freedom
## Multiple R-squared: 0.2758, Adjusted R-squared: 0.2666
## F-statistic: 30.08 on 4 and 316 DF, p-value: < 2.2e-16
# INTERPOLACIÓN DE PUNTOS DENTRO DEL RANGO DEL PREDICTOR
# -----------------------------------------------------------------------------
limites <- range(regresion$residential_percent_change_from_baseline)
nuevos_puntos <- seq(from = limites[1], to = limites[2], by = 1)
nuevos_puntos <- data.frame(residential_percent_change_from_baseline = nuevos_puntos)
# PREDICCIÓN DE LA VARIABLE RESPUESTA Y DEL ERROR ESTÁNDAR
# -----------------------------------------------------------------------------
predicciones <- predict(modelo_poli4, newdata = nuevos_puntos, se.fit = TRUE,
level = 0.95)## Warning: 'newdata' had 31 rows but variables found have 321 rows
# CÁLCULO DEL INTERVALO DE CONFIANZA SUPERIOR E INFERIOR 95%
# -----------------------------------------------------------------------------
intervalo_conf <- data.frame(inferior = predicciones$fit -
1.96*predicciones$se.fit,
superior = predicciones$fit +
1.96*predicciones$se.fit)
attach(regresion)
plot(x = regresion$residential_percent_change_from_baseline, y = regresion$Confirmados, pch = 20, col = "darkgrey")
title("Polinomio de grado 4: Confirmados ~ Hogar")
points(x = regresion$residential_percent_change_from_baseline, predicciones$fit, col = "red", pch = 20)
points(x = regresion$residential_percent_change_from_baseline, intervalo_conf$inferior, col = "blue", pch = 4)
points(x = regresion$residential_percent_change_from_baseline, intervalo_conf$superior, col = "blue", pch = 4)plot(x = regresion$residential_percent_change_from_baseline, y = regresion$Confirmados, pch = 20, col = "darkgrey")
title("Polinomio de grado 4: Confirmados ~ Hogar")
lines(x = regresion$residential_percent_change_from_baseline, predicciones$fit, col = "red", lwd = 2)
lines(x = regresion$residential_percent_change_from_baseline, intervalo_conf$inferior, col = "blue", lwd = 2)
lines(x = regresion$residential_percent_change_from_baseline, intervalo_conf$superior, col = "blue", lwd = 2)ggplot(data = regresion, aes(x = residential_percent_change_from_baseline, y = Confirmados)) +
geom_point(color = "grey30", alpha = 0.3) +
geom_smooth(method = "lm", formula = y ~ poly(x, 4), color = "red") +
labs(title = "Polinomio de grado 4: Confirmados ~ Hogar") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))# Se ajustan modelos polinómicos de grado 1 a 5
# ----------------------------------------------------------------------------
modelo_1 <- lm(Confirmados ~ residential_percent_change_from_baseline, data = regresion)
modelo_2 <- lm(Confirmados ~ poly(residential_percent_change_from_baseline, 2), data = regresion)
modelo_3 <- lm(Confirmados ~ poly(residential_percent_change_from_baseline, 3), data = regresion)
modelo_4 <- lm(Confirmados ~ poly(residential_percent_change_from_baseline, 4), data = regresion)
modelo_5 <- lm(Confirmados ~ poly(residential_percent_change_from_baseline, 5), data = regresion)
anova(modelo_1, modelo_2, modelo_3, modelo_4, modelo_5)## Analysis of Variance Table
##
## Model 1: Confirmados ~ residential_percent_change_from_baseline
## Model 2: Confirmados ~ poly(residential_percent_change_from_baseline,
## 2)
## Model 3: Confirmados ~ poly(residential_percent_change_from_baseline,
## 3)
## Model 4: Confirmados ~ poly(residential_percent_change_from_baseline,
## 4)
## Model 5: Confirmados ~ poly(residential_percent_change_from_baseline,
## 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 319 6420434
## 2 318 5207328 1 1213105 76.9038 < 2.2e-16 ***
## 3 317 5063570 1 143758 9.1134 0.002745 **
## 4 316 5011088 1 52483 3.3271 0.069095 .
## 5 315 4968908 1 42180 2.6739 0.103002
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
El cúbico es el mejor
Comparacion de modelos por cross validation
cv_MSE_k10 <- rep(NA,10)
for (i in 1:10) {
modelo <- glm(Confirmados ~ poly(residential_percent_change_from_baseline, i), data = regresion)
set.seed(17)
cv_MSE_k10[i] <- cv.glm(data = regresion, glmfit = modelo, K = 10)$delta[1]
}
p4 <- ggplot(data = data.frame(polinomio = 1:10, cv_MSE = cv_MSE_k10),
aes(x = polinomio, y = cv_MSE)) +
geom_point(colour = c("firebrick3")) +
geom_path()
p4 <- p4 + theme(panel.grid.major = element_line(colour = 'gray90'))
p4 <- p4 + theme(plot.title = element_text(face = 'bold'))
p4 <- p4 + theme(panel.background = element_rect(fill = 'gray98'))
p4 <- p4 + labs(title = 'Test Error ~ Grado del polinomio')
p4 <- p4 + scale_x_continuous(breaks = 1:10)
p4 El método de cross-validation k=10 también indica que el mejor modelo es el que emplea un polinomio de grado 3.