U2A5: Regresión polinómica

EQUIPO 6: Angélica Payán Serna, Karen Gutiérrez Velásquez y Andrea Higuera Chávez

18/05/2021

Tema: Educación en México

Descarga de este código

xfun::embed_file("U2A5.Rmd")

Download U2A5.Rmd

Importación de datos

setwd("~/Estadistica")
library(DT)
library(readxl)
tasa <- read.csv("EducacionTasa.csv")

Tabla de datos

datatable(tasa)

El análisis se hará para la variable de primaria, ya que es la que tiene un comportamiento muy distinto a las demás

poly(tasa$primaria, degree = 4, raw = TRUE, simple = TRUE)[1:5,]
##             1        2        3        4
## [1,] 97.55905 9517.768 928544.5 90587917
## [2,] 97.96440 9597.024 940166.8 92102875
## [3,] 98.10871 9625.319 944327.6 92646764
## [4,] 98.08837 9621.329 943740.5 92569964
## [5,] 98.36495 9675.663 951746.1 93618458
poly(tasa$primaria, degree = 4, raw = FALSE, simple = TRUE)[1:5,]
##                1            2          3           4
## [1,] -0.24558354  0.268116662 -0.1861602  0.02624603
## [2,] -0.16052821  0.054698973  0.1020358 -0.17475748
## [3,] -0.13024823 -0.005099258  0.1437828 -0.14178987
## [4,] -0.13451581  0.002814262  0.1395526 -0.14840095
## [5,] -0.07648127 -0.090345462  0.1565709 -0.02714660

Modelo polinómico de grado 4

modelo_poli4 <- lm(periodo ~ poly(primaria, 4), data = tasa)
summary(modelo_poli4)
## 
## Call:
## lm(formula = periodo ~ poly(primaria, 4), data = tasa)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.275 -6.839 -1.188  5.439 13.735 
## 
## Coefficients:
##                    Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)        2003.500      1.491 1343.714   <2e-16 ***
## poly(primaria, 4)1   18.369      7.890    2.328   0.0291 *  
## poly(primaria, 4)2   -1.071      7.890   -0.136   0.8932    
## poly(primaria, 4)3   -2.062      7.890   -0.261   0.7962    
## poly(primaria, 4)4    7.244      7.890    0.918   0.3681    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.89 on 23 degrees of freedom
## Multiple R-squared:  0.2164, Adjusted R-squared:  0.08008 
## F-statistic: 1.588 on 4 and 23 DF,  p-value: 0.2113
  • Representación gráfica

tasa = 2003.500 + 18.369Primaria − 1.071Primaria^2 - 2.062Primaria^3 + 7.244Primaria^4

# INTERPOLACIÓN DE PUNTOS DENTRO DEL RANGO DEL PREDICTOR

limites <- range(tasa$primaria)
nuevos_puntos <- seq(from = limites[1], to = limites[2], by = 1)
nuevos_puntos <- data.frame(primaria = nuevos_puntos)

# PREDICCIÓN DE LA VARIABLE RESPUESTA Y DEL ERROR ESTÁNDAR

predicciones <- predict(modelo_poli4, newdata = nuevos_puntos, se.fit = TRUE,
                        level = 0.95)

intervalo_conf <- data.frame(inferior = predicciones$fit -
                                        1.96*predicciones$se.fit,
                             superior = predicciones$fit +
                                        1.96*predicciones$se.fit)

attach(tasa)
plot(x = primaria, y = periodo, pch = 20, col = "darkgrey")
title("Polinomio de grado 4: primaria ~ periodo")
points(x = nuevos_puntos$primaria, predicciones$fit, col = "red", pch = 20)
points(x = nuevos_puntos$primaria, intervalo_conf$inferior, col = "blue", pch = 4)
points(x = nuevos_puntos$primaria, intervalo_conf$superior, col = "blue", pch = 4)

attach(tasa)
## The following objects are masked from tasa (pos = 3):
## 
##     periodo, preescolar, primaria, secundaria
plot(x = primaria, y = periodo, pch = 20, col = "darkgrey")
title("Polinomio de grado 4: primaria ~ periodo")
lines(x = nuevos_puntos$primaria, predicciones$fit, col = "red", lwd = 2)
lines(x = nuevos_puntos$primaria, intervalo_conf$inferior, col = "blue", lwd = 2)
lines(x = nuevos_puntos$primaria, intervalo_conf$superior, col = "blue", lwd = 2)

library(ggplot2)
ggplot(data = tasa, aes(x = primaria, y = periodo)) +
  geom_point(color = "red", alpha = 0.3) + 
  geom_smooth(method = "lm", formula = y ~ poly(x, 4), color = "red") +
  labs(title = "Polinomio de grado 4: Primaria ~ Periodo") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

## Comparación de modelos por contraste de hipótesis ANOVA

modelo_1 <- lm(periodo ~ primaria, data = tasa)
modelo_2 <- lm(periodo ~ poly(primaria, 2), data = tasa)
modelo_3 <- lm(periodo ~ poly(primaria, 3), data = tasa)
modelo_4 <- lm(periodo ~ poly(primaria, 4), data = tasa)
modelo_5 <- lm(periodo ~ poly(primaria, 5), data = tasa)

anova(modelo_1, modelo_2, modelo_3, modelo_4, modelo_5)
## Analysis of Variance Table
## 
## Model 1: periodo ~ primaria
## Model 2: periodo ~ poly(primaria, 2)
## Model 3: periodo ~ poly(primaria, 3)
## Model 4: periodo ~ poly(primaria, 4)
## Model 5: periodo ~ poly(primaria, 5)
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     26 1489.6                           
## 2     25 1488.4  1     1.148 0.0191 0.8914
## 3     24 1484.2  1     4.251 0.0706 0.7929
## 4     23 1431.7  1    52.473 0.8721 0.3605
## 5     22 1323.7  1   107.976 1.7945 0.1940

Comparación de modelos por Cross-Validation

library(boot)
cv_MSE_k10 <- rep(NA,10)

for (i in 1:10) {
  modelo <- glm(periodo ~ poly(primaria, i), data = tasa)
  set.seed(17)
  cv_MSE_k10[i] <- cv.glm(data = tasa, glmfit = modelo, K = 10)$delta[1]
}
## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000

## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000

## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000

## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000

## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000

## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000

## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000

## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000

## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000

## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000
p4 <- ggplot(data = data.frame(polinomio = 1:10, cv_MSE = cv_MSE_k10),
             aes(x = polinomio, y = cv_MSE)) +
      geom_point(colour = c("firebrick3")) +
      geom_path()
p4 <- p4 + theme(panel.grid.major = element_line(colour  =  'gray90'))
p4 <- p4 + theme(plot.title = element_text(face  =  'bold'))
p4 <- p4 + theme(panel.background = element_rect(fill  =  'gray98'))
p4 <- p4 + labs(title  =  'Test Error ~ Grado del polinomio')
p4 <- p4 + scale_x_continuous(breaks = 1:10)
p4