Tema: Educación en México
Importación de datos
setwd("~/Estadistica")
library(DT)
library(readxl)
tasa <- read.csv("EducacionTasa.csv")Tabla de datos
datatable(tasa)El análisis se hará para la variable de primaria, ya que es la que tiene un comportamiento muy distinto a las demás
poly(tasa$primaria, degree = 4, raw = TRUE, simple = TRUE)[1:5,]## 1 2 3 4
## [1,] 97.55905 9517.768 928544.5 90587917
## [2,] 97.96440 9597.024 940166.8 92102875
## [3,] 98.10871 9625.319 944327.6 92646764
## [4,] 98.08837 9621.329 943740.5 92569964
## [5,] 98.36495 9675.663 951746.1 93618458
poly(tasa$primaria, degree = 4, raw = FALSE, simple = TRUE)[1:5,]## 1 2 3 4
## [1,] -0.24558354 0.268116662 -0.1861602 0.02624603
## [2,] -0.16052821 0.054698973 0.1020358 -0.17475748
## [3,] -0.13024823 -0.005099258 0.1437828 -0.14178987
## [4,] -0.13451581 0.002814262 0.1395526 -0.14840095
## [5,] -0.07648127 -0.090345462 0.1565709 -0.02714660
Modelo polinómico de grado 4
modelo_poli4 <- lm(periodo ~ poly(primaria, 4), data = tasa)
summary(modelo_poli4)##
## Call:
## lm(formula = periodo ~ poly(primaria, 4), data = tasa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.275 -6.839 -1.188 5.439 13.735
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2003.500 1.491 1343.714 <2e-16 ***
## poly(primaria, 4)1 18.369 7.890 2.328 0.0291 *
## poly(primaria, 4)2 -1.071 7.890 -0.136 0.8932
## poly(primaria, 4)3 -2.062 7.890 -0.261 0.7962
## poly(primaria, 4)4 7.244 7.890 0.918 0.3681
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.89 on 23 degrees of freedom
## Multiple R-squared: 0.2164, Adjusted R-squared: 0.08008
## F-statistic: 1.588 on 4 and 23 DF, p-value: 0.2113
- Representación gráfica
tasa = 2003.500 + 18.369Primaria − 1.071Primaria^2 - 2.062Primaria^3 + 7.244Primaria^4
# INTERPOLACIÓN DE PUNTOS DENTRO DEL RANGO DEL PREDICTOR
limites <- range(tasa$primaria)
nuevos_puntos <- seq(from = limites[1], to = limites[2], by = 1)
nuevos_puntos <- data.frame(primaria = nuevos_puntos)
# PREDICCIÓN DE LA VARIABLE RESPUESTA Y DEL ERROR ESTÁNDAR
predicciones <- predict(modelo_poli4, newdata = nuevos_puntos, se.fit = TRUE,
level = 0.95)
intervalo_conf <- data.frame(inferior = predicciones$fit -
1.96*predicciones$se.fit,
superior = predicciones$fit +
1.96*predicciones$se.fit)
attach(tasa)
plot(x = primaria, y = periodo, pch = 20, col = "darkgrey")
title("Polinomio de grado 4: primaria ~ periodo")
points(x = nuevos_puntos$primaria, predicciones$fit, col = "red", pch = 20)
points(x = nuevos_puntos$primaria, intervalo_conf$inferior, col = "blue", pch = 4)
points(x = nuevos_puntos$primaria, intervalo_conf$superior, col = "blue", pch = 4)attach(tasa)## The following objects are masked from tasa (pos = 3):
##
## periodo, preescolar, primaria, secundaria
plot(x = primaria, y = periodo, pch = 20, col = "darkgrey")
title("Polinomio de grado 4: primaria ~ periodo")
lines(x = nuevos_puntos$primaria, predicciones$fit, col = "red", lwd = 2)
lines(x = nuevos_puntos$primaria, intervalo_conf$inferior, col = "blue", lwd = 2)
lines(x = nuevos_puntos$primaria, intervalo_conf$superior, col = "blue", lwd = 2)library(ggplot2)
ggplot(data = tasa, aes(x = primaria, y = periodo)) +
geom_point(color = "red", alpha = 0.3) +
geom_smooth(method = "lm", formula = y ~ poly(x, 4), color = "red") +
labs(title = "Polinomio de grado 4: Primaria ~ Periodo") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) ## Comparación de modelos por contraste de hipótesis ANOVA
modelo_1 <- lm(periodo ~ primaria, data = tasa)
modelo_2 <- lm(periodo ~ poly(primaria, 2), data = tasa)
modelo_3 <- lm(periodo ~ poly(primaria, 3), data = tasa)
modelo_4 <- lm(periodo ~ poly(primaria, 4), data = tasa)
modelo_5 <- lm(periodo ~ poly(primaria, 5), data = tasa)
anova(modelo_1, modelo_2, modelo_3, modelo_4, modelo_5)## Analysis of Variance Table
##
## Model 1: periodo ~ primaria
## Model 2: periodo ~ poly(primaria, 2)
## Model 3: periodo ~ poly(primaria, 3)
## Model 4: periodo ~ poly(primaria, 4)
## Model 5: periodo ~ poly(primaria, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 26 1489.6
## 2 25 1488.4 1 1.148 0.0191 0.8914
## 3 24 1484.2 1 4.251 0.0706 0.7929
## 4 23 1431.7 1 52.473 0.8721 0.3605
## 5 22 1323.7 1 107.976 1.7945 0.1940
Comparación de modelos por Cross-Validation
library(boot)
cv_MSE_k10 <- rep(NA,10)
for (i in 1:10) {
modelo <- glm(periodo ~ poly(primaria, i), data = tasa)
set.seed(17)
cv_MSE_k10[i] <- cv.glm(data = tasa, glmfit = modelo, K = 10)$delta[1]
}## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000
## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000
## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000
## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000
## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000
## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000
## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000
## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000
## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000
## Warning in cv.glm(data = tasa, glmfit = modelo, K = 10): 'K' has been set to
## 9.000000
p4 <- ggplot(data = data.frame(polinomio = 1:10, cv_MSE = cv_MSE_k10),
aes(x = polinomio, y = cv_MSE)) +
geom_point(colour = c("firebrick3")) +
geom_path()
p4 <- p4 + theme(panel.grid.major = element_line(colour = 'gray90'))
p4 <- p4 + theme(plot.title = element_text(face = 'bold'))
p4 <- p4 + theme(panel.background = element_rect(fill = 'gray98'))
p4 <- p4 + labs(title = 'Test Error ~ Grado del polinomio')
p4 <- p4 + scale_x_continuous(breaks = 1:10)
p4