# ==========================================================
# MODELO DE ASISTENCIA ESCOLAR – CEDE
# RStudio | Base R
# ==========================================================

# ----------------------------------------------------------
# 1. LIBRERÍAS
# ----------------------------------------------------------
library(haven)
## Warning: package 'haven' was built under R version 4.5.2
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.2
## Cargando paquete requerido: zoo
## 
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# ----------------------------------------------------------
# 2. CARGAR BASES
# ----------------------------------------------------------
buen_gobierno   <- read_dta("PANEL_BUEN_GOBIERNO_2021.dta")
caracteristicas <- read_dta("PANEL_CARACTERISTICAS_GENERALES_2022.dta")
educacion       <- read_dta("PANEL_EDUCACION_2021.dta")

# ----------------------------------------------------------
# 3. SELECCIÓN DE VARIABLES
# ----------------------------------------------------------

cg_limpia <- caracteristicas[, c(
  "codmpio",
  "ano",
  "IPM",
  "gini",
  "indrural",
  "disbogota",
  "pobreza"
)]

ed_limpia <- educacion[, c(
  "codmpio",
  "ano",
  "t_establ_o",
  "t_establ",
  "per_alfa",
  "ind_alfa",
  "asistesc_5_a_24"
)]

# ----------------------------------------------------------
# 4. UNIR BASES
# ----------------------------------------------------------
datos_unidos <- merge(
  cg_limpia,
  ed_limpia,
  by = c("codmpio", "ano"),
  all = TRUE
)

# ----------------------------------------------------------
# 5. LIMPIEZA DE DATOS
# ----------------------------------------------------------
datos_unidos <- na.omit(datos_unidos)

vars_num <- c(
  "IPM", "gini", "indrural", "disbogota", "pobreza",
  "t_establ_o", "t_establ", "per_alfa",
  "ind_alfa", "asistesc_5_a_24"
)

datos_unidos[vars_num] <- lapply(datos_unidos[vars_num], as.numeric)

cat("Observaciones:", nrow(datos_unidos), "\n")
## Observaciones: 984
cat("Variables:", ncol(datos_unidos), "\n\n")
## Variables: 12
# ----------------------------------------------------------
# 6. BASE PARA LOS MODELOS
# ----------------------------------------------------------

y_var  <- "asistesc_5_a_24"
x_vars <- c("indrural", "gini", "IPM", "disbogota")

df <- datos_unidos[, c(y_var, x_vars)]
df <- na.omit(df)

# ----------------------------------------------------------
# 7. MODELO MCO (NIVEL–NIVEL)
# ----------------------------------------------------------

fml_ols <- as.formula(
  paste(y_var, "~", paste(x_vars, collapse = " + "))
)

m_ols <- lm(fml_ols, data = df)
summary(m_ols)
## 
## Call:
## lm(formula = fml_ols, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.099  -3.475   0.318   4.094  15.403 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 105.725141   2.875897  36.762  < 2e-16 ***
## indrural     -5.374865   1.181544  -4.549 6.07e-06 ***
## gini        -64.841139   6.794240  -9.544  < 2e-16 ***
## IPM          -0.188010   0.020039  -9.382  < 2e-16 ***
## disbogota     0.002208   0.001510   1.462    0.144    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.566 on 979 degrees of freedom
## Multiple R-squared:  0.3551, Adjusted R-squared:  0.3525 
## F-statistic: 134.8 on 4 and 979 DF,  p-value: < 2.2e-16
# ----------------------------------------------------------
# 8. MODELO LOG–LIN
# ----------------------------------------------------------

df$log_asist <- log(df[[y_var]])

fml_loglin <- as.formula(
  paste("log_asist ~", paste(x_vars, collapse = " + "))
)

m_loglin <- lm(fml_loglin, data = df)
summary(m_loglin)
## 
## Call:
## lm(formula = fml_loglin, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6419 -0.0530  0.0171  0.0792  0.2760 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.985e+00  8.308e-02  60.001  < 2e-16 ***
## indrural    -7.436e-02  3.413e-02  -2.179   0.0296 *  
## gini        -1.278e+00  1.963e-01  -6.510 1.20e-10 ***
## IPM         -4.148e-03  5.789e-04  -7.165 1.53e-12 ***
## disbogota    5.792e-05  4.361e-05   1.328   0.1844    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1897 on 979 degrees of freedom
## Multiple R-squared:  0.2067, Adjusted R-squared:  0.2035 
## F-statistic: 63.79 on 4 and 979 DF,  p-value: < 2.2e-16
# ----------------------------------------------------------
# 9. DIAGNÓSTICOS – FUNCIÓN GENERAL
# ----------------------------------------------------------

diagnosticos <- function(modelo, nombre) {
  
  cat("\n==============================\n")
  cat("DIAGNÓSTICOS:", nombre, "\n")
  cat("==============================\n")
  
  res <- residuals(modelo)
  fit <- fitted(modelo)
  n   <- length(res)
  
  # Linealidad
  par(mfrow = c(2,2))
  for (v in colnames(model.matrix(modelo))[-1]) {
    plot(df[[v]], res,
         xlab = v, ylab = "Residuos",
         main = paste("Residuos vs", v))
    abline(h = 0, col = "red", lty = 2)
  }
  par(mfrow = c(1,1))
  
  # Heterocedasticidad (Breusch–Pagan)
  u2 <- res^2
  m_bp <- lm(u2 ~ fit)
  LM <- n * summary(m_bp)$r.squared
  pval <- 1 - pchisq(LM, df = length(coef(modelo)) - 1)
  
  cat("\nBreusch–Pagan\n")
  cat("LM =", LM, "\n")
  cat("p-value =", pval, "\n")
  
  plot(fit, res,
       xlab = "Valores ajustados",
       ylab = "Residuos",
       main = "Residuos vs Ajustados")
  abline(h = 0, col = "red", lty = 2)
  
  # Normalidad
  print(shapiro.test(res))
  
  qqnorm(res); qqline(res, col = "red")
  
  # Multicolinealidad
  X <- model.matrix(modelo)[, -1]
  VIF <- diag(solve(cor(X)))
  print(VIF)
  
  # RESET
  print(resettest(modelo, power = 2:3, type = "fitted"))
  
  # Autocorrelación
  print(dwtest(modelo))
}

# ----------------------------------------------------------
# 10. EJECUTAR DIAGNÓSTICOS
# ----------------------------------------------------------

diagnosticos(m_ols, "MCO nivel–nivel")
## 
## ==============================
## DIAGNÓSTICOS: MCO nivel–nivel 
## ==============================

## 
## Breusch–Pagan
## LM = 36.44478 
## p-value = 2.343822e-07

## 
##  Shapiro-Wilk normality test
## 
## data:  res
## W = 0.91166, p-value < 2.2e-16

##  indrural      gini       IPM disbogota 
##  1.812836  1.175749  2.333562  1.693738 
## 
##  RESET test
## 
## data:  modelo
## RESET = 39.383, df1 = 2, df2 = 977, p-value < 2.2e-16
## 
## 
##  Durbin-Watson test
## 
## data:  modelo
## DW = 1.5984, p-value = 9.477e-11
## alternative hypothesis: true autocorrelation is greater than 0
diagnosticos(m_loglin, "LOG–LIN")
## 
## ==============================
## DIAGNÓSTICOS: LOG–LIN 
## ==============================

## 
## Breusch–Pagan
## LM = 11.72503 
## p-value = 0.01951759

## 
##  Shapiro-Wilk normality test
## 
## data:  res
## W = 0.48468, p-value < 2.2e-16

##  indrural      gini       IPM disbogota 
##  1.812836  1.175749  2.333562  1.693738 
## 
##  RESET test
## 
## data:  modelo
## RESET = 42.482, df1 = 2, df2 = 977, p-value < 2.2e-16
## 
## 
##  Durbin-Watson test
## 
## data:  modelo
## DW = 1.8172, p-value = 0.001682
## alternative hypothesis: true autocorrelation is greater than 0
# ----------------------------------------------------------
# 11. GRÁFICOS OBSERVADO vs AJUSTADO
# ----------------------------------------------------------

par(mfrow = c(1,2))

plot(df[[y_var]], fitted(m_ols),
     xlab = "Asistencia observada",
     ylab = "Asistencia ajustada",
     main = "MCO")
abline(0,1,col="red",lty=2)

plot(df$log_asist, fitted(m_loglin),
     xlab = "Log(asistencia) observada",
     ylab = "Log(asistencia) ajustada",
     main = "LOG–LIN")
abline(0,1,col="red",lty=2)

par(mfrow = c(1,1))


# ----------------------------------------------------------
# 12. CRITERIOS DE INFORMACIÓN (AIC y BIC)
# ----------------------------------------------------------

cat("\n==============================\n")
## 
## ==============================
cat("CRITERIOS DE INFORMACIÓN\n")
## CRITERIOS DE INFORMACIÓN
cat("==============================\n")
## ==============================
# AIC
AIC_ols    <- AIC(m_ols)
AIC_loglin <- AIC(m_loglin)

# BIC
BIC_ols    <- BIC(m_ols)
BIC_loglin <- BIC(m_loglin)

# Mostrar resultados
cat("\nAIC\n")
## 
## AIC
cat("MCO nivel–nivel :", AIC_ols, "\n")
## MCO nivel–nivel : 6503.062
cat("LOG–LIN         :", AIC_loglin, "\n")
## LOG–LIN         : -472.216
cat("\nBIC\n")
## 
## BIC
cat("MCO nivel–nivel :", BIC_ols, "\n")
## MCO nivel–nivel : 6532.412
cat("LOG–LIN         :", BIC_loglin, "\n")
## LOG–LIN         : -442.8662
# Comparación automática
cat("\nMODELO PREFERIDO SEGÚN:\n")
## 
## MODELO PREFERIDO SEGÚN:
if (AIC_loglin < AIC_ols) {
  cat("- AIC: LOG–LIN\n")
} else {
  cat("- AIC: MCO nivel–nivel\n")
}
## - AIC: LOG–LIN
if (BIC_loglin < BIC_ols) {
  cat("- BIC: LOG–LIN\n")
} else {
  cat("- BIC: MCO nivel–nivel\n")
}
## - BIC: LOG–LIN
cat("\nCÓDIGO EJECUTADO CORRECTAMENTE\n")
## 
## CÓDIGO EJECUTADO CORRECTAMENTE