#Dafne Arenas,Michell Alppelgren, Mario Castillo, Vivi-Ann Flores, Macarena Baquedano, Sofia Aguilera, Nicolas Bachelet, Angie De La Torre, Laura Zeballos
#Cargar paquetes
library(car)
## Loading required package: carData
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(AER)
## Loading required package: sandwich
## Loading required package: survival
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(haven)
#Cargar datos
Casen_2017 <- read_dta("Casen 2017.dta")
#Crear muestra
set.seed(123) #Establecer semilla para reproducibilidad
muestra <- Casen_2017 %>%
sample_n(5000, replace = FALSE)
#Seleccionar variables
datos <- select(muestra, edad, region, ecivil, sexo, e6a)
names(datos) <- c("Edad", "Region", "EstadoCivil", "Sexo", "Educacion")
#Convertir variables a factores
datos$Region <- factor(datos$Region)
datos$EstadoCivil <- factor(datos$EstadoCivil)
datos$Sexo <- factor(datos$Sexo)
#Crear variable categórica de edad
datos$Edad_cat <- cut(datos$Edad, breaks = 3, labels = c("joven", "adulto", "mayor"))
datos$Edad_cat <- factor(datos$Edad_cat)
#Crear variable categórica de postgrado
datos$Postgrado <- ifelse(datos$Educacion == 17, 1, 0)
datos$Postgrado <- factor(datos$Postgrado)
#Ajustar modelo logit
modelo_logit <- glm(Postgrado ~ Edad_cat + Sexo + Region + EstadoCivil , family = binomial(link = "logit"), data = datos)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#Ajustar modelo probit
modelo_probit <- glm(Postgrado ~ Edad_cat + Sexo + Region + EstadoCivil , family = binomial(link = "probit"), data = datos)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#Resumen de los modelos
summary(modelo_logit)
##
## Call:
## glm(formula = Postgrado ~ Edad_cat + Sexo + Region + EstadoCivil,
## family = binomial(link = "logit"), data = datos)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4289 -0.1483 -0.0985 -0.0645 3.7433
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.54968 1166.20437 -0.018 0.985257
## Edad_catadulto 1.60258 0.42370 3.782 0.000155 ***
## Edad_catmayor 0.96935 0.66075 1.467 0.142359
## Sexo2 -0.14376 0.29227 -0.492 0.622804
## Region2 16.39789 1166.20443 0.014 0.988781
## Region3 15.20872 1166.20471 0.013 0.989595
## Region4 -0.14701 1592.45054 0.000 0.999926
## Region5 15.39724 1166.20443 0.013 0.989466
## Region6 14.49950 1166.20471 0.012 0.990080
## Region7 15.27265 1166.20450 0.013 0.989551
## Region8 15.33099 1166.20443 0.013 0.989511
## Region9 16.04415 1166.20439 0.014 0.989023
## Region10 15.97278 1166.20439 0.014 0.989072
## Region11 15.53666 1166.20472 0.013 0.989371
## Region12 -0.10891 1928.68421 0.000 0.999955
## Region13 16.85674 1166.20430 0.014 0.988467
## Region14 15.58975 1166.20450 0.013 0.989334
## Region15 15.20549 1166.20471 0.013 0.989597
## Region16 -0.05376 1691.13224 0.000 0.999975
## EstadoCivil2 0.75036 0.38755 1.936 0.052845 .
## EstadoCivil3 -15.41278 6688.72284 -0.002 0.998161
## EstadoCivil4 -16.56252 9545.72375 -0.002 0.998616
## EstadoCivil5 -0.97063 1.03537 -0.937 0.348521
## EstadoCivil6 0.58947 0.76983 0.766 0.443845
## EstadoCivil7 -0.78363 1.06939 -0.733 0.463689
## EstadoCivil8 0.18853 0.40500 0.466 0.641573
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 560.02 on 4999 degrees of freedom
## Residual deviance: 491.26 on 4974 degrees of freedom
## AIC: 543.26
##
## Number of Fisher Scoring iterations: 19
summary(modelo_probit)
##
## Call:
## glm(formula = Postgrado ~ Edad_cat + Sexo + Region + EstadoCivil,
## family = binomial(link = "probit"), data = datos)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3984 -0.1551 -0.0998 -0.0593 3.7765
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.612e+00 2.641e+02 -0.025 0.980026
## Edad_catadulto 6.180e-01 1.626e-01 3.800 0.000144 ***
## Edad_catmayor 3.778e-01 2.485e-01 1.520 0.128397
## Sexo2 -5.949e-02 1.166e-01 -0.510 0.609962
## Region2 4.086e+00 2.641e+02 0.015 0.987654
## Region3 3.611e+00 2.641e+02 0.014 0.989089
## Region4 -4.092e-02 3.609e+02 0.000 0.999910
## Region5 3.702e+00 2.641e+02 0.014 0.988816
## Region6 3.434e+00 2.641e+02 0.013 0.989624
## Region7 3.658e+00 2.641e+02 0.014 0.988949
## Region8 3.676e+00 2.641e+02 0.014 0.988895
## Region9 3.953e+00 2.641e+02 0.015 0.988057
## Region10 3.881e+00 2.641e+02 0.015 0.988276
## Region11 3.744e+00 2.641e+02 0.014 0.988688
## Region12 -8.751e-03 4.377e+02 0.000 0.999984
## Region13 4.272e+00 2.641e+02 0.016 0.987093
## Region14 3.777e+00 2.641e+02 0.014 0.988588
## Region15 3.612e+00 2.641e+02 0.014 0.989087
## Region16 3.596e-04 3.836e+02 0.000 0.999999
## EstadoCivil2 2.913e-01 1.653e-01 1.763 0.077937 .
## EstadoCivil3 -3.784e+00 1.498e+03 -0.003 0.997985
## EstadoCivil4 -4.160e+00 2.161e+03 -0.002 0.998464
## EstadoCivil5 -3.403e-01 3.658e-01 -0.930 0.352270
## EstadoCivil6 2.256e-01 3.420e-01 0.660 0.509418
## EstadoCivil7 -2.488e-01 3.704e-01 -0.672 0.501674
## EstadoCivil8 8.094e-02 1.631e-01 0.496 0.619771
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 560.02 on 4999 degrees of freedom
## Residual deviance: 492.53 on 4974 degrees of freedom
## AIC: 544.53
##
## Number of Fisher Scoring iterations: 19
#Comprobar multicolinealidad
vif(modelo_logit)
## GVIF Df GVIF^(1/(2*Df))
## Edad_cat 1.524165 2 1.111112
## Sexo 1.035238 1 1.017466
## Region 1.027261 15 1.000897
## EstadoCivil 1.576147 7 1.033033
vif(modelo_probit)
## GVIF Df GVIF^(1/(2*Df))
## Edad_cat 1.684715 2 1.139283
## Sexo 1.032896 1 1.016315
## Region 1.042514 15 1.001389
## EstadoCivil 1.729954 7 1.039926
#Comprobar normalidad de los residuos
residuos_logit <- residuals(modelo_logit, type = "pearson")
residuos_probit <- residuals(modelo_probit, type = "pearson")
shapiro.test(residuos_logit)
##
## Shapiro-Wilk normality test
##
## data: residuos_logit
## W = 0.087601, p-value < 2.2e-16
shapiro.test(residuos_probit)
##
## Shapiro-Wilk normality test
##
## data: residuos_probit
## W = 0.086257, p-value < 2.2e-16
#Comprobar homocedasticidad
bptest(modelo_logit)
##
## studentized Breusch-Pagan test
##
## data: modelo_logit
## BP = 68.898, df = 25, p-value = 5.602e-06
bptest(modelo_probit)
##
## studentized Breusch-Pagan test
##
## data: modelo_probit
## BP = 68.898, df = 25, p-value = 5.602e-06
#Instalar y cargar paquete pROC
if (!requireNamespace("pROC", quietly = TRUE)) {
install.packages("pROC")
}
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#Evaluar bondad de ajuste con curva ROC
roc_logit <- roc(datos$Postgrado, fitted(modelo_logit))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_probit <- roc(datos$Postgrado, fitted(modelo_probit))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#Mostrar AUC para cada modelo
cat("AUC del modelo logit:", auc(roc_logit), "\n")
## AUC del modelo logit: 0.8020323
cat("AUC del modelo probit:", auc(roc_probit), "\n")
## AUC del modelo probit: 0.8000525
# Graficar las curvas ROC
plot(roc_logit, main = "Curvas ROC de los modelos logit y probit")
plot(roc_probit, add = TRUE, print.thres = FALSE, col = "blue")
legend("bottomright", legend = c("Logit", "Probit"), col = c("black", "blue"), lwd = 2)

# Comparación de modelos
logLik(modelo_logit)
## 'log Lik.' -245.6298 (df=26)
logLik(modelo_probit)
## 'log Lik.' -246.2667 (df=26)
lrtest(modelo_logit, modelo_probit)
## Likelihood ratio test
##
## Model 1: Postgrado ~ Edad_cat + Sexo + Region + EstadoCivil
## Model 2: Postgrado ~ Edad_cat + Sexo + Region + EstadoCivil
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 26 -245.63
## 2 26 -246.27 0 1.2738 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1