#Dafne Arenas,Michell Alppelgren, Mario Castillo, Vivi-Ann Flores, Macarena Baquedano, Sofia Aguilera, Nicolas Bachelet, Angie De La Torre, Laura Zeballos
#Cargar paquetes
library(car)
## Loading required package: carData
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(AER)
## Loading required package: sandwich
## Loading required package: survival
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(haven)

#Cargar datos
Casen_2017 <- read_dta("Casen 2017.dta")

#Crear muestra
set.seed(123) #Establecer semilla para reproducibilidad
muestra <- Casen_2017 %>% 
  sample_n(5000, replace = FALSE)

#Seleccionar variables
datos <- select(muestra, edad, region, ecivil, sexo, e6a)
names(datos) <- c("Edad", "Region", "EstadoCivil", "Sexo", "Educacion")

#Convertir variables a factores
datos$Region <- factor(datos$Region)
datos$EstadoCivil <- factor(datos$EstadoCivil)
datos$Sexo <- factor(datos$Sexo)

#Crear variable categórica de edad
datos$Edad_cat <- cut(datos$Edad, breaks = 3, labels = c("joven", "adulto", "mayor"))
datos$Edad_cat <- factor(datos$Edad_cat)

#Crear variable categórica de postgrado
datos$Postgrado <- ifelse(datos$Educacion == 17, 1, 0)
datos$Postgrado <- factor(datos$Postgrado)

#Ajustar modelo logit
modelo_logit <- glm(Postgrado ~ Edad_cat + Sexo + Region + EstadoCivil , family = binomial(link = "logit"), data = datos)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#Ajustar modelo probit
modelo_probit <- glm(Postgrado ~ Edad_cat + Sexo + Region + EstadoCivil , family = binomial(link = "probit"), data = datos)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#Resumen de los modelos
summary(modelo_logit)
## 
## Call:
## glm(formula = Postgrado ~ Edad_cat + Sexo + Region + EstadoCivil, 
##     family = binomial(link = "logit"), data = datos)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4289  -0.1483  -0.0985  -0.0645   3.7433  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -21.54968 1166.20437  -0.018 0.985257    
## Edad_catadulto    1.60258    0.42370   3.782 0.000155 ***
## Edad_catmayor     0.96935    0.66075   1.467 0.142359    
## Sexo2            -0.14376    0.29227  -0.492 0.622804    
## Region2          16.39789 1166.20443   0.014 0.988781    
## Region3          15.20872 1166.20471   0.013 0.989595    
## Region4          -0.14701 1592.45054   0.000 0.999926    
## Region5          15.39724 1166.20443   0.013 0.989466    
## Region6          14.49950 1166.20471   0.012 0.990080    
## Region7          15.27265 1166.20450   0.013 0.989551    
## Region8          15.33099 1166.20443   0.013 0.989511    
## Region9          16.04415 1166.20439   0.014 0.989023    
## Region10         15.97278 1166.20439   0.014 0.989072    
## Region11         15.53666 1166.20472   0.013 0.989371    
## Region12         -0.10891 1928.68421   0.000 0.999955    
## Region13         16.85674 1166.20430   0.014 0.988467    
## Region14         15.58975 1166.20450   0.013 0.989334    
## Region15         15.20549 1166.20471   0.013 0.989597    
## Region16         -0.05376 1691.13224   0.000 0.999975    
## EstadoCivil2      0.75036    0.38755   1.936 0.052845 .  
## EstadoCivil3    -15.41278 6688.72284  -0.002 0.998161    
## EstadoCivil4    -16.56252 9545.72375  -0.002 0.998616    
## EstadoCivil5     -0.97063    1.03537  -0.937 0.348521    
## EstadoCivil6      0.58947    0.76983   0.766 0.443845    
## EstadoCivil7     -0.78363    1.06939  -0.733 0.463689    
## EstadoCivil8      0.18853    0.40500   0.466 0.641573    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 560.02  on 4999  degrees of freedom
## Residual deviance: 491.26  on 4974  degrees of freedom
## AIC: 543.26
## 
## Number of Fisher Scoring iterations: 19
summary(modelo_probit)
## 
## Call:
## glm(formula = Postgrado ~ Edad_cat + Sexo + Region + EstadoCivil, 
##     family = binomial(link = "probit"), data = datos)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3984  -0.1551  -0.0998  -0.0593   3.7765  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -6.612e+00  2.641e+02  -0.025 0.980026    
## Edad_catadulto  6.180e-01  1.626e-01   3.800 0.000144 ***
## Edad_catmayor   3.778e-01  2.485e-01   1.520 0.128397    
## Sexo2          -5.949e-02  1.166e-01  -0.510 0.609962    
## Region2         4.086e+00  2.641e+02   0.015 0.987654    
## Region3         3.611e+00  2.641e+02   0.014 0.989089    
## Region4        -4.092e-02  3.609e+02   0.000 0.999910    
## Region5         3.702e+00  2.641e+02   0.014 0.988816    
## Region6         3.434e+00  2.641e+02   0.013 0.989624    
## Region7         3.658e+00  2.641e+02   0.014 0.988949    
## Region8         3.676e+00  2.641e+02   0.014 0.988895    
## Region9         3.953e+00  2.641e+02   0.015 0.988057    
## Region10        3.881e+00  2.641e+02   0.015 0.988276    
## Region11        3.744e+00  2.641e+02   0.014 0.988688    
## Region12       -8.751e-03  4.377e+02   0.000 0.999984    
## Region13        4.272e+00  2.641e+02   0.016 0.987093    
## Region14        3.777e+00  2.641e+02   0.014 0.988588    
## Region15        3.612e+00  2.641e+02   0.014 0.989087    
## Region16        3.596e-04  3.836e+02   0.000 0.999999    
## EstadoCivil2    2.913e-01  1.653e-01   1.763 0.077937 .  
## EstadoCivil3   -3.784e+00  1.498e+03  -0.003 0.997985    
## EstadoCivil4   -4.160e+00  2.161e+03  -0.002 0.998464    
## EstadoCivil5   -3.403e-01  3.658e-01  -0.930 0.352270    
## EstadoCivil6    2.256e-01  3.420e-01   0.660 0.509418    
## EstadoCivil7   -2.488e-01  3.704e-01  -0.672 0.501674    
## EstadoCivil8    8.094e-02  1.631e-01   0.496 0.619771    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 560.02  on 4999  degrees of freedom
## Residual deviance: 492.53  on 4974  degrees of freedom
## AIC: 544.53
## 
## Number of Fisher Scoring iterations: 19
#Comprobar multicolinealidad
vif(modelo_logit)
##                 GVIF Df GVIF^(1/(2*Df))
## Edad_cat    1.524165  2        1.111112
## Sexo        1.035238  1        1.017466
## Region      1.027261 15        1.000897
## EstadoCivil 1.576147  7        1.033033
vif(modelo_probit)
##                 GVIF Df GVIF^(1/(2*Df))
## Edad_cat    1.684715  2        1.139283
## Sexo        1.032896  1        1.016315
## Region      1.042514 15        1.001389
## EstadoCivil 1.729954  7        1.039926
#Comprobar normalidad de los residuos
residuos_logit <- residuals(modelo_logit, type = "pearson")
residuos_probit <- residuals(modelo_probit, type = "pearson")
shapiro.test(residuos_logit)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuos_logit
## W = 0.087601, p-value < 2.2e-16
shapiro.test(residuos_probit)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuos_probit
## W = 0.086257, p-value < 2.2e-16
#Comprobar homocedasticidad
bptest(modelo_logit)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo_logit
## BP = 68.898, df = 25, p-value = 5.602e-06
bptest(modelo_probit)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo_probit
## BP = 68.898, df = 25, p-value = 5.602e-06
#Instalar y cargar paquete pROC
if (!requireNamespace("pROC", quietly = TRUE)) {
  install.packages("pROC")
}
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
#Evaluar bondad de ajuste con curva ROC
roc_logit <- roc(datos$Postgrado, fitted(modelo_logit))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_probit <- roc(datos$Postgrado, fitted(modelo_probit))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#Mostrar AUC para cada modelo
cat("AUC del modelo logit:", auc(roc_logit), "\n")
## AUC del modelo logit: 0.8020323
cat("AUC del modelo probit:", auc(roc_probit), "\n")
## AUC del modelo probit: 0.8000525
# Graficar las curvas ROC
plot(roc_logit, main = "Curvas ROC de los modelos logit y probit")
plot(roc_probit, add = TRUE, print.thres = FALSE, col = "blue")
legend("bottomright", legend = c("Logit", "Probit"), col = c("black", "blue"), lwd = 2)

# Comparación de modelos
logLik(modelo_logit)
## 'log Lik.' -245.6298 (df=26)
logLik(modelo_probit)
## 'log Lik.' -246.2667 (df=26)
lrtest(modelo_logit, modelo_probit)
## Likelihood ratio test
## 
## Model 1: Postgrado ~ Edad_cat + Sexo + Region + EstadoCivil
## Model 2: Postgrado ~ Edad_cat + Sexo + Region + EstadoCivil
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1  26 -245.63                         
## 2  26 -246.27  0 1.2738  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1