#1.- ENUNCIADO

Un estudio considera que existe relación entre el hecho de que un estudiante asista a clases de repaso de lectura (sí = 1, no = 0), la nota que obtiene en un examen de lectura estándar (realizado antes de iniciar las clases de repaso) y el sexo (hombre = 1, mujer = 0). Se quiere generar un modelo en el que a partir de las variables puntuación del examen y sexo, prediga la probabilidad de que el estudiante tenga que asistir a clases de repaso.

datos <- data.frame(sexo = c("hombre", "hombre", "mujer", "mujer", "mujer", "hombre",
                             "mujer", "hombre", "mujer", "mujer", "hombre", "hombre",
                             "hombre", "hombre", "mujer", "mujer", "hombre", "mujer",
                             "hombre", "mujer", "hombre", "mujer", "mujer", "hombre",
                             "hombre", "mujer", "mujer", "mujer", "hombre", "hombre",
                             "hombre", "mujer", "hombre", "mujer", "hombre", "mujer",
                             "mujer", "mujer", "mujer", "mujer", "hombre", "mujer",
                             "hombre", "mujer", "mujer", "mujer", "mujer", "hombre",
                             "mujer", "hombre", "mujer", "hombre", "mujer", "mujer",
                             "hombre", "hombre", "hombre", "hombre", "hombre", "hombre",
                             "hombre", "hombre", "hombre", "mujer", "hombre", "hombre",
                             "hombre", "hombre", "mujer", "hombre", "mujer", "hombre",
                             "hombre", "hombre", "mujer", "hombre", "mujer", "mujer",
                             "hombre", "mujer", "mujer", "mujer", "hombre", "hombre",
                             "hombre", "hombre", "hombre", "mujer", "mujer", "mujer",
                             "mujer", "hombre", "mujer", "mujer", "mujer", "mujer",
                             "mujer", "mujer", "mujer", "mujer","mujer", "mujer",
                             "hombre", "mujer", "hombre", "hombre", "mujer", "mujer",
                             "mujer", "hombre","mujer", "hombre", "mujer", "mujer",
                             "mujer", "hombre", "mujer", "hombre", "mujer", "hombre",
                             "mujer", "hombre", "mujer", "mujer", "mujer", "mujer",
                             "mujer", "mujer", "mujer", "mujer", "hombre", "mujer",
                             "hombre", "hombre", "hombre", "hombre", "hombre", "hombre",
                             "hombre", "mujer","mujer", "mujer", "hombre", "hombre",
                             "mujer", "mujer", "hombre", "mujer", "hombre", "hombre",
                             "hombre", "mujer", "mujer", "mujer", "mujer", "hombre",
                             "hombre", "mujer", "hombre", "hombre", "mujer", "hombre",
                             "hombre", "hombre", "hombre", "mujer", "hombre", "hombre",
                             "mujer", "mujer", "hombre", "hombre", "hombre", "hombre",
                             "hombre", "mujer", "mujer", "mujer", "mujer", "hombre",
                             "hombre", "hombre", "mujer", "hombre", "mujer", "hombre",
                             "hombre", "hombre", "mujer"),
                    examen_lectura = c(91, 77.5, 52.5, 54, 53.5, 62, 59, 51.5,
                                       61.5, 56.5, 47.5, 75, 47.5, 53.5, 50, 50,
                                       49, 59, 60, 60, 60.5, 50, 101, 60, 60,
                                       83.5, 61, 75, 84, 56.5, 56.5, 45, 60.5,
                                       77.5, 62.5, 70, 69, 62, 107.5, 54.5, 92.5,
                                       94.5, 65, 80, 45, 45, 66, 66, 57.5, 42.5,
                                       60, 64, 65, 47.5, 57.5, 55, 55, 76.5,
                                       51.5, 59.5, 59.5, 59.5, 55, 70, 66.5,
                                       84.5, 57.5, 125, 70.5, 79, 56, 75, 57.5,
                                       56, 67.5, 114.5, 70, 67, 60.5, 95, 65.5,
                                       85, 55, 63.5, 61.5, 60, 52.5, 65, 87.5,
                                       62.5, 66.5, 67, 117.5, 47.5, 67.5, 67.5,
                                       77, 73.5, 73.5, 68.5, 55, 92, 55, 55, 60,
                                       120.5, 56, 84.5, 60, 85, 93, 60, 65, 58.5,
                                       85, 67, 67.5, 65, 60, 47.5, 79, 80, 57.5,
                                       64.5, 65, 60, 85, 60, 58, 61.5, 60, 65,
                                       93.5, 52.5, 42.5, 75, 48.5, 64, 66, 82.5,
                                       52.5, 45.5, 57.5, 65, 46, 75, 100, 77.5,
                                       51.5, 62.5, 44.5, 51, 56, 58.5, 69, 65,
                                       60, 65, 65, 40, 55, 52.5, 54.5, 74, 55,
                                       60.5, 50, 48, 51, 55, 93.5, 61, 52.5,
                                       57.5, 60, 71, 65, 60, 55, 60, 77, 52.5,
                                       95, 50, 47.5, 50, 47, 71, 65),
                    clases_repaso = c("0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "0", "0", "0", "0", "0", "0",
                                      "0", "0", "1", "1", "1", "1", "1", "1",
                                      "1", "1", "1", "1", "1", "1", "1", "1",
                                      "1", "1", "1", "1", "1", "1", "1", "1",
                                      "1", "1", "1", "1", "1", "1", "1", "1",
                                      "1", "1", "1", "1", "1", "1", "1", "1",
                                      "1", "1", "1", "1", "1", "1", "1", "1",
                                      "1", "1", "1", "1", "1", "1", "1", "1",
                                      "1", "1", "1", "1", "1"))

datos$sexo <- as.factor(datos$sexo)
datos$clases_repaso  <- as.factor(datos$clases_repaso)
head(datos)
##     sexo examen_lectura clases_repaso
## 1 hombre           91.0             0
## 2 hombre           77.5             0
## 3  mujer           52.5             0
## 4  mujer           54.0             0
## 5  mujer           53.5             0
## 6 hombre           62.0             0
summary(datos)
##      sexo    examen_lectura  clases_repaso
##  hombre:93   Min.   : 40.0   0:130        
##  mujer :96   1st Qu.: 55.0   1: 59        
##              Median : 60.5                
##              Mean   : 64.9                
##              3rd Qu.: 70.0                
##              Max.   :125.0
str(datos)
## 'data.frame':    189 obs. of  3 variables:
##  $ sexo          : Factor w/ 2 levels "hombre","mujer": 1 1 2 2 2 1 2 1 2 2 ...
##  $ examen_lectura: num  91 77.5 52.5 54 53.5 62 59 51.5 61.5 56.5 ...
##  $ clases_repaso : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

#2.- ANÁLISIS DE LAS OBSERVACIONES

Las tablas de frecuencia así como representaciones gráficas de las observaciones son útiles para intuir si las variables independientes escogidas están relacionadas con la variable respuesta y, por lo tanto, ser buenos predictores.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
tabla <- table(datos$clases_repaso, datos$sexo,
               dnn = c("clases de repaso", "sexo"))
addmargins(tabla)
##                 sexo
## clases de repaso hombre mujer Sum
##              0       57    73 130
##              1       36    23  59
##              Sum     93    96 189
tabla_frecuencias <- prop.table(tabla)*100 
addmargins(tabla_frecuencias)
##                 sexo
## clases de repaso    hombre     mujer       Sum
##              0    30.15873  38.62434  68.78307
##              1    19.04762  12.16931  31.21693
##              Sum  49.20635  50.79365 100.00000
ggplot(data=datos, aes(x=clases_repaso, y=examen_lectura, colour=sexo))+
  geom_boxplot()+
  theme_bw()+
  theme(legend.position = "bottom")

#3.- MODELO LOGIT

modelo <- glm(clases_repaso ~ examen_lectura + sexo, data = datos,
              family = "binomial")
summary(modelo)
## 
## Call:
## glm(formula = clases_repaso ~ examen_lectura + sexo, family = "binomial", 
##     data = datos)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2079  -0.8954  -0.7243   1.2592   2.0412  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     1.18365    0.78559   1.507   0.1319  
## examen_lectura -0.02617    0.01223  -2.139   0.0324 *
## sexomujer      -0.64749    0.32484  -1.993   0.0462 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 234.67  on 188  degrees of freedom
## Residual deviance: 224.64  on 186  degrees of freedom
## AIC: 230.64
## 
## Number of Fisher Scoring iterations: 4
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(modelo, type = "text")
## 
## =============================================
##                       Dependent variable:    
##                   ---------------------------
##                          clases_repaso       
## ---------------------------------------------
## examen_lectura             -0.026**          
##                             (0.012)          
##                                              
## sexomujer                  -0.647**          
##                             (0.325)          
##                                              
## Constant                     1.184           
##                             (0.786)          
##                                              
## ---------------------------------------------
## Observations                  189            
## Log Likelihood             -112.319          
## Akaike Inf. Crit.           230.639          
## =============================================
## Note:             *p<0.1; **p<0.05; ***p<0.01

Mujer (-0.647), se estima que las mujeres tienen una probabilidad inferior de acudir a clases de repaso que un hombre con la misma nota en el examen de lectura. El coeficiente es estadísticamente significativo al 5%.

Examen lectura (-0.026). Se estima que un incremento en la nota de lectura, con los demás factores constantes, provocaría un descenso en la probabilidad de contratar clases particulares.

exp(modelo$coefficients)
##    (Intercept) examen_lectura      sexomujer 
##      3.2662643      0.9741680      0.5233595
confint(modelo)
## Waiting for profiling to be done...
##                     2.5 %       97.5 %
## (Intercept)    -0.2973940  2.799267537
## examen_lectura -0.0518029 -0.003536647
## sexomujer      -1.2931216 -0.015658932

#4.- Representación de las curvas de probabilidades.

# Convertir las variables en numéricas
datos$clases_repaso <- as.numeric(as.character(datos$clases_repaso))
str(datos)
## 'data.frame':    189 obs. of  3 variables:
##  $ sexo          : Factor w/ 2 levels "hombre","mujer": 1 1 2 2 2 1 2 1 2 2 ...
##  $ examen_lectura: num  91 77.5 52.5 54 53.5 62 59 51.5 61.5 56.5 ...
##  $ clases_repaso : num  0 0 0 0 0 0 0 0 0 0 ...
# Crear un dataframe con la probabilidad siendo hombre
nuevos_valores_examen <- seq(from = min(datos$examen_lectura),
                             to = max(datos$examen_lectura), by = 0.5)

sexo <- as.factor(rep(x="hombre", length(nuevos_valores_examen)))

# Predicciones en base "response"
predicciones <- predict(object = modelo, 
                        newdata = data.frame(examen_lectura = nuevos_valores_examen,                                              sexo = sexo), 
                        type = "response")

# Nuevos puntos a representar
datos_curvas_hombre <- data.frame(examen_lectura = nuevos_valores_examen,
                                  sexo = sexo,
                                  clases_repaso = predicciones)
# Crear un dataframe con la probabilidad siendo hombre
nuevos_valores_examen <- seq(from = min(datos$examen_lectura),
                             to = max(datos$examen_lectura), by = 0.5)

sexo <- as.factor(rep(x="mujer", length(nuevos_valores_examen)))

# Predicciones en base "response"
predicciones <- predict(object = modelo, 
                        newdata = data.frame(examen_lectura = nuevos_valores_examen,                                              sexo = sexo), 
                        type = "response")

# Nuevos puntos a representar
datos_curvas_mujer <- data.frame(examen_lectura = nuevos_valores_examen,
                                  sexo = sexo,
                                  clases_repaso = predicciones)
datos_curvas <- rbind(datos_curvas_hombre, datos_curvas_mujer)
ggplot(data=datos, aes(x= examen_lectura, y = as.numeric(clases_repaso),
                       color = sexo))+
  geom_point()+
  geom_line(data = datos_curvas, aes(y = clases_repaso))+
  geom_line(data = datos_curvas, aes(y = clases_repaso))+
  theme_bw()+
  labs(title = "Diferencial de probabilidad", y = "P(clases repaso)")+
  theme(plot.title = element_text(size=12))+
  theme(legend.position = "bottom")

Se observa un efecto diferencial en la probabilidad de acudir a clases de repaso, pues la probabilidad del hombre es superior a la de la mujer para cualquier valor del examen de lectura.

##5.- EVALUACIÓN DEL MODELO

# Test de razón de verosimilitudes

dif_residuos <- modelo$null.deviance - modelo$deviance
df <- modelo$df.null - modelo$df.residual
p_value <- pchisq(q=dif_residuos, df = df, lower.tail =FALSE)
p_value
## [1] 0.006626401

El p valor es prácticamente cero, lo cual indica que el modelo es conjuntamente significativo.

##6.- R2 de McFaddeden

R2 <- 1 - modelo$deviance/modelo$null.deviance
R2*100
## [1] 4.275494

Al incluir las variables explicativas en el modelo, se mejora la log verosimilitud en el 4.27%

##7.- Comparación y clasificación

predicciones <- ifelse(test = modelo$fitted.values > 0.5, yes = 1, no = 0)
MC <- table(modelo$model$clases_repaso, predicciones, dnn = c("observados", "predichos"))
MC
##           predichos
## observados   0   1
##          0 129   1
##          1  56   3
library(vcd)
## Warning: package 'vcd' was built under R version 4.2.3
## Loading required package: grid
library(ggplot2)
library(mosaic)
## Warning: package 'mosaic' was built under R version 4.2.3
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:vcd':
## 
##     mplot
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
mosaic(MC, shade = T, colorize = T, 
       gp = gpar(fill = matrix(c("lightgreen", "pink","pink",  "lightgreen"),2, 2)))

Accuracy = (129+1)/(129+3+1+56)
Accuracy*100
## [1] 68.78307

La fiabilidad (accuracy) del modelo es del 68,78%.