#1.- ENUNCIADO
Un estudio considera que existe relación entre el hecho de que un estudiante asista a clases de repaso de lectura (sí = 1, no = 0), la nota que obtiene en un examen de lectura estándar (realizado antes de iniciar las clases de repaso) y el sexo (hombre = 1, mujer = 0). Se quiere generar un modelo en el que a partir de las variables puntuación del examen y sexo, prediga la probabilidad de que el estudiante tenga que asistir a clases de repaso.
datos <- data.frame(sexo = c("hombre", "hombre", "mujer", "mujer", "mujer", "hombre",
"mujer", "hombre", "mujer", "mujer", "hombre", "hombre",
"hombre", "hombre", "mujer", "mujer", "hombre", "mujer",
"hombre", "mujer", "hombre", "mujer", "mujer", "hombre",
"hombre", "mujer", "mujer", "mujer", "hombre", "hombre",
"hombre", "mujer", "hombre", "mujer", "hombre", "mujer",
"mujer", "mujer", "mujer", "mujer", "hombre", "mujer",
"hombre", "mujer", "mujer", "mujer", "mujer", "hombre",
"mujer", "hombre", "mujer", "hombre", "mujer", "mujer",
"hombre", "hombre", "hombre", "hombre", "hombre", "hombre",
"hombre", "hombre", "hombre", "mujer", "hombre", "hombre",
"hombre", "hombre", "mujer", "hombre", "mujer", "hombre",
"hombre", "hombre", "mujer", "hombre", "mujer", "mujer",
"hombre", "mujer", "mujer", "mujer", "hombre", "hombre",
"hombre", "hombre", "hombre", "mujer", "mujer", "mujer",
"mujer", "hombre", "mujer", "mujer", "mujer", "mujer",
"mujer", "mujer", "mujer", "mujer","mujer", "mujer",
"hombre", "mujer", "hombre", "hombre", "mujer", "mujer",
"mujer", "hombre","mujer", "hombre", "mujer", "mujer",
"mujer", "hombre", "mujer", "hombre", "mujer", "hombre",
"mujer", "hombre", "mujer", "mujer", "mujer", "mujer",
"mujer", "mujer", "mujer", "mujer", "hombre", "mujer",
"hombre", "hombre", "hombre", "hombre", "hombre", "hombre",
"hombre", "mujer","mujer", "mujer", "hombre", "hombre",
"mujer", "mujer", "hombre", "mujer", "hombre", "hombre",
"hombre", "mujer", "mujer", "mujer", "mujer", "hombre",
"hombre", "mujer", "hombre", "hombre", "mujer", "hombre",
"hombre", "hombre", "hombre", "mujer", "hombre", "hombre",
"mujer", "mujer", "hombre", "hombre", "hombre", "hombre",
"hombre", "mujer", "mujer", "mujer", "mujer", "hombre",
"hombre", "hombre", "mujer", "hombre", "mujer", "hombre",
"hombre", "hombre", "mujer"),
examen_lectura = c(91, 77.5, 52.5, 54, 53.5, 62, 59, 51.5,
61.5, 56.5, 47.5, 75, 47.5, 53.5, 50, 50,
49, 59, 60, 60, 60.5, 50, 101, 60, 60,
83.5, 61, 75, 84, 56.5, 56.5, 45, 60.5,
77.5, 62.5, 70, 69, 62, 107.5, 54.5, 92.5,
94.5, 65, 80, 45, 45, 66, 66, 57.5, 42.5,
60, 64, 65, 47.5, 57.5, 55, 55, 76.5,
51.5, 59.5, 59.5, 59.5, 55, 70, 66.5,
84.5, 57.5, 125, 70.5, 79, 56, 75, 57.5,
56, 67.5, 114.5, 70, 67, 60.5, 95, 65.5,
85, 55, 63.5, 61.5, 60, 52.5, 65, 87.5,
62.5, 66.5, 67, 117.5, 47.5, 67.5, 67.5,
77, 73.5, 73.5, 68.5, 55, 92, 55, 55, 60,
120.5, 56, 84.5, 60, 85, 93, 60, 65, 58.5,
85, 67, 67.5, 65, 60, 47.5, 79, 80, 57.5,
64.5, 65, 60, 85, 60, 58, 61.5, 60, 65,
93.5, 52.5, 42.5, 75, 48.5, 64, 66, 82.5,
52.5, 45.5, 57.5, 65, 46, 75, 100, 77.5,
51.5, 62.5, 44.5, 51, 56, 58.5, 69, 65,
60, 65, 65, 40, 55, 52.5, 54.5, 74, 55,
60.5, 50, 48, 51, 55, 93.5, 61, 52.5,
57.5, 60, 71, 65, 60, 55, 60, 77, 52.5,
95, 50, 47.5, 50, 47, 71, 65),
clases_repaso = c("0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1"))
datos$sexo <- as.factor(datos$sexo)
datos$clases_repaso <- as.factor(datos$clases_repaso)
head(datos)
## sexo examen_lectura clases_repaso
## 1 hombre 91.0 0
## 2 hombre 77.5 0
## 3 mujer 52.5 0
## 4 mujer 54.0 0
## 5 mujer 53.5 0
## 6 hombre 62.0 0
summary(datos)
## sexo examen_lectura clases_repaso
## hombre:93 Min. : 40.0 0:130
## mujer :96 1st Qu.: 55.0 1: 59
## Median : 60.5
## Mean : 64.9
## 3rd Qu.: 70.0
## Max. :125.0
str(datos)
## 'data.frame': 189 obs. of 3 variables:
## $ sexo : Factor w/ 2 levels "hombre","mujer": 1 1 2 2 2 1 2 1 2 2 ...
## $ examen_lectura: num 91 77.5 52.5 54 53.5 62 59 51.5 61.5 56.5 ...
## $ clases_repaso : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#2.- ANÁLISIS DE LAS OBSERVACIONES
Las tablas de frecuencia así como representaciones gráficas de las observaciones son útiles para intuir si las variables independientes escogidas están relacionadas con la variable respuesta y, por lo tanto, ser buenos predictores.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
tabla <- table(datos$clases_repaso, datos$sexo,
dnn = c("clases de repaso", "sexo"))
addmargins(tabla)
## sexo
## clases de repaso hombre mujer Sum
## 0 57 73 130
## 1 36 23 59
## Sum 93 96 189
tabla_frecuencias <- prop.table(tabla)*100
addmargins(tabla_frecuencias)
## sexo
## clases de repaso hombre mujer Sum
## 0 30.15873 38.62434 68.78307
## 1 19.04762 12.16931 31.21693
## Sum 49.20635 50.79365 100.00000
ggplot(data=datos, aes(x=clases_repaso, y=examen_lectura, colour=sexo))+
geom_boxplot()+
theme_bw()+
theme(legend.position = "bottom")
#3.- MODELO LOGIT
modelo <- glm(clases_repaso ~ examen_lectura + sexo, data = datos,
family = "binomial")
summary(modelo)
##
## Call:
## glm(formula = clases_repaso ~ examen_lectura + sexo, family = "binomial",
## data = datos)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2079 -0.8954 -0.7243 1.2592 2.0412
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.18365 0.78559 1.507 0.1319
## examen_lectura -0.02617 0.01223 -2.139 0.0324 *
## sexomujer -0.64749 0.32484 -1.993 0.0462 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.67 on 188 degrees of freedom
## Residual deviance: 224.64 on 186 degrees of freedom
## AIC: 230.64
##
## Number of Fisher Scoring iterations: 4
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(modelo, type = "text")
##
## =============================================
## Dependent variable:
## ---------------------------
## clases_repaso
## ---------------------------------------------
## examen_lectura -0.026**
## (0.012)
##
## sexomujer -0.647**
## (0.325)
##
## Constant 1.184
## (0.786)
##
## ---------------------------------------------
## Observations 189
## Log Likelihood -112.319
## Akaike Inf. Crit. 230.639
## =============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Mujer (-0.647), se estima que las mujeres tienen una probabilidad inferior de acudir a clases de repaso que un hombre con la misma nota en el examen de lectura. El coeficiente es estadísticamente significativo al 5%.
Examen lectura (-0.026). Se estima que un incremento en la nota de lectura, con los demás factores constantes, provocaría un descenso en la probabilidad de contratar clases particulares.
exp(modelo$coefficients)
## (Intercept) examen_lectura sexomujer
## 3.2662643 0.9741680 0.5233595
confint(modelo)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -0.2973940 2.799267537
## examen_lectura -0.0518029 -0.003536647
## sexomujer -1.2931216 -0.015658932
#4.- Representación de las curvas de probabilidades.
# Convertir las variables en numéricas
datos$clases_repaso <- as.numeric(as.character(datos$clases_repaso))
str(datos)
## 'data.frame': 189 obs. of 3 variables:
## $ sexo : Factor w/ 2 levels "hombre","mujer": 1 1 2 2 2 1 2 1 2 2 ...
## $ examen_lectura: num 91 77.5 52.5 54 53.5 62 59 51.5 61.5 56.5 ...
## $ clases_repaso : num 0 0 0 0 0 0 0 0 0 0 ...
# Crear un dataframe con la probabilidad siendo hombre
nuevos_valores_examen <- seq(from = min(datos$examen_lectura),
to = max(datos$examen_lectura), by = 0.5)
sexo <- as.factor(rep(x="hombre", length(nuevos_valores_examen)))
# Predicciones en base "response"
predicciones <- predict(object = modelo,
newdata = data.frame(examen_lectura = nuevos_valores_examen, sexo = sexo),
type = "response")
# Nuevos puntos a representar
datos_curvas_hombre <- data.frame(examen_lectura = nuevos_valores_examen,
sexo = sexo,
clases_repaso = predicciones)
# Crear un dataframe con la probabilidad siendo hombre
nuevos_valores_examen <- seq(from = min(datos$examen_lectura),
to = max(datos$examen_lectura), by = 0.5)
sexo <- as.factor(rep(x="mujer", length(nuevos_valores_examen)))
# Predicciones en base "response"
predicciones <- predict(object = modelo,
newdata = data.frame(examen_lectura = nuevos_valores_examen, sexo = sexo),
type = "response")
# Nuevos puntos a representar
datos_curvas_mujer <- data.frame(examen_lectura = nuevos_valores_examen,
sexo = sexo,
clases_repaso = predicciones)
datos_curvas <- rbind(datos_curvas_hombre, datos_curvas_mujer)
ggplot(data=datos, aes(x= examen_lectura, y = as.numeric(clases_repaso),
color = sexo))+
geom_point()+
geom_line(data = datos_curvas, aes(y = clases_repaso))+
geom_line(data = datos_curvas, aes(y = clases_repaso))+
theme_bw()+
labs(title = "Diferencial de probabilidad", y = "P(clases repaso)")+
theme(plot.title = element_text(size=12))+
theme(legend.position = "bottom")
Se observa un efecto diferencial en la probabilidad de acudir a clases de repaso, pues la probabilidad del hombre es superior a la de la mujer para cualquier valor del examen de lectura.
##5.- EVALUACIÓN DEL MODELO
# Test de razón de verosimilitudes
dif_residuos <- modelo$null.deviance - modelo$deviance
df <- modelo$df.null - modelo$df.residual
p_value <- pchisq(q=dif_residuos, df = df, lower.tail =FALSE)
p_value
## [1] 0.006626401
El p valor es prácticamente cero, lo cual indica que el modelo es conjuntamente significativo.
##6.- R2 de McFaddeden
R2 <- 1 - modelo$deviance/modelo$null.deviance
R2*100
## [1] 4.275494
Al incluir las variables explicativas en el modelo, se mejora la log verosimilitud en el 4.27%
##7.- Comparación y clasificación
predicciones <- ifelse(test = modelo$fitted.values > 0.5, yes = 1, no = 0)
MC <- table(modelo$model$clases_repaso, predicciones, dnn = c("observados", "predichos"))
MC
## predichos
## observados 0 1
## 0 129 1
## 1 56 3
library(vcd)
## Warning: package 'vcd' was built under R version 4.2.3
## Loading required package: grid
library(ggplot2)
library(mosaic)
## Warning: package 'mosaic' was built under R version 4.2.3
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:vcd':
##
## mplot
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
mosaic(MC, shade = T, colorize = T,
gp = gpar(fill = matrix(c("lightgreen", "pink","pink", "lightgreen"),2, 2)))
Accuracy = (129+1)/(129+3+1+56)
Accuracy*100
## [1] 68.78307
La fiabilidad (accuracy) del modelo es del 68,78%.