##1.- INTRODUCCIÓN
Con los datos de Wage de la libreria ISLR, se pretende crear un modelo logístico que permita predecir la probabilidad de que un trabajador cobre más de 250.000 dólares en función de la edad que tiene.
Tal como se describe en el capítulo Regresión logística simple y múltiple, para modelar la probabilidad de que una observación se clasifique en un grupo u otro dependiendo del valor que tome un predictor continuo, se necesita:
Que la variable respuesta sea cualitativa con dos o más niveles (la regresión logística trabaja mejor con solo dos niveles). En el ejemplo que aquí se estudia, la variable respuesta de interés es si wage > 250, cuyo resultado puede ser verdadero o falso.
Dado que la variable disponible wage es continua, se tiene que realizar una transformación. La función I(condición) devuelve un TRUE si se cumple la condición y FALSE de lo contrario.
##2.- DATOS
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.3
data(Wage)
str(Wage)
## 'data.frame': 3000 obs. of 11 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
head(Wage)
## year age maritl race education region
## 231655 2006 18 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic
## 86582 2004 24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003 45 2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003 43 2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443 2005 50 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic
## 376662 2008 54 2. Married 1. White 4. College Grad 2. Middle Atlantic
## jobclass health health_ins logwage wage
## 231655 1. Industrial 1. <=Good 2. No 4.318063 75.04315
## 86582 2. Information 2. >=Very Good 2. No 4.255273 70.47602
## 161300 1. Industrial 1. <=Good 1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good 1. Yes 5.041393 154.68529
## 11443 2. Information 1. <=Good 1. Yes 4.318063 75.04315
## 376662 2. Information 2. >=Very Good 1. Yes 4.845098 127.11574
summary(Wage)
## year age maritl race
## Min. :2003 Min. :18.00 1. Never Married: 648 1. White:2480
## 1st Qu.:2004 1st Qu.:33.75 2. Married :2074 2. Black: 293
## Median :2006 Median :42.00 3. Widowed : 19 3. Asian: 190
## Mean :2006 Mean :42.41 4. Divorced : 204 4. Other: 37
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## education region jobclass
## 1. < HS Grad :268 2. Middle Atlantic :3000 1. Industrial :1544
## 2. HS Grad :971 1. New England : 0 2. Information:1456
## 3. Some College :650 3. East North Central: 0
## 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## health health_ins logwage wage
## 1. <=Good : 858 1. Yes:2083 Min. :3.000 Min. : 20.09
## 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447 1st Qu.: 85.38
## Median :4.653 Median :104.92
## Mean :4.654 Mean :111.70
## 3rd Qu.:4.857 3rd Qu.:128.68
## Max. :5.763 Max. :318.34
##
set.seed(12345)
muestra <- sample(Wage$wage, 6)
muestra
## [1] 94.07271 200.54326 75.35568 118.88436 118.88436 54.59815
summary(muestra)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 54.60 80.03 106.48 110.39 118.88 200.54
I(muestra>250)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
##3.- MODELO
# REGRESIÓN LOGÍSTICA CON POLINOMIO DE GRADO 4
#------------------------------------------------
# Creación de una variable binaria si wage > 250
Wage$wage_superior250 <- I(Wage$wage > 250)
# Ajuste del modelo logístico
modelo_logit <- glm(wage_superior250 ~ poly(age,4), family = "binomial",
data=Wage)
head(Wage)
## year age maritl race education region
## 231655 2006 18 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic
## 86582 2004 24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003 45 2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003 43 2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443 2005 50 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic
## 376662 2008 54 2. Married 1. White 4. College Grad 2. Middle Atlantic
## jobclass health health_ins logwage wage
## 231655 1. Industrial 1. <=Good 2. No 4.318063 75.04315
## 86582 2. Information 2. >=Very Good 2. No 4.255273 70.47602
## 161300 1. Industrial 1. <=Good 1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good 1. Yes 5.041393 154.68529
## 11443 2. Information 1. <=Good 1. Yes 4.318063 75.04315
## 376662 2. Information 2. >=Very Good 1. Yes 4.845098 127.11574
## wage_superior250
## 231655 FALSE
## 86582 FALSE
## 161300 FALSE
## 155159 FALSE
## 11443 FALSE
## 376662 FALSE
str(Wage)
## 'data.frame': 3000 obs. of 12 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins : Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
## $ wage_superior250: 'AsIs' logi FALSE FALSE FALSE FALSE FALSE FALSE ...
summary(modelo_logit)
##
## Call:
## glm(formula = wage_superior250 ~ poly(age, 4), family = "binomial",
## data = Wage)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3110 -0.2607 -0.2488 -0.1791 3.7859
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.3012 0.3451 -12.465 < 2e-16 ***
## poly(age, 4)1 71.9642 26.1176 2.755 0.00586 **
## poly(age, 4)2 -85.7729 35.9043 -2.389 0.01690 *
## poly(age, 4)3 34.1626 19.6890 1.735 0.08272 .
## poly(age, 4)4 -47.4008 24.0909 -1.968 0.04912 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 730.53 on 2999 degrees of freedom
## Residual deviance: 701.22 on 2995 degrees of freedom
## AIC: 711.22
##
## Number of Fisher Scoring iterations: 9
anova(modelo_logit)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: wage_superior250
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 2999 730.53
## poly(age, 4) 4 29.315 2995 701.22
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Ajuste del modelo logístico
modelo_nulo <- glm(wage_superior250 ~ 1, family = "binomial",
data=Wage)
summary(modelo_nulo)
##
## Call:
## glm(formula = wage_superior250 ~ 1, family = "binomial", data = Wage)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.231 -0.231 -0.231 -0.231 2.697
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.610 0.114 -31.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 730.53 on 2999 degrees of freedom
## Residual deviance: 730.53 on 2999 degrees of freedom
## AIC: 732.53
##
## Number of Fisher Scoring iterations: 6
stargazer(modelo_nulo, modelo_logit, type="text")
##
## ==============================================
## Dependent variable:
## ----------------------------
## wage_superior250
## (1) (2)
## ----------------------------------------------
## poly(age, 4)1 71.964***
## (26.118)
##
## poly(age, 4)2 -85.773**
## (35.904)
##
## poly(age, 4)3 34.163*
## (19.689)
##
## poly(age, 4)4 -47.401**
## (24.091)
##
## Constant -3.610*** -4.301***
## (0.114) (0.345)
##
## ----------------------------------------------
## Observations 3,000 3,000
## Log Likelihood -365.267 -350.610
## Akaike Inf. Crit. 732.534 711.220
## ==============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
R2 <- 1 - modelo_logit$deviance/modelo_logit$null.deviance
R2*100
## [1] 4.012768
Representación Gráfica
# INTERPOLACIÓN DE PUNTOS DENTRO DEL RANGO DEL PREDICTOR
# ---------------------------------------------------------
limites <- range(Wage$age)
nuevos_puntos <- seq(from= limites[1], to = limites[2], by = 1)
nuevos_puntos <- data.frame(age=nuevos_puntos)
# PREDICCIÓN DE LA VARIABLE RESPUESTA Y DEL ERROR ESTÁNDAR
# -----------------------------------------------------------
predicciones <- predict(modelo_logit, newdata = nuevos_puntos, se.fit = TRUE,
level = 0.95)
# CÁLCULO DEL INTERVALO DE CONFIANZA DEL 95%
# ---------------------------------------------------
limite_inferior <- predicciones$fit - 1.96 * predicciones$se.fit
limite_superior <- predicciones$fit + 1.96 * predicciones$se.fit
# TRANAFORMACIÓN DE LOS ODDS A PROBABILIDADES
# --------------------------------------------------
fit_logit <- exp(predicciones$fit) / (1 + exp(predicciones$fit))
limite_inferior_logit <- exp(limite_inferior) / (1 + exp(limite_inferior))
limite_superior_logit <- exp(limite_superior) / (1 + exp(limite_superior))
# RUG PLOT: REPRESENTACIÓN GRÁFICA CON GGPLOT2
# -------------------------------------------------
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
datos_curva <- data.frame(age = nuevos_puntos,
probabilidad = fit_logit,
limite_inferior_logit = limite_inferior_logit,
limite_superior_logit = limite_superior_logit)
p1 <- ggplot(Wage, aes(x = age, y = as.numeric(wage_superior250))) +
#La variable respuesta y se convierte de vector lógico a 1/0
geom_jitter(aes(color = as.factor(wage_superior250)), shape = "I",
size = 3, height = 0) +
geom_line(data = datos_curva, aes(y = probabilidad), color = "red") +
geom_line(data = datos_curva, aes(y = limite_inferior_logit),
linetype = "dashed") +
geom_line(data = datos_curva, aes(y = limite_superior_logit),
linetype = "dashed") +
theme_bw() +
labs(y = "P(wage > 250 | age)") +
theme(legend.position = "bottom") +
theme(plot.title = element_text(hjust = 0.5))
p1
p1_zoom <- p1 + lims (y = c(0,0.15)) +
labs(title = "Zoom región [0, 15", y = "")
grid.arrange(p1, p1_zoom, ncol =2,
top = "Modelo regresión logística con polinomio de grado 4")
## Warning: Removed 79 rows containing missing values (`geom_point()`).
## Warning: Removed 10 rows containing missing values (`geom_line()`).