##1.- INTRODUCCIÓN

Con los datos de Wage de la libreria ISLR, se pretende crear un modelo logístico que permita predecir la probabilidad de que un trabajador cobre más de 250.000 dólares en función de la edad que tiene.

Tal como se describe en el capítulo Regresión logística simple y múltiple, para modelar la probabilidad de que una observación se clasifique en un grupo u otro dependiendo del valor que tome un predictor continuo, se necesita:

Que la variable respuesta sea cualitativa con dos o más niveles (la regresión logística trabaja mejor con solo dos niveles). En el ejemplo que aquí se estudia, la variable respuesta de interés es si wage > 250, cuyo resultado puede ser verdadero o falso.

Dado que la variable disponible wage es continua, se tiene que realizar una transformación. La función I(condición) devuelve un TRUE si se cumple la condición y FALSE de lo contrario.

##2.- DATOS

library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.3
data(Wage)
str(Wage)
## 'data.frame':    3000 obs. of  11 variables:
##  $ year      : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
##  $ age       : int  18 24 45 43 50 54 44 30 41 52 ...
##  $ maritl    : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
##  $ race      : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
##  $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
##  $ region    : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ jobclass  : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ health    : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
##  $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
##  $ logwage   : num  4.32 4.26 4.88 5.04 4.32 ...
##  $ wage      : num  75 70.5 131 154.7 75 ...
head(Wage)
##        year age           maritl     race       education             region
## 231655 2006  18 1. Never Married 1. White    1. < HS Grad 2. Middle Atlantic
## 86582  2004  24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003  45       2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003  43       2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443  2005  50      4. Divorced 1. White      2. HS Grad 2. Middle Atlantic
## 376662 2008  54       2. Married 1. White 4. College Grad 2. Middle Atlantic
##              jobclass         health health_ins  logwage      wage
## 231655  1. Industrial      1. <=Good      2. No 4.318063  75.04315
## 86582  2. Information 2. >=Very Good      2. No 4.255273  70.47602
## 161300  1. Industrial      1. <=Good     1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good     1. Yes 5.041393 154.68529
## 11443  2. Information      1. <=Good     1. Yes 4.318063  75.04315
## 376662 2. Information 2. >=Very Good     1. Yes 4.845098 127.11574
summary(Wage)
##       year           age                     maritl           race     
##  Min.   :2003   Min.   :18.00   1. Never Married: 648   1. White:2480  
##  1st Qu.:2004   1st Qu.:33.75   2. Married      :2074   2. Black: 293  
##  Median :2006   Median :42.00   3. Widowed      :  19   3. Asian: 190  
##  Mean   :2006   Mean   :42.41   4. Divorced     : 204   4. Other:  37  
##  3rd Qu.:2008   3rd Qu.:51.00   5. Separated    :  55                  
##  Max.   :2009   Max.   :80.00                                          
##                                                                        
##               education                     region               jobclass   
##  1. < HS Grad      :268   2. Middle Atlantic   :3000   1. Industrial :1544  
##  2. HS Grad        :971   1. New England       :   0   2. Information:1456  
##  3. Some College   :650   3. East North Central:   0                        
##  4. College Grad   :685   4. West North Central:   0                        
##  5. Advanced Degree:426   5. South Atlantic    :   0                        
##                           6. East South Central:   0                        
##                           (Other)              :   0                        
##             health      health_ins      logwage           wage       
##  1. <=Good     : 858   1. Yes:2083   Min.   :3.000   Min.   : 20.09  
##  2. >=Very Good:2142   2. No : 917   1st Qu.:4.447   1st Qu.: 85.38  
##                                      Median :4.653   Median :104.92  
##                                      Mean   :4.654   Mean   :111.70  
##                                      3rd Qu.:4.857   3rd Qu.:128.68  
##                                      Max.   :5.763   Max.   :318.34  
## 
set.seed(12345)
muestra <- sample(Wage$wage, 6)
muestra
## [1]  94.07271 200.54326  75.35568 118.88436 118.88436  54.59815
summary(muestra)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   54.60   80.03  106.48  110.39  118.88  200.54
I(muestra>250)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE

##3.- MODELO

# REGRESIÓN LOGÍSTICA CON POLINOMIO DE GRADO 4
#------------------------------------------------

# Creación de una variable binaria si wage > 250
Wage$wage_superior250 <- I(Wage$wage > 250)

# Ajuste del modelo logístico
modelo_logit <- glm(wage_superior250 ~ poly(age,4), family = "binomial",
                    data=Wage)
head(Wage)
##        year age           maritl     race       education             region
## 231655 2006  18 1. Never Married 1. White    1. < HS Grad 2. Middle Atlantic
## 86582  2004  24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003  45       2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003  43       2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443  2005  50      4. Divorced 1. White      2. HS Grad 2. Middle Atlantic
## 376662 2008  54       2. Married 1. White 4. College Grad 2. Middle Atlantic
##              jobclass         health health_ins  logwage      wage
## 231655  1. Industrial      1. <=Good      2. No 4.318063  75.04315
## 86582  2. Information 2. >=Very Good      2. No 4.255273  70.47602
## 161300  1. Industrial      1. <=Good     1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good     1. Yes 5.041393 154.68529
## 11443  2. Information      1. <=Good     1. Yes 4.318063  75.04315
## 376662 2. Information 2. >=Very Good     1. Yes 4.845098 127.11574
##        wage_superior250
## 231655            FALSE
## 86582             FALSE
## 161300            FALSE
## 155159            FALSE
## 11443             FALSE
## 376662            FALSE
str(Wage)
## 'data.frame':    3000 obs. of  12 variables:
##  $ year            : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
##  $ age             : int  18 24 45 43 50 54 44 30 41 52 ...
##  $ maritl          : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
##  $ race            : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
##  $ education       : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
##  $ region          : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ jobclass        : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ health          : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
##  $ health_ins      : Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
##  $ logwage         : num  4.32 4.26 4.88 5.04 4.32 ...
##  $ wage            : num  75 70.5 131 154.7 75 ...
##  $ wage_superior250: 'AsIs' logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
summary(modelo_logit)
## 
## Call:
## glm(formula = wage_superior250 ~ poly(age, 4), family = "binomial", 
##     data = Wage)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3110  -0.2607  -0.2488  -0.1791   3.7859  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -4.3012     0.3451 -12.465  < 2e-16 ***
## poly(age, 4)1  71.9642    26.1176   2.755  0.00586 ** 
## poly(age, 4)2 -85.7729    35.9043  -2.389  0.01690 *  
## poly(age, 4)3  34.1626    19.6890   1.735  0.08272 .  
## poly(age, 4)4 -47.4008    24.0909  -1.968  0.04912 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 730.53  on 2999  degrees of freedom
## Residual deviance: 701.22  on 2995  degrees of freedom
## AIC: 711.22
## 
## Number of Fisher Scoring iterations: 9
anova(modelo_logit)
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: wage_superior250
## 
## Terms added sequentially (first to last)
## 
## 
##              Df Deviance Resid. Df Resid. Dev
## NULL                          2999     730.53
## poly(age, 4)  4   29.315      2995     701.22
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Ajuste del modelo logístico
modelo_nulo <- glm(wage_superior250 ~ 1, family = "binomial",
                    data=Wage)
summary(modelo_nulo)
## 
## Call:
## glm(formula = wage_superior250 ~ 1, family = "binomial", data = Wage)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.231  -0.231  -0.231  -0.231   2.697  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -3.610      0.114  -31.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 730.53  on 2999  degrees of freedom
## Residual deviance: 730.53  on 2999  degrees of freedom
## AIC: 732.53
## 
## Number of Fisher Scoring iterations: 6
stargazer(modelo_nulo, modelo_logit, type="text")
## 
## ==============================================
##                       Dependent variable:     
##                   ----------------------------
##                         wage_superior250      
##                        (1)            (2)     
## ----------------------------------------------
## poly(age, 4)1                      71.964***  
##                                    (26.118)   
##                                               
## poly(age, 4)2                      -85.773**  
##                                    (35.904)   
##                                               
## poly(age, 4)3                       34.163*   
##                                    (19.689)   
##                                               
## poly(age, 4)4                      -47.401**  
##                                    (24.091)   
##                                               
## Constant            -3.610***      -4.301***  
##                      (0.114)        (0.345)   
##                                               
## ----------------------------------------------
## Observations          3,000          3,000    
## Log Likelihood       -365.267      -350.610   
## Akaike Inf. Crit.    732.534        711.220   
## ==============================================
## Note:              *p<0.1; **p<0.05; ***p<0.01
R2 <- 1 - modelo_logit$deviance/modelo_logit$null.deviance
R2*100
## [1] 4.012768

Representación Gráfica

# INTERPOLACIÓN DE PUNTOS DENTRO DEL RANGO DEL PREDICTOR
# ---------------------------------------------------------

limites <- range(Wage$age)
nuevos_puntos <- seq(from= limites[1], to = limites[2], by = 1)
nuevos_puntos <- data.frame(age=nuevos_puntos)
# PREDICCIÓN DE LA VARIABLE RESPUESTA Y DEL ERROR ESTÁNDAR
# -----------------------------------------------------------

predicciones <- predict(modelo_logit, newdata = nuevos_puntos, se.fit = TRUE,
                        level = 0.95)
# CÁLCULO DEL INTERVALO DE CONFIANZA DEL 95%
# ---------------------------------------------------

limite_inferior <- predicciones$fit - 1.96 * predicciones$se.fit
limite_superior <- predicciones$fit + 1.96 * predicciones$se.fit
# TRANAFORMACIÓN DE LOS ODDS A PROBABILIDADES
# --------------------------------------------------

fit_logit <- exp(predicciones$fit) / (1 + exp(predicciones$fit))

limite_inferior_logit <- exp(limite_inferior) / (1 + exp(limite_inferior))

limite_superior_logit <- exp(limite_superior) / (1 + exp(limite_superior))
# RUG PLOT: REPRESENTACIÓN GRÁFICA CON GGPLOT2
# -------------------------------------------------

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
datos_curva <- data.frame(age = nuevos_puntos,
                          probabilidad = fit_logit,
                          limite_inferior_logit = limite_inferior_logit,
                          limite_superior_logit = limite_superior_logit)
p1 <- ggplot(Wage, aes(x = age, y = as.numeric(wage_superior250))) +
      #La variable respuesta y se convierte de vector lógico a 1/0
      geom_jitter(aes(color = as.factor(wage_superior250)), shape = "I",
                  size = 3, height = 0) + 
      geom_line(data = datos_curva, aes(y = probabilidad), color = "red") +
      geom_line(data = datos_curva, aes(y = limite_inferior_logit),
                linetype = "dashed") + 
      geom_line(data = datos_curva, aes(y = limite_superior_logit),
                linetype = "dashed") + 
      theme_bw() +
      labs(y = "P(wage > 250 | age)") + 
      theme(legend.position = "bottom") +
      theme(plot.title = element_text(hjust = 0.5))
p1 

p1_zoom <- p1 + lims (y = c(0,0.15)) +
  labs(title = "Zoom región [0, 15", y = "")

grid.arrange(p1, p1_zoom, ncol =2,
             top = "Modelo regresión logística con polinomio de grado 4")
## Warning: Removed 79 rows containing missing values (`geom_point()`).
## Warning: Removed 10 rows containing missing values (`geom_line()`).