Clase 9: Cross Validation

Cargamos paquetes a utilizar

library(data.table)
library(ggplot2)
library(caret)
library(jtools)
library(scales)

Base de CASEN

base <- fread('CASEN 2020 valpo.csv')

Procesos iterativos

Es útil pensar la validación cruzada como lo que son: procesos iterativos. Un proceso iterativo es un tipo de proceso que debe repetirse. Lo central es cambiar los input del proceso, pero no la función que se aplica. Ejemplos de proceso iterativo:

for(i in 1:10){ ## Esto es un for
  if(i == 1){
    print('Ahora vamos a contar del 1 al 10')
  }
  print(paste0('Vamos en el número ',i))
  Sys.sleep(2)
}

## [1] "Ahora vamos a contar del 1 al 10"
## [1] "Vamos en el número 1"
## [1] "Vamos en el número 2"
## [1] "Vamos en el número 3"
## [1] "Vamos en el número 4"
## [1] "Vamos en el número 5"
## [1] "Vamos en el número 6"
## [1] "Vamos en el número 7"
## [1] "Vamos en el número 8"
## [1] "Vamos en el número 9"
## [1] "Vamos en el número 10"

O también:

x <- 1
while(x <= 10){
  if(x == 1){
    print('Ahora vamos a contar del 1 al 10')
  }
  print(x)
  Sys.sleep(1)
  x <- x + 1
}

## [1] "Ahora vamos a contar del 1 al 10"
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10

base <- base[!is.na(ytrabajocor) & activ == 1 & edad >= 15 & ocup_inf != 9 & inmigrante != 9,
             .(ytrabajocor,sexo,edad,esc2,ocup_inf,inmigrante)]
base <- base[complete.cases(base)]
base[,sexo := factor(sexo,levels = 1:2, labels = c('Hombre','Mujer'))]
base[,ocup_inf := factor(ocup_inf, levels = 1:2, labels = c('Formal','Informal'))]
base[,inmigrante := factor(inmigrante,levels = 0:1,c('Chileno','Inmigrante'))]

Regresión lineal múltiple

f <- formula(ytrabajocor ~ sexo  + inmigrante)
reg1 <- lm(formula = f, data = base)
summary(reg1)

## 
## Call:
## lm(formula = f, data = base)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
##  -665200  -339617  -186352    91648 16334383 
## 
## Coefficients:
##                      Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)            665617      12750  52.205 < 0.0000000000000002 ***
## sexoMujer             -157265      18395  -8.549 < 0.0000000000000002 ***
## inmigranteInmigrante  -117598      44170  -2.662              0.00778 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 694600 on 5724 degrees of freedom
## Multiple R-squared:  0.01375,    Adjusted R-squared:  0.0134 
## F-statistic:  39.9 on 2 and 5724 DF,  p-value: < 0.00000000000000022

prediccion <- predict(reg1, data.table(sexo = "Mujer", inmigrante = "Chileno"))
prediccion

##        1 
## 508351.8

¿Como afectan las variables previamente incluidas al ingreso del trabajo de las personas?

f1 <- formula(ytrabajocor ~ .)
reg01 <- lm(formula = f1, data = base)
sum01 <- summary(reg01)
sum01

## 
## Call:
## lm(formula = f1, data = base)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1284151  -288866   -98392   133135 15412157 
## 
## Coefficients:
##                       Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)          -693161.1    49123.3 -14.111 < 0.0000000000000002 ***
## sexoMujer            -210514.6    16384.4 -12.849 < 0.0000000000000002 ***
## edad                    8608.3      621.5  13.851 < 0.0000000000000002 ***
## esc2                   82794.4     2437.9  33.962 < 0.0000000000000002 ***
## ocup_infInformal     -230666.3    18436.6 -12.511 < 0.0000000000000002 ***
## inmigranteInmigrante -120098.6    39153.8  -3.067              0.00217 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 612900 on 5721 degrees of freedom
## Multiple R-squared:  0.2324, Adjusted R-squared:  0.2318 
## F-statistic: 346.5 on 5 and 5721 DF,  p-value: < 0.00000000000000022

¿Y si incluimos efectos heterogeneos? Veamos como el hecho de ser mujer genera diferencia en el resto de las variables independientes:

f2 <- formula(ytrabajocor ~ edad + sexo + edad*sexo + esc2 + esc2*sexo + 
              ocup_inf + ocup_inf*sexo + inmigrante + inmigrante*sexo)
reg02 <- lm(formula = f2, data = base)
sum02 <- summary(reg02)
sum02

## 
## Call:
## lm(formula = f2, data = base)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1367703  -284241   -98102   132581 15315790 
## 
## Coefficients:
##                                 Estimate Std. Error t value
## (Intercept)                    -848582.8    62203.8 -13.642
## edad                              9766.0      794.9  12.286
## sexoMujer                       206748.6   100505.2   2.057
## esc2                             91311.1     3133.6  29.139
## ocup_infInformal               -238862.8    25233.0  -9.466
## inmigranteInmigrante           -143609.4    52624.1  -2.729
## edad:sexoMujer                   -3151.0     1272.8  -2.476
## sexoMujer:esc2                  -21855.5     4979.2  -4.389
## sexoMujer:ocup_infInformal       12909.0    36957.0   0.349
## sexoMujer:inmigranteInmigrante   56694.5    78688.2   0.720
##                                            Pr(>|t|)    
## (Intercept)                    < 0.0000000000000002 ***
## edad                           < 0.0000000000000002 ***
## sexoMujer                                   0.03972 *  
## esc2                           < 0.0000000000000002 ***
## ocup_infInformal               < 0.0000000000000002 ***
## inmigranteInmigrante                        0.00637 ** 
## edad:sexoMujer                              0.01332 *  
## sexoMujer:esc2                            0.0000116 ***
## sexoMujer:ocup_infInformal                  0.72688    
## sexoMujer:inmigranteInmigrante              0.47125    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 611900 on 5717 degrees of freedom
## Multiple R-squared:  0.2356, Adjusted R-squared:  0.2344 
## F-statistic: 195.7 on 9 and 5717 DF,  p-value: < 0.00000000000000022

Predicciones dentro de muestra

¿Y como sabemos que modelo es mejor?

Criterio de mayor R cuadrado o R cuadrado ajustado.
Criterio de menor error absoluto medio.
Criterio de menor raiz cuadrática media del error.

Los criterios anteriores son buenos cuando uno desea hacer predicciones dentro de muestra, es decir, intentar ver que tan bien se ajusta el modelo a los datos predichos. Como un ejemplo simple de entender:

f3 <- formula(ytrabajocor ~ esc2)
reg03 <- lm(formula = f3, data = base)
base[,ytrabajocor_predict := predict(reg03)]

ggplot() +
  geom_segment(base, mapping = aes(x=esc2, xend=esc2, y=ytrabajocor, yend=ytrabajocor_predict)) +
  geom_point(base, mapping = aes(x=esc2, y=ytrabajocor,color='Valores reales'),alpha=0.2) +
  geom_smooth(base, mapping = aes(x=esc2, y=ytrabajocor,color='Valores predichos'),
              method='lm',se=F) +
  theme_minimal() +
  labs(x='Escolaridad',y='Ingreso del trabajo',color=NULL) +
  theme(legend.position = 'bottom') +
  scale_y_continuous(labels = number_format(scale = 0.001,suffix = 'K',prefix = '$'))

EL RMSE y el MAE son aquellos índices que capturan que tanto error posee un modelo según el ajuste de los datos reales en base a los datos predichos. Volviendo al ejemplo de la clase, calculamos el MAE de ambos modelos:

base[,ytrabajocor_predict_M1 := predict(reg01)]
base[,ytrabajocor_predict_M2 := predict(reg02)]

MAE1 <- sum(abs(base$ytrabajocor_predict_M1-base$ytrabajocor))/nrow(base)
MAE2 <- sum(abs(base$ytrabajocor_predict_M2-base$ytrabajocor))/nrow(base)

data.table(MAE1,MAE2)

##        MAE1     MAE2
## 1: 340088.2 339042.2

Ahora calculamos manualmente el RMSE de ambos modelos:

RMSE1 <- (sum((base$ytrabajocor_predict_M1-base$ytrabajocor)^2)/nrow(base))^(1/2)
RMSE2 <- (sum((base$ytrabajocor_predict_M2-base$ytrabajocor)^2)/nrow(base))^(1/2)

data.table(RMSE1,RMSE2)

##       RMSE1    RMSE2
## 1: 612579.1 611329.9

Ahora comparamos los tres criterios en una tabla:

data.table(regresion=c('Modelo 01','Modelo 02'),
           `R2`=c(sum01$r.squared,sum02$r.squared),
           `R2 ajustado`=c(sum01$adj.r.squared,sum02$adj.r.squared),
           `Mean Absolute Error` = c(MAE(reg01$fitted.values,base$ytrabajocor),
                                     MAE(reg02$fitted.values,base$ytrabajocor)),
           `Root Mean Squared Error`=c(RMSE(reg01$fitted.values,base$ytrabajocor),
                                      RMSE(reg02$fitted.values,base$ytrabajocor)))

##    regresion        R2 R2 ajustado Mean Absolute Error Root Mean Squared Error
## 1: Modelo 01 0.2324278   0.2317569            340088.2                612579.1
## 2: Modelo 02 0.2355552   0.2343517            339042.2                611329.9

K-folds CV

set.seed(12345) ## setear una semilla
setupKCV <- trainControl(method = "cv" , number = 100)

predkfolds1<-train(f1,data=base,method="lm",trControl= setupKCV)
print(predkfolds1)

## Linear Regression 
## 
## 5727 samples
##    8 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (100 fold) 
## Summary of sample sizes: 5668, 5670, 5669, 5671, 5670, 5671, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   554296.5  0.2914721  339526.6
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

predkfolds2<-train(f2,data=base,method="lm",trControl= setupKCV)
print(predkfolds2)

## Linear Regression 
## 
## 5727 samples
##    5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (100 fold) 
## Summary of sample sizes: 5671, 5670, 5670, 5670, 5669, 5670, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   554120.1  0.2961984  339712.7
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Repeated K-folds CV

set.seed(12345)## setear una semilla 
setupRKCV <- trainControl(method = "repeatedcv" , number = 5, repeats= 3)
predRKCV<-train(f2,data=base,method="lm",trControl= setupRKCV)
print(predRKCV)

## Linear Regression 
## 
## 5727 samples
##    5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 4582, 4581, 4581, 4582, 4582, 4580, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE   
##   607325.3  0.2395217  340095
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

K folds separando la muestra con porcentaje

set.seed(12345) ## setear una semilla
training.samples <- createDataPartition(base$ytrabajocor,p = 0.8, list = FALSE)
train.data  <- base[training.samples, ]
test.data <- base[-training.samples, ]

model <- lm(f2, data = train.data)

predictions01 <- predict(model,train.data)
predictions02 <- predict(model,test.data)

data.table(' ' = c('Dentro de muestra','Fuera de muestra'),
           R2 = c(R2(predictions01, train.data$ytrabajocor),
                  R2(predictions02, test.data$ytrabajocor)),
           RMSE = c(RMSE(predictions01, train.data$ytrabajocor),
                    RMSE(predictions02, test.data$ytrabajocor)),
           MAE = c(MAE(predictions01, train.data$ytrabajocor),
                   MAE(predictions02, test.data$ytrabajocor)))

##                             R2     RMSE      MAE
## 1: Dentro de muestra 0.2449414 588271.9 338435.4
## 2:  Fuera de muestra 0.2067234 696497.2 339928.4

Leave one out LOOCV

set.seed(12345)## setear una semilla
setupLOO <- trainControl(method = "LOOCV")

predLOO <- train(f2,data=base[1:1000],method="lm",trControl= setupLOO)
print(predLOO)

## Linear Regression 
## 
## 1000 samples
##    5 predictor
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 999, 999, 999, 999, 999, 999, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   567920.9  0.3036904  372274.2
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE