1 Librerías usuales
2 Primer vistazo al dataset: variables, su distribución empírica, su correlación
- 2.1 Matriz y plot de correlación entre las variables
3 Regresión lineal
4 ANOVA (comparación de medias de muchas poblaciones)
5 ANOVA 2 Factores
6 ANCOVA (ANOVA + regression, comparación de rectas)
- 6.1 two sample t-test (= varianza, y != varianza, Welch)
- 6.2 Modelo con interacción vs. Modelo aditivo
7 Análisis de los residuos
8 Ridge regression & Lasso
9 Ajuste Robusto para identificar obs atípicas
10 Selección de variables, R^2 adj, Cp de Mallows

1 Librerías usuales

library(tidyverse)
library(magrittr) # %<>%
library(glue) # glue
library(broom) # augment, glance, tidy
library(skimr) # skim
library(GGally) # ggpairs
library(gghighlight) # gghighlight
library(ggrepel) # geom_label_repel
# library(tidylog) # loguea mensajitos cuando usás dplyr
library(ggridges) # geom_density_ridges: me gusta más comparar densidades que boxplots

2 Primer vistazo al dataset: variables, su distribución empírica, su correlación

fn <- "low birth weight infants.txt"
bebes <- read_table(fn, col_types = cols(toxemia = col_factor(NULL)))

glimpse(bebes)

## Observations: 100
## Variables: 6
## $ headcirc <int> 27, 29, 30, 28, 29, 23, 22, 26, 27, 25, 23, 26, 27, 27,…
## $ length   <int> 41, 40, 38, 38, 38, 32, 33, 38, 30, 34, 32, 39, 38, 39,…
## $ gestage  <int> 29, 31, 33, 31, 30, 25, 27, 29, 28, 29, 26, 30, 29, 29,…
## $ birthwt  <int> 1360, 1490, 1490, 1180, 1200, 680, 620, 1060, 1320, 830…
## $ momage   <int> 37, 34, 32, 37, 29, 19, 20, 25, 27, 32, 26, 29, 24, 26,…
## $ toxemia  <fct> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1…

# skim(bebes)

2.1 Matriz y plot de correlación entre las variables

bebes %>% select_if(is.numeric) %>% cor() %>% round(3)

##          headcirc length gestage birthwt momage
## headcirc    1.000  0.713   0.781   0.799  0.132
## length      0.713  1.000   0.675   0.816  0.218
## gestage     0.781  0.675   1.000   0.660  0.266
## birthwt     0.799  0.816   0.660   1.000  0.155
## momage      0.132  0.218   0.266   0.155  1.000

ggpairs(bebes, progress = F)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3 Regresión lineal

3.1 F test para TODOS los β = 0

$$ H_{0}: _{j} \ F =

{p-1,n-p} H{0}) $$

El $F$ es el test para toda la regresión. Los $t$ son tests independientes para ver si cada variable, al agregarse por separado, reduce significativamente el RSS.

modelo <- lm(headcirc ~ ., data = bebes) # . usa todas las variables
summary(modelo)

## 
## Call:
## lm(formula = headcirc ~ ., data = bebes)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0190 -0.6712 -0.0364  0.3334  8.0421 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.2097216  2.1285705   3.387  0.00103 ** 
## length       0.0082711  0.0653434   0.127  0.89954    
## gestage      0.5261922  0.0835553   6.298 9.62e-09 ***
## birthwt      0.0042555  0.0008867   4.799 5.99e-06 ***
## momage      -0.0300651  0.0222312  -1.352  0.17950    
## toxemia1    -0.5160581  0.3696445  -1.396  0.16597    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.269 on 94 degrees of freedom
## Multiple R-squared:  0.7615, Adjusted R-squared:  0.7488 
## F-statistic: 60.03 on 5 and 94 DF,  p-value: < 2.2e-16

3.2 Graficar la recta del ajuste (caso 1 sola covariable)

modelo_cars <- lm(dist ~ speed, data = cars)
ggplot(data = augment(modelo_cars)) +
  aes(x = speed) +
  geom_segment(aes(xend = speed, y = .fitted, yend = dist), color = "Tomato", alpha = .5) +
  geom_point(mapping = aes(y = dist)) +
  # geom_point(mapping = aes(y = .fitted), shape = 4, color = "Red") +
  geom_abline(intercept = modelo_cars$coefficients[[1]],
              slope = modelo_cars$coefficients[[2]],
              color = "Tomato") +
  labs(title = "Regresión lineal con una sola covariable")

3.3 F test para ALGUNOS β = 0

Compara dos modelos, uno con todas las variables disponibles, otro con un subset:

modelo_1 <- lm(headcirc ~ gestage + birthwt + momage + length + toxemia, data = bebes)
modelo_2 <- lm(headcirc ~ gestage + birthwt, data = bebes)

comparacion <- anova(modelo_1, modelo_2)
comparacion

Res.Df	RSS	Df	Sum of Sq	F	Pr(>F)
94	151.3779	NA	NA	NA	NA
97	157.4231	-3	-6.045176	1.251276	0.2957051

# Es igual que hacer:
RSS_H <- sum(modelo_2$residuals^2)
RSS <- sum(modelo_1$residuals^2)

n <- length(modelo_1$residuals)
p <- modelo_1$rank
q <- modelo_1$rank - modelo_2$rank
F_obs <- ( (RSS_H - RSS)/RSS ) * ( (n - p)/q )
p_value <- pf(F_obs, df1 = q, df2 = n - p, lower.tail = F)

glue("q = {q}, n-p = {n-p}, F obs = {round(F_obs, 4)}, p-value = {round(p_value, 4)}")

## q = 3, n-p = 94, F obs = 1.2513, p-value = 0.2957

3.4 Matriz de covarianza de los estimadores (beta hat)

\[ \hat{\mathrm{Var}} [\hat{\beta}] = s^2 (X^T X)^{-1} \]

X <- model.matrix(modelo_1)
Xt_X_inv <- solve(t(X) %*% X)
var_betas <- sigma(modelo_1)^2 * Xt_X_inv
var_betas

##              (Intercept)       gestage       birthwt        momage
## (Intercept)  4.530812275 -1.102993e-01  1.228657e-03 -1.746837e-03
## gestage     -0.110299335  6.981485e-03 -2.735342e-05 -2.958101e-04
## birthwt      0.001228657 -2.735342e-05  7.862980e-07  1.683570e-06
## momage      -0.001746837 -2.958101e-04  1.683570e-06  4.942264e-04
## length      -0.073271626 -1.352937e-03 -3.718980e-05 -1.432177e-04
## toxemia1     0.323957829 -1.581191e-02  9.620515e-05  5.322625e-05
##                    length      toxemia1
## (Intercept) -7.327163e-02  3.239578e-01
## gestage     -1.352937e-03 -1.581191e-02
## birthwt     -3.718980e-05  9.620515e-05
## momage      -1.432177e-04  5.322625e-05
## length       4.269765e-03 -8.247217e-05
## toxemia1    -8.247217e-05  1.366371e-01

3.5 Los estimadores no cambian si centramos las variables

Salvo $\beta_{0}$, obviamente.

modelo_cars <- lm(dist ~ speed, data = cars)
modelo_cars_centrado <- lm(dist ~ scale(speed, center = T, scale = F), data = cars)

bind_rows(
  tidy(modelo_cars$coefficients) %>% mutate(modelo = "sin centrar"),
  tidy(modelo_cars_centrado$coefficients) %>% mutate(modelo = "centrado")
)

## Warning: 'tidy.numeric' is deprecated.
## See help("Deprecated")

## Warning: 'tidy.numeric' is deprecated.
## See help("Deprecated")

names	x	modelo
(Intercept)	-17.579095	sin centrar
speed	3.932409	sin centrar
(Intercept)	42.980000	centrado
scale(speed, center = T, scale = F)	3.932409	centrado

4 ANOVA (comparación de medias de muchas poblaciones)

Generalización del t-test, que sólo compara las medias de dos poblaciones.

algodon <-
  tribble(
    ~p, ~"1", ~"2", ~"3", ~"4", ~"5",
    15,  7,  7, 15, 11,  9,
    20, 12, 17, 12, 18, 18,
    25, 14, 18, 18, 19, 19,
    30, 19, 25, 22, 19, 23,
    35,  7, 10, 11, 15, 11
  ) %>%
  gather("muestra", "resistencia", -p) %>%
  select(-muestra) %>% arrange(p) %>% mutate(p = factor(p))

modelo_algodon <- lm(resistencia ~ p, data = algodon)

4.1 Visualizar diferencia de medias entre niveles del factor

Primero nos interesa visualizar la diferencia de medias entre niveles del factor.

ggplot(data = algodon) +
  aes(x = p, y = resistencia) +
  geom_boxplot(width = .6) +
  labs(title = "Diferencia de medias entre niveles del factor",
       subtitle = "Exploración pre-ANOVA")

ggplot(data = algodon) +
  aes(x = resistencia, y = p) +
  geom_density_ridges(scale = 1, quantile_lines = T, quantiles = 2) +
  labs(title = "Diferencia de mediana y dist empírica entre niveles",
       subtitle = "Exploración pre-ANOVA")

## Picking joint bandwidth of 1.33

4.2 Tabla de ANOVA:

anova te da el p-valor general de si el factor explica o no a la Y:

anova(modelo_algodon)

	Df	Sum Sq	Mean Sq	F value	Pr(>F)
p	4	475.76	118.94	14.75682	9.1e-06
Residuals	20	161.20	8.06	NA	NA

4.3 Otras formas de hacer ANOVA:

summary(lm) te da el mismo p-valor y el p-valor para cada dummy por separado:

summary(modelo_algodon)

## 
## Call:
## lm(formula = resistencia ~ p, data = algodon)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##   -3.8   -2.6    0.4    1.4    5.2 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.800      1.270   7.719 2.02e-07 ***
## p20            5.600      1.796   3.119 0.005409 ** 
## p25            7.800      1.796   4.344 0.000315 ***
## p30           11.800      1.796   6.572 2.11e-06 ***
## p35            1.000      1.796   0.557 0.583753    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.839 on 20 degrees of freedom
## Multiple R-squared:  0.7469, Adjusted R-squared:  0.6963 
## F-statistic: 14.76 on 4 and 20 DF,  p-value: 9.128e-06

aov no parece agregar nada nuevo:

aov(modelo_algodon)

## Call:
##    aov(formula = modelo_algodon)
## 
## Terms:
##                      p Residuals
## Sum of Squares  475.76    161.20
## Deg. of Freedom      4        20
## 
## Residual standard error: 2.839014
## Estimated effects may be unbalanced

4.4 IC de Scheffé para diferencias de medias

… Ver ANCOVA.Rmd de la clase

4.5 IC simultáneos de Tukey para diferencias de medias

# Comparaciones entre pares de medias por tukey para ANOVA
anova_algodon <- aov(resistencia ~ p, data = algodon)
intervalos_tukey <- TukeyHSD(anova_algodon, "p", ordered = T, conf.level = 0.95)

# plot(intervalos_tukey) # Mas rápido y más feo

tidy(intervalos_tukey) %>%
  mutate(contiene_al_cero = conf.low <= 0 & conf.high >= 0) %>%
  ggplot() + 
  aes(y = comparison, color = contiene_al_cero) +
  geom_vline(xintercept = 0, color = "Red", linetype = "dotted") +
  geom_point(aes(x = conf.low), shape = 91, size = 5) +
  geom_point(aes(x = estimate)) +
  geom_point(aes(x = conf.high), shape = 93, size = 5) +
  geom_text(aes(label = comparison, x = conf.high + 2)) +
  geom_segment(aes(x = conf.low, xend = conf.high, y = comparison, yend = comparison)) +
  labs(title = "Intervalos de Tukey para los contrastes del ANOVA",
       x = "Diferencia de medias estimada",
       y = "Contraste") +
  scale_color_manual(values = c("SeaGreen", "Red")) +
  guides(color = F) +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank())

5 ANOVA 2 Factores

compuestos <- read_tsv("anova_ej_2.tsv") %>%
  mutate_at(vars(A, B, REP), as.factor)

## Parsed with column specification:
## cols(
##   A = col_integer(),
##   B = col_integer(),
##   TIEMPO = col_double(),
##   REP = col_integer()
## )

5.1 Visualizar diferencia entre medias de cada nivel de cada factor

ggplot(compuestos) +
  aes(x = A, y = TIEMPO) +
  geom_boxplot()

ggplot(compuestos) +
  aes(x = B, y = TIEMPO) +
  geom_boxplot()

compuestos %>%
  mutate(grupoAB = paste0(A, "+", B)) %>%
  ggplot() +
  aes(x = grupoAB, y = TIEMPO) +
  geom_boxplot() +
  stat_summary(fun.y = mean, geom = "line", aes(group = A))

5.2 Visualizar interacción entre factores

compuestos %>%
  group_by(A, B) %>%
  summarise(tiempo_medio = mean(TIEMPO)) %>%
  ggplot() +
  aes(x = A, y = tiempo_medio, color = B, group = B) +
  geom_line() +
  geom_point() +
  labs(title = "Interacción entre dos factores",
       subtitle = "Se cruzan las líneas: hay IX")

5.3 Testear si hay interacción entre factores

Ver si es significativo el F test para la interacción, A:B. En este caso NO lo es (p-valor 0.3444):

anova_compuestos <- aov(TIEMPO ~ A * B, data = compuestos)
summary(anova_compuestos)

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## A            2  6.660   3.330   79.92 4.74e-05 ***
## B            1  0.083   0.083    2.00    0.207    
## A:B          2  0.107   0.053    1.28    0.344    
## Residuals    6  0.250   0.042                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5.4 IC simultáneos de Tukey para ANOVA 2 factores

IC_tukey <- TukeyHSD(anova_compuestos, c("A", "B"), ordered = T, conf.level = 0.95)

# plot(IC_tukey) # Rápido pero feo

tidy(IC_tukey) %>%
  mutate(contiene_al_cero = conf.low <= 0 & conf.high >= 0) %>%
  ggplot() +
  facet_grid(term ~ ., scales = "free") +
  aes(y = comparison, color = contiene_al_cero) +
  geom_point(aes(x = estimate)) +
  geom_segment(aes(x = conf.low, xend = conf.high, yend = comparison)) +
  geom_vline(xintercept = 0, color = "Tomato", linetype = "dotted") +
  scale_color_manual(values = c("ForestGreen", "Red")) +
  guides(color = F) +
  labs(title = "Tukey IC para contrastes",
       subtile = "Dos factores")

6 ANCOVA (ANOVA + regression, comparación de rectas)

El ANCOVA evalúa si las medias de Y son iguales entre niveles de un factor, controlando por los efectos de una o más variables continuas que no son de interés principal (las covariables).

colesterol <- read_csv("colesterol.txt") %>%
  mutate(
    tratamiento = as.logical(tratamiento),
    tratamiento.f = factor(tratamiento)
  )

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   edad = col_integer(),
##   reducccol = col_double(),
##   tratamiento = col_integer()
## )

ggplot(colesterol) +
  aes(x = edad, y = reducccol, color = tratamiento.f) +
  geom_point() +
  geom_smooth(aes(group = tratamiento.f), method = "lm", se = F)

ggplot(colesterol) +
  geom_boxplot(aes(x = tratamiento.f, y = reducccol)) +
  coord_flip() +
  labs(title = "Diferencia de medias aparente",
       subtitle = "En la variable bajo análisis")

ggplot(colesterol) +
  geom_boxplot(aes(x = tratamiento, y = edad)) +
  coord_flip() +
  labs(title = "Diferencias entre casos y controles",
       subtitle = "Para una variable distinta de la evaluada")

Acá se testea si la diferencia entre las medias de reducccol (y) en ambas poblaciones (tratamiento vs. no tratamiento) es distinta de cero, pero sin tomar en cuenta la diferencia de edades entre los grupos tratamiento y control.

6.1 two sample t-test (= varianza, y != varianza, Welch)

El t-test compara las medias entre dos muestras (de dos subpoblaciones, casos y controles por ej.), pero no contempla variables extra. Los tests siguientes no contemplan la edad y por lo tanto son significativos erróneamente:

reduccol_tratamiento <- colesterol %>% filter(tratamiento) %>% pull(reducccol)
reduccol_control <- colesterol %>% filter(!tratamiento) %>% pull(reducccol)

t.test(reduccol_control, reduccol_tratamiento, var.equal = T)

## 
##  Two Sample t-test
## 
## data:  reduccol_control and reduccol_tratamiento
## t = 4.0679, df = 38, p-value = 0.0002307
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   5.30861 15.82639
## sample estimates:
## mean of x mean of y 
##   10.5770    0.0095

t.test(reduccol_control, reduccol_tratamiento, var.equal = F)

## 
##  Welch Two Sample t-test
## 
## data:  reduccol_control and reduccol_tratamiento
## t = 4.0679, df = 37.546, p-value = 0.0002342
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   5.30652 15.82848
## sample estimates:
## mean of x mean of y 
##   10.5770    0.0095

6.2 Modelo con interacción vs. Modelo aditivo

Este primer test nos permite descartar la hipótesis de la interacción, porque se ve que edad:tratamiento=1 no es significativo:

modelo_colesterol_ix <- lm(reducccol ~ edad * tratamiento.f, data = colesterol) # modelo con interaccion
summary(modelo_colesterol_ix)

## 
## Call:
## lm(formula = reducccol ~ edad * tratamiento.f, data = colesterol)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.6496 -3.6753  0.1382  2.6847 11.1795 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             49.2062     6.3317   7.771 3.28e-09 ***
## edad                    -0.9780     0.1583  -6.179 4.01e-07 ***
## tratamiento.fTRUE       13.8116    10.7917   1.280    0.209    
## edad:tratamiento.fTRUE  -0.0757     0.2148  -0.352    0.727    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.496 on 36 degrees of freedom
## Multiple R-squared:  0.8023, Adjusted R-squared:  0.7858 
## F-statistic:  48.7 on 3 and 36 DF,  p-value: 9.36e-13

Descartado el modelo con interacción, testeamos el modelo aditivo, que acá da bien. La edad influye mucho en reducccol y, controlando por esa variable, el tratamiento también tiene un efecto significativo!

modelo_colesterol_aditivo <- lm(reducccol ~ edad + tratamiento.f, data = colesterol) # modelo aditivo
summary(modelo_colesterol_aditivo)

## 
## Call:
## lm(formula = reducccol ~ edad + tratamiento.f, data = colesterol)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9990 -3.5873  0.3339  2.6313 10.9996 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        50.8301     4.2919  11.843 3.76e-14 ***
## edad               -1.0191     0.1057  -9.641 1.23e-11 ***
## tratamiento.fTRUE  10.1195     2.5648   3.946 0.000342 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.442 on 37 degrees of freedom
## Multiple R-squared:  0.8016, Adjusted R-squared:  0.7909 
## F-statistic: 74.76 on 2 and 37 DF,  p-value: 1.007e-13

Se puede descartar la hipótesis de interacción y testear la significatividad de edad al mismo tiempo, así:

anova(
  lm(reducccol ~ tratamiento.f, data = colesterol),
  lm(reducccol ~ tratamiento.f + edad, data = colesterol),
  lm(reducccol ~ tratamiento.f + edad + tratamiento.f * edad, data = colesterol) # es igual a `tratamiento * edad`
)

Res.Df	RSS	Df	Sum of Sq	F	Pr(>F)
38	2564.3755	NA	NA	NA	NA
37	730.1896	1	1834.185945	90.7415941	0.0000000
36	727.6784	1	2.511214	0.1242358	0.7265403

7 Análisis de los residuos

7.1 Buscar “estructura” en los residuos con un plot

housing <-
  readxl::read_xls("Housing.xls") %>%
  rowid_to_column("obs_id")

modelo_housing <- lm(STARTS ~ POP, data = housing)

ggplot(data = augment(modelo_housing, data = housing)) +
  aes(x = obs_id, y = .std.resid) +
  geom_hline(yintercept = 0, color = "Gray") +
  geom_point() +
  geom_smooth(se = F) +
  geom_line(aes(group = 1)) +
  labs(title = "Residuos estandarizados en el tiempo",
       y = "Residuos estandarizados",
       x = "Nro de observación")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

7.2 Residuos estandarizados vs. estudentizados

MASS::studres te da los residuos (externamente) estudentizados con el modelo:

augment(modelo_housing) %>%
  mutate(.stud.resid = MASS::studres(modelo_housing)) %>%
  ggplot() +
  aes(x = .std.resid, y = .stud.resid) +
  geom_segment(aes(xend = .std.resid, yend = .std.resid), color = "Red") +
  geom_abline(color = "Gray") +
  geom_point() +
  labs(title = "Residuos estandarizados vs. estudentizados")

7.3 Estadístico d de Durbin-Watson

\[ H_{0}: ρ = 0 \quad \text{ vs. } \quad H_{1}: ρ \neq 0 \\ \text{ Estadistico d de DW: } \\ d = \frac{ \sum_{i=2}^{n} (e_{i} - e_{i-1})^2 }{ \sum_{i=1}^{n} e_{i}^2 } \\ d \approx 2(1 - \hat{\rho}) \]

\[ d = 4 \implies \hat{\rho} = -1 \\ d = 0 \implies \hat{\rho} = 1 \\ d = 2 \implies \hat{\rho} = 0 \]

# Cálculo "a mano" del estadístico d o "DW"
n <- nrow(housing)
residuos <- modelo_housing$residuals
numerador_DW <- sum(diff(residuos, lag = 1)^2)
denominador_DW <- sum(residuos^2)
estadistico_DW <- numerador_DW / denominador_DW
estadistico_DW

## [1] 0.6208403

Comparar el valor de d con la regla de arriba.

7.4 Test Durbin-Watson para errores correlacionados

lmtest::dwtest toma el modelo y nos da el p-valor para el test de si la correlación entre errores ($\rho$) es diferente que cero (o mayor que cero, se puede testear también). Un resultado significativo quiere decir que HAY correlación entre los errores.

Acá se ve que un R^2 alto no implica que no haya otros problemas, como el de correlación de los errores.

paste0("R^2 del modelo housing: ", summary(modelo_housing)$r.squared %>% round(2))

## [1] "R^2 del modelo housing: 0.93"

dw_test_housing <- lmtest::dwtest(modelo_housing, alternative = "two.sided",
                                  data = housing)
dw_test_housing

## 
##  Durbin-Watson test
## 
## data:  modelo_housing
## DW = 0.62084, p-value = 1.329e-05
## alternative hypothesis: true autocorrelation is not 0

7.5 Estimar el rho de los errores

Se quiere estimar cuán correlacionado está cada error con el error anterior.

\[ \hat{\rho} = \frac{ \sum^{n}_{i=2} e_{i} e_{i-1} }{ \sum^{n}_{i=2} e^{2}_{i} } \]

estimar_rho_sombrero <- function(residuos) {
  n <- length(residuos)
  numerador <- sum(head(residuos, n-1) * tail(residuos, n-1))
  denominador <- sum(residuos^2)
  return (numerador/denominador)
}

rho_estimado <- estimar_rho_sombrero(modelo_housing$residuals)

7.6 “Descorrelacionar” las variables usando rho y regresar

\[ \text{Modelo original: } Y_{t} = \alpha + \beta X_{t} + \varepsilon_{t} \\ \varepsilon_{t} = \rho \varepsilon_{t-1} + u_{t} \\ \text{Transformacion: } Y_{t} - \rho Y_{t-1} \\ ... \\ Y^{*}_{t} = Y_{t} - \hat{\rho} Y_{t-1} \\ X^{*}_{t} = X_{t} - \hat{\rho} X_{t-1} \\ \text{Modelo nuevo: } Y^{*}_{t} = \alpha^{*} + \beta^{*} X_{t}^{*} + u_{t} \\ u_{t} \sim \mathcal{N}(0, \sigma^2) \\ \alpha^{*} = \alpha (1 - \rho) \\ \beta^{*} = \beta \]

moneda <- readxl::read_xls("Moneda.xls")
modelo_moneda <- lm(gasto ~ stock, data = moneda)

lmtest::dwtest(modelo_moneda)

## 
##  Durbin-Watson test
## 
## data:  modelo_moneda
## DW = 0.32821, p-value = 2.303e-08
## alternative hypothesis: true autocorrelation is greater than 0

rho_estimado <- estimar_rho_sombrero(modelo_moneda$residuals)
moneda %<>% mutate(
  `gasto*` = gasto - rho_estimado * lag(gasto),
  `stock*` = stock - rho_estimado * lag(stock)
)

modelo_moneda_descorrelacionado <- lm(`gasto*` ~ `stock*`, data = moneda)
lmtest::dwtest(modelo_moneda_descorrelacionado)

## 
##  Durbin-Watson test
## 
## data:  modelo_moneda_descorrelacionado
## DW = 1.426, p-value = 0.05786
## alternative hypothesis: true autocorrelation is greater than 0

alfa_estrella <- modelo_moneda_descorrelacionado$coefficients[[1]]
beta_estrella <- modelo_moneda_descorrelacionado$coefficients[[2]]

alfa_original <- alfa_estrella/(1 - rho_estimado)
beta_original <- beta_estrella # Son el mismo

bind_rows(
  tidy(modelo_moneda$coefficients) %>% mutate(modelo = "original"),
  tidy(modelo_moneda_descorrelacionado$coefficients) %>% mutate(modelo = "transformado*"),
  tibble(names = c("(Intercept)", "gasto*"),
         x = c(alfa_original, beta_original),
         modelo = rep("original_descorrelacionado", 2))
)

## Warning: 'tidy.numeric' is deprecated.
## See help("Deprecated")

## Warning: 'tidy.numeric' is deprecated.
## See help("Deprecated")

names	x	modelo
(Intercept)	-154.719162	original
stock	2.300371	original
(Intercept)	-53.695919	transformado*
`stock*`	2.643443	transformado*
(Intercept)	-215.310969	original_descorrelacionado
gasto*	2.643443	original_descorrelacionado

7.7 Test de Shapiro-Wilk para testear normalidad de los errores

Si da muy significativo, es pista de que los errores NO tienen distribución normal! (H0 es que los errores son normales)

# shapiro.test(modelo_moneda$residuals)

# Mejor usar los residuos estandarizados:
shapiro.test(augment(modelo_moneda)$.std.resid)

## 
##  Shapiro-Wilk normality test
## 
## data:  augment(modelo_moneda)$.std.resid
## W = 0.92539, p-value = 0.1259

7.8 QQ-plot de residuos estandarizados para testear normalidad de los errores

ggplot(data = augment(modelo_moneda, data = moneda)) +
  aes(sample = .std.resid) +
  geom_qq(color = "Red") +
  geom_qq_line(color = "SeaGreen") +
  labs(title = "Comparación de cuantiles",
       x = "Cuantiles teóricos\n(distribución normal)",
       y = "Residuos estandarizados\n(distribución empírica)")

7.9 Box-plot de los residuos para ubicar outliers

salud <-
  read_delim("salud.txt", delim = " ") %>%
  rename(peso = X1, pulso = X2, fuerza = X3, t_cuarto_milla = X4, t_una_milla = Y)

## Parsed with column specification:
## cols(
##   obs = col_integer(),
##   X1 = col_integer(),
##   X2 = col_integer(),
##   X3 = col_integer(),
##   X4 = col_integer(),
##   Y = col_integer()
## )

modelo_salud <- lm(t_una_milla ~ peso + pulso + fuerza + t_cuarto_milla, data = salud) 

salud <- augment(modelo_salud, data = salud)

ggplot(data = salud) +
  aes(y = .std.resid, x = "") +
  geom_boxplot() +
  geom_rug(sides = "t", color = "Red") +
  geom_point() +
  geom_label_repel(
    data = top_n(salud, 10, abs(.std.resid)),
    mapping = aes(label = obs),
    nudge_x = .3
  ) +
  coord_flip() +
  labs(title = "Residuos estandarizados")

7.10 Box-plot de los leverages de cada observación

\[ \bigg( X (X^T X)^{-1} X^T \bigg)_{ii} = \bigg( P_{\mathcal{C}(X)} \bigg)_{ii} = x_{i}^T (X^T X)^{-1} x_{i} = p_{ii} \text{ : leverage de la observación i} \]

p <- modelo_salud$rank
n <- length(salud)
pii_umbral <- 2 * p / n

ggplot(data = salud) +
  aes(x = "", y = .hat) +
  geom_boxplot(width = .5) +
  geom_rug(sides = "b", color = "Red") +
  geom_label_repel(mapping = aes(label = (obs)), data = top_n(salud, 10, .hat), nudge_x = .3) +
  geom_hline(yintercept = pii_umbral, linetype = "dashed", color = "Red") +
  geom_point() +
  coord_flip() +
  labs(title = "Leverage de las observaciones",
       subtitle = "Y umbral de 2p/n")

7.11 Plot L-R (Leverage vs. Residuos)

\[ a_{i} = \frac{ e_{i} }{ \sqrt{e^T e}} \\ \text{Graficar: } a_{i}^2 \text{ vs. } p_{ii} \]

norma_del_vector_residuos <- sqrt(sum(modelo_salud$residuals^2))

salud %>%
  mutate(a = .resid / norma_del_vector_residuos) %>%
  ggplot() +
  aes(x = a^2, y = .hat, color = .cooksd, size = .cooksd, alpha = .cooksd) +
  geom_label(aes(label = obs), alpha = .5) +
  labs(title = "Plot L-R",
       subtitle = "Leverage vs. Residuos") +
  scale_color_gradient(low = "Gray", high = "Red") +
  guides(size = F)

Analizar casos diferentes: - Obs. 28: residuo grande, leverage alto, Cooks D máximo. Es una observación problematica! - Obs. 23: leverage alto, pero residuo chico. Es probable que el ajuste por LS se esté adaptando a esta observación de alto leverage, distorsionando los betas. Problemático! - Obs. 30: leverage bajo, residuo alto: parece una observación que el modelo “entiende” como lejana a la media, sin que sea un outlier.

7.12 Encontrar observaciones atípicas

\[ p_{ii} > 2 \frac{p}{n} \rightarrow \text{Problema} \\ \text{Distancia de Cook} = D_{i} = r_{i} \frac{h_{i}}{1 - h_{i}} \ge \mathcal{F}^{0.5}_{p, n-p} \rightarrow \text{Problema} \\ \]

p <- modelo_salud$rank
n <- length(salud)

pii_umbral <- 2 * p / n
Di_umbral <- qf(p = 0.5, df1 = p, df2 = n - p)

ggplot(data = salud) +
  geom_point(aes(x = .hat, y = .cooksd)) +
  gghighlight(percent_rank(.cooksd) > .75, label_key = obs) +
  geom_hline(yintercept = Di_umbral, color = "Tomato") +
  geom_vline(xintercept = pii_umbral, color = "Tomato") +
  labs(title = "Observaciones atípicas",
       subtitle = "En rojo los 'umbrales' de Di y Pii problemáticos",
       x = "Leverage", y = "Distancia de Cook")

ggplot(salud) +
  aes(x = "", y = .cooksd) +
  geom_boxplot() +
  geom_hline(yintercept = Di_umbral, color = "tomato") +
  coord_flip() +
  labs(title = "Distancia de Cook",
       subtitle = "Y umbral problemático = F(0.5, p, n-p)")

7.13 Detectar heterocedasticidad en distintos grupos de Yi

gorgojo <-
  readxl::read_xls("Gorgojo.xls") %>%
  mutate(x = factor(x))

ggpairs(gorgojo, progress = F)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

gorgojo %>% group_by(x) %>% summarise(n(), var(y), sd(y))

x	n()	var(y)	sd(y)
3	6	29.86667	5.465040
6	6	102.96667	10.147249
9	6	10.96667	3.311596
12	6	17.86667	4.226898
21	6	14.80000	3.847077

En la densidad empírica Y no parece normal. Además, parece haber varianzas diferentes entre niveles del factor.

7.14 Transformaciones Box-Cox para corregir heterocedasticidad y/o no linealidad

Las transformaciones Box-Cox se usan para “orregir varianzas desiguales y principalmente para corregir la no linealidad”.

\[ Y^{(\lambda)} = \left \{ \begin{array}{ll} \frac{Y^{\lambda} - 1}{\lambda} \quad \text{si } \lambda \neq 0 \\ \mathrm{ln}(Y) \quad \text{si } \lambda = 0 \end{array} \right. \]

7.14.1 Encontrar el lambda para Box-Cox

# No importar MASS porque pisa select :(

modelo_gorgojo <- lm(y ~ x, data = gorgojo)
MASS::boxcox(modelo_gorgojo)

lambdas <- EnvStats::boxcox(modelo_gorgojo)

lambda_probs <- tibble(
  lambda = lambdas$lambda,
  prob = lambdas$objective
)

ggplot(data = lambda_probs) +
  aes(x = lambda, y = prob) +
  geom_point() +
  geom_line() +
  labs(title = "Lambda óptimo para Box-Cox",
       subtitle = "Según paquete EnvStats") +
  geom_label_repel(data = top_n(lambda_probs, 3, prob),
                   mapping = aes(label = lambda),
                   nudge_y = -.025)

gorgojo %<>% mutate(
  `Y(lambda=-1)` = (y^(-1) - 1)/(-1),
  `Y(lambda=-0.5)` = (y^(0.5) - 1)/0.5,
  `Y(lambda=0)` = log(y)
)

tidy_gorgojo <- gorgojo %>% gather("Y_name", "Y_value", -x)

ggplot(tidy_gorgojo) +
  facet_grid(Y_name ~ ., scales = "free") +
  aes(x = x, y = Y_value) +
  geom_boxplot() +
  labs(title = "Transformaciones Box-Cox")

modelo <- lm(y ~ x, data = gorgojo)
modelo_lambda_0 <- lm(log(y) ~ x, data = gorgojo)
modelo_lambda_minus05 <- lm(`Y(lambda=-0.5)` ~ x, data = gorgojo)

gorgojo_comparacion <- tibble(
  .std.resid = c(rstandard(modelo),
                 rstandard(modelo_lambda_0),
                 rstandard(modelo_lambda_minus05)),
  .fitted = c(modelo$fitted.values,
              modelo_lambda_0$fitted.values,
              modelo_lambda_minus05$fitted.values),
  obs = rep(1:nrow(gorgojo), times = 3),
  modelo = rep(c("original", "log_y", "lambda = -0.5"), each = nrow(gorgojo))
)

ggplot(gorgojo_comparacion) +
  aes(x = obs, y = .std.resid, fill = modelo, color = modelo) +
  geom_hline(yintercept = 0, color = "Gray") +
  geom_smooth(se = F, size = .5) +
  labs(title = "Residuos Y vs. Y(Box-Cox)",
       subtitle = "Lambda = -0.5 parece mejorar algo el asunto") +
  scale_color_manual(values = c("Blue", "Red", "ForestGreen"))

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

8 Ridge regression & Lasso

8.1 Un dataset de alta colinealidad

gdp <- read_csv("rdpercentGDP.csv") %>% rename(obs_id = X1)

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   year = col_integer(),
##   argen = col_double(),
##   germany = col_double(),
##   china = col_double(),
##   japan = col_double(),
##   france = col_double(),
##   usa = col_double(),
##   uk = col_double(),
##   nld = col_double(),
##   finland = col_double(),
##   netherlands = col_double()
## )

cor(select(gdp, -obs_id, -year)) %>%
  as_tibble() %>%
  mutate(country_1 = names(.)) %>%
  gather("country_2", "corr", -country_1) %>%
  ggplot() +
  aes(y = country_1, x = country_2, fill = corr) +
  geom_tile(width = .95) +
  scale_fill_gradient2(low = "Blue", mid = "White", high = "Red") +
  geom_text(aes(label = round(corr, 2)))

8.2 Autovalores y número de condición para detectar colinearidad

modelo_gdp <- lm(usa ~ .,data = select(gdp, -obs_id))
X_ <- model.matrix(modelo_gdp)
autovalores <- eigen(t(X_) %*% X_)$values
nro_de_condicion <- max(autovalores)/min(autovalores)

tibble(autovalor = autovalores) %>%
  rowid_to_column("rank") %>%
  ggplot() +
  aes(x = rank, y = autovalor) +
  geom_point() +
  geom_line() +
  labs(title = glue("Autovalores de la matriz (Xt * X)"),
       subtitle = glue("Número de condición: {round(nro_de_condicion)}"))

8.3 Ajuste ridge (alpha = 0)

glmnet toma la matriz de diseño y la variable dependiente por separado. Graficamos con log(lambda) porque con lambda es ilegible.

library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:tidyr':
## 
##     expand

## Loading required package: foreach

## 
## Attaching package: 'foreach'

## The following objects are masked from 'package:purrr':
## 
##     accumulate, when

## Loaded glmnet 2.0-18

X_gdp <- model.matrix(modelo_gdp)

# alpha = 0 es ajuste ridge, alpha = 1 es ajuste lasso
ajuste_ridge <- glmnet(X_gdp, gdp$usa, alpha = 0) %>% tidy()

ajuste_ridge %>%
  filter(term != "(Intercept)") %>%
  ggplot() +
  aes(x = log(lambda), y = estimate, group = term, color = term) +
  geom_line() +
  labs(title = "Lambdas de Ridge y sus betas estimados",
       color = "Covariable",
       y = "Coeficientes")

Lambda = 0 implica no penalizar la norma de $\beta$, de modo que se recupera el $\beta_{\text{OLS}}$, esimador de mínimos cuadrados:

ajuste_ridge_lambda_0 <- glmnet(X_gdp, gdp$usa, lambda = 0)

gdp %>% select(-obs_id, -year) %>% lm(usa ~ ., data = .)

## 
## Call:
## lm(formula = usa ~ ., data = .)
## 
## Coefficients:
## (Intercept)        argen      germany        china        japan  
##    0.400959     0.291570     0.653356    -0.204354     0.102843  
##      france           uk          nld      finland  netherlands  
##   -0.001821     0.746867    -0.396645    -0.032093           NA

ajuste_ols <- lm(usa ~ ., data = select(gdp, -obs_id))
ajuste_ridge_lambda_0 <- glmnet(model.matrix(ajuste_ols), gdp$usa, lambda = 0)

ajuste_ols$coefficients

##   (Intercept)          year         argen       germany         china 
## -2.889575e+01  1.470147e-02  3.174547e-01  5.669843e-01 -3.562079e-01 
##         japan        france            uk           nld       finland 
##  8.365393e-02  8.698457e-02  6.917669e-01 -3.546900e-01 -6.828865e-04 
##   netherlands 
##            NA

coef.glmnet(ajuste_ridge_lambda_0)[, 1]

##   (Intercept)   (Intercept)          year         argen       germany 
## -35.178501898   0.000000000   0.017856095   0.325324659   0.544470286 
##         china         japan        france            uk           nld 
##  -0.387198046   0.078691566   0.104895992   0.682955470  -0.353677131 
##       finland   netherlands 
##   0.006381637   0.009237243

8.4 Elegir el lambda para Ridge por CV

Se puede elegir el lambda para Ridge con dos métodos:

El lambda que minimiza el error de predicción.
El lambda que minimiza el error “con el criterio de 1SE”

ridge_crossval <- cv.glmnet(X_gdp, gdp$usa, nfolds = 5, alpha = 0)

glance(ridge_crossval)

lambda.min	lambda.1se
0.0099478	0.0639454

plot(ridge_crossval)

8.5 Ajuste Ridge usando el lambda elegido

ridge_ols <- glmnet(X_gdp, gdp$usa, alpha = 0, lambda = 0)
ridge_1se <- glmnet(X_gdp, gdp$usa, alpha = 0, lambda = ridge_crossval$lambda.1se)
ridge_min <- glmnet(X_gdp, gdp$usa, alpha = 0, lambda = ridge_crossval$lambda.min)

ridge_coefficients <- bind_rows(
  tidy(ridge_ols) %>% mutate(criterio = "OLS"),
  tidy(ridge_1se) %>% mutate(criterio = "1SE"),
  tidy(ridge_min) %>% mutate(criterio = "Min")
)

8.6 Lasso

Elegimos el lambda de la misma manera que con Ridge: por CV con uno de los dos criterios.

Como Lasso mata covariables (lleva sus betas a cero), arriba aparece el número de cuántas covariables “conserva” con cada lambda.

# Alpha = 1 implica usar lasso
lasso_crossval <- cv.glmnet(X_gdp, gdp$usa, nfolds = 5, alpha = 1)

glance(lasso_crossval)

lambda.min	lambda.1se
0.0013777	0.0055616

plot(lasso_crossval)

ajuste_lasso <-
  glmnet(X_gdp, gdp$usa, alpha = 1) %>%
  tidy() %>%
  filter(term != "(Intercept)")

ultimo_estimado <-
  ajuste_lasso %>%
  group_by(term) %>%
  top_n(1, lambda)

ajuste_lasso %>%
  ggplot() +
  aes(x = log(lambda), y = estimate, group = term, color = term) +
  geom_line() +
  geom_point(data = ultimo_estimado, size = 3) +
  geom_hline(yintercept = 0) +
  labs(title = "Evolución de los betas estimados",
       subtitle = "al cambiar lambda de Lasso",
       color = "Covariable",
       y = "Coeficientes")

lasso_1se <- glmnet(X_gdp, gdp$usa, alpha = 1, lambda = lasso_crossval$lambda.1se)
lasso_min <- glmnet(X_gdp, gdp$usa, alpha = 1, lambda = lasso_crossval$lambda.min)
lasso_ols <- glmnet(X_gdp, gdp$usa, alpha = 1, lambda = 0)

lasso_coefficients <- bind_rows( 
  tidy(lasso_ols) %>% mutate(criterio = "OLS"),
  tidy(lasso_1se) %>% mutate(criterio = "1SE"),
  tidy(lasso_min) %>% mutate(criterio = "Min")
)

bind_rows(
    ridge_coefficients %>% mutate(metodo = "Ridge"),
    lasso_coefficients %>% mutate(metodo = "Lasso")
  ) %>%
  mutate(
    is_ols = ifelse(lambda == 0, "OLS", "Con penalización"),
    term = factor(term),
    lambda = paste0(criterio, " Lambda = ", round(lambda, 3))
  ) %>%
  filter(term != "(Intercept)") %>%
  ggplot() +
  facet_wrap(metodo ~ lambda, ncol = 3, scales = "free_y") +
  aes(x = term, y = estimate, color = metodo, shape = is_ols) +
  geom_point(size = 2.5) +
  labs(title = "Coeficientes por Ridge y Lasso",
       subtitle = "Lasso 'mata' muchas covariables. Ridge no.\nAmbos pueden recuperar OLS con lambda = 0.",
       y = "Beta estimado", x = "Covariable",
       shape = "OLS vs. Penalización") +
  geom_hline(yintercept = 0) +
  coord_flip() +
  scale_shape_manual(values = c(16, 8)) +
  theme(legend.position = "top", legend.justification = "left", legend.direction = "vertical")

9 Ajuste Robusto para identificar obs atípicas

library(RobStatTM)

## Loading required package: fit.models

data(wood, package='robustbase') # Carga el dataset `wood`

# lmrobdet.control() setea parámetros para el ajuste. Confiamos en los defaults.
wood_LS <- lm(y ~ ., data = wood)
wood_MM <- lmrobdetMM(y ~ ., data = wood, control = lmrobdet.control())

X_wood <- model.matrix(wood_MM)
P_ <- X_wood %*% solve(t(X_wood) %*% X_wood) %*% t(X_wood)

wood_LS_augmented <-
  augment(wood_LS, data = wood) %>%
  rowid_to_column("i")

wood_MM_augmented <-
  wood %>%
  mutate(.fitted = model.matrix(wood_MM) %*% wood_MM$coefficients,
         .resid = y - .fitted,
         i = 1:nrow(wood),
         .hat = map_dbl(i, ~ P_[[.x, .x]]),
         .std.resid = .resid/(sigma(wood_MM)*sqrt(1 - .hat)))

comparacion_LS_MM <-
  bind_rows(
    wood_LS_augmented %>% select(i, .fitted, .std.resid) %>% mutate(metodo = "LS"),
    wood_MM_augmented %>% select(i, .fitted, .std.resid) %>% mutate(metodo = "MM")
  )

obs_atipicas <-
  comparacion_LS_MM %>%
  filter(metodo == "MM") %>%
  top_n(4, abs(.std.resid)) # Ya miré que son 4 ;)

ggplot(data = comparacion_LS_MM) +
  aes(x = .fitted, y = .std.resid, color = metodo) +
  geom_point() +
  geom_segment(aes(xend = .fitted, yend = 0)) +
  geom_hline(yintercept = 0) +
  labs(title = "Residuos estandarizados: LS vs MM") +
  geom_label_repel(data = comparacion_LS_MM %>% filter(i %in% obs_atipicas$i),
                   mapping = aes(label = i),
                   alpha = .65)

comparacion_LS_MM %>%
  select(i, metodo, .std.resid) %>%
  spread(key = "metodo", value = ".std.resid") %>%
  rename(LS_std_resid = LS, MM_std_resid = MM) %>%
  filter(!(i %in% c(4, 6, 8, 19))) %>%
  ggplot() +
  aes(x = sort(abs(LS_std_resid)), y = sort(abs(MM_std_resid))) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1) +
  xlim(0, 2.5) +
  ylim(0, 2.5) +
  labs(title = "Residuos de MM mucho menores*",
       subtitle = "*Para las observaciones no-outliers")

9.1 Comparación QQ-plot LS vs. MM

comparacion_LS_MM %>%
  ggplot() +
  facet_wrap(metodo ~ ., scales = "free") +
  aes(sample = .std.resid) +
  aes(color = i %in% obs_atipicas$i) +
  geom_qq() +
  geom_qq_line(color = "SeaGreen") +
  labs(title = "QQ-plots para LS vs MM",
       x = "Cuantiles teóricos\n(distribución normal)",
       y = "Cuantiles de los residuos estandarizados\n(distribución empírica)") +
  guides(color = F)

9.2 Ajuste WLS con los pesos del robusto

El ajuste robusto le da peso 0 a las observaciones atípicas. Usamos esos pesos para rehacer el ajuste LS:

pesos <- wood_MM$rweights
tibble(obs = names(pesos), peso = pesos) %>%
  ggplot() +
  aes(x = fct_reorder(obs, as.integer(obs)), y = peso, color = peso > 0) +
  geom_hline(yintercept = 0, color = "Gray") +
  geom_hline(yintercept = 1, color = "Gray") +
  geom_point() +
  geom_segment(aes(xend = obs, yend = 0)) +
  labs(title = "Pesos de las observaciones",
       subtitle = "Según el ajuste robusto",
       x = "Observación") +
  guides(color = F)

# Weighted Least Squares
wood_WLS <- lm(y ~ ., data = wood, weights = wood_MM$rweights)

bind_rows(
    glance(wood_LS) %>% mutate(metodo = "OLS"),
    glance(wood_WLS) %>% mutate(metodo = "WLS")
  ) %>%
  select(metodo, p.value, df.residual, adj.r.squared)

metodo	p.value	df.residual	adj.r.squared
OLS	0.0001282	14	0.7399743
WLS	0.0000023	10	0.9307745

9.3 Comparación de rectas: rlm (MASS) vs. lm

Acá pruebo el lm robusto de MASS:

llamadas <- read_delim("llamadas.txt", delim = " ")

## Parsed with column specification:
## cols(
##   obs = col_integer(),
##   year = col_integer(),
##   calls = col_double()
## )

llamadas_OLS <- lm(calls ~ year, data = llamadas)
llamadas_Rob <- MASS::rlm(calls ~ year, data = llamadas)

## Warning in rlm.default(x, y, weights, method = method, wt.method =
## wt.method, : 'rlm' failed to converge in 20 steps

comparacion_OLS_Rob <- bind_rows(
    augment(llamadas_OLS) %>% mutate(metodo = "OLS"),
    augment(llamadas_Rob) %>% mutate(metodo = "robusto")
  ) %>%
  mutate(weight = rep(llamadas_Rob$w, times = 2))

comparacion_OLS_Rob %>%
  ggplot() +
  aes(x = year, y = calls, color = metodo) +
  geom_point() +
  geom_line(aes(y = .fitted, group = metodo)) +
  geom_label_repel(
    data = filter(comparacion_OLS_Rob, weight < 1 & metodo == "robusto"),
    mapping = aes(label = round(weight, 2))
  ) +
  labs(title = "MASS::rlm vs lm",
       subtitle = "Se detallan los pesos < 1 dados por rlm a las observaciones")

10 Selección de variables, R^2 adj, Cp de Mallows

La idea es testear algunos (o todos los) subconjuntos de covariables y quedarse con el que tenga mejor R^2 ajustado y el menor número de variables, entre aquellos donde el Cp de Mallows no es muy diferente de p. El método para elegir los subconjuntos de covariables puede cambiar: stepwise, back, forw, o ASR.

10.1 Forward

biomasa <- readxl::read_excel("Biomasa.xls")
forw <- leaps::regsubsets(BIO ~ K + SODIO + PH + SAL + ZN, data = biomasa, method = "forward")
forw_result <- summary(forw)
forw_result

## Subset selection object
## Call: regsubsets.formula(BIO ~ K + SODIO + PH + SAL + ZN, data = biomasa, 
##     method = "forward")
## 5 Variables  (and intercept)
##       Forced in Forced out
## K         FALSE      FALSE
## SODIO     FALSE      FALSE
## PH        FALSE      FALSE
## SAL       FALSE      FALSE
## ZN        FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: forward
##          K   SODIO PH  SAL ZN 
## 1  ( 1 ) " " " "   "*" " " " "
## 2  ( 1 ) " " "*"   "*" " " " "
## 3  ( 1 ) " " "*"   "*" " " "*"
## 4  ( 1 ) " " "*"   "*" "*" "*"
## 5  ( 1 ) "*" "*"   "*" "*" "*"

forward_selection <- tibble(
  rss = forw_result$rss,
  r_squared = forw_result$rsq,
  r_squared_adj = forw_result$adjr2,
  Cp = forw_result$cp,
  p = 1:length(Cp)
)

ggplot(forward_selection) +
  aes(x = p, y = Cp) +
  geom_point() +
  geom_line() +
  geom_abline(intercept = 0, slope = 1) +
  labs(title = "Cp de Mallows")

forward_selection %>% 
  select(p, r_squared, r_squared_adj, rss) %>%
  gather("name", "r2_value", -p, -rss) %>%
  ggplot() +
  aes(x = p, y = r2_value, color = name, group = name) +
  geom_point() +
  geom_line() +
  theme(legend.position = "top") +
  labs(title = "R2 y R2 ajustado eligiendo variables",
       subtitle = "Método: forward selection",
       color = "", y = "")

10.2 ASR: All subsets regression

En este caso, buscamos el Cp y R^2 de todos los subconjuntos de covariables posibles!

ASR_call <- leaps::leaps(x = select(biomasa, K, SODIO, PH, SAL, ZN), y = biomasa$BIO, method = "Cp")

ASR_result <-
  tibble(
    model_size = ASR_call$size,
    cp = ASR_call$Cp
  ) %>%
  rowid_to_column("i") %>%
  mutate(
    which = map(i, ~ ASR_call$which[.x, ]),
    names = map(which, ~ c("K", "SODIO", "PH", "SAL", "ZN")[.x]),
    names_peek = map_chr(names, ~ str_c(.x, collapse = ", "))
  )

lm_biomasa <- function(.names) {
  .formula <- paste0("BIO ~ ", paste0(.names, collapse = " + "))
  return (lm(.formula, data = biomasa))
}

ASR_result %<>%
  mutate(modelo = map(names, lm_biomasa),
         summary = map(modelo, summary),
         r_squared_adj = map_dbl(summary, ~ .x$adj.r.squared),
         p_value = map_dbl(modelo, ~ glance(.x)[[1, "p.value"]]))

mejores_subconjuntos <-
  ASR_result %>%
  group_by(model_size) %>%
  filter(p_value <= 0.05) %>% # Que sea significativo
  filter(abs(cp - p) < 2) %>% # Que el Cp esté cerca de p
  top_n(1, r_squared_adj) %>% # El mejor ajuste para cada p
  mutate(model_name = paste0("R2.adj = ", round(r_squared_adj, 3), " # ", names_peek))

ggplot(mejores_subconjuntos) +
  aes(x = model_size, y = cp, color = model_name) +
  geom_point(size = 3) +
  geom_abline() +
  theme(legend.position = "top", legend.direction = "vertical", legend.justification = "left") +
  geom_text_repel(aes(label = round(r_squared_adj, 3)), nudge_y = -1.5) +
  xlim(2, 7) + ylim(2, 7.5) +
  geom_label_repel(aes(label = names_peek), nudge_y = 1) +
  labs(title = "Los mejores modelos en ASR", color = NULL, x = "p", y = "Cp",
       subtitle = "Significativos, Cp =~ p, top R2.adj")

# Si consideramos TODOS los subconjuntos de variables posibles, hay algunos con Cp muy diferente de p (malo!)

tibble(p = ASR_result$model_size, Cp = ASR_result$cp) %>%
  ggplot() +
  aes(x = p, y = Cp) +
  geom_point() +
  geom_abline()

10.3 Backward & Stepwise

back <- leaps::regsubsets(BIO ~ K + SODIO + PH + SAL + ZN, data = biomasa, method = "backward")
stepwise <- leaps::regsubsets(BIO ~ K + SODIO + PH + SAL + ZN, data = biomasa, method = "seqrep")

Modelo Lineal - Machete R

JM Berros

June 29, 2019