library(tidyverse)
library(magrittr) # %<>%
library(glue) # glue
library(broom) # augment, glance, tidy
library(skimr) # skim
library(GGally) # ggpairs
library(gghighlight) # gghighlight
library(ggrepel) # geom_label_repel
# library(tidylog) # logs little messages when you use dplyr
library(ggridges) # geom_density_ridges: I like comparing densities more than boxplots
fn <- "low birth weight infants.txt"
bebes <- read_table(fn, col_types = cols(toxemia = col_factor(NULL)))
glimpse(bebes)
## Observations: 100
## Variables: 6
## $ headcirc <int> 27, 29, 30, 28, 29, 23, 22, 26, 27, 25, 23, 26, 27, 27,…
## $ length <int> 41, 40, 38, 38, 38, 32, 33, 38, 30, 34, 32, 39, 38, 39,…
## $ gestage <int> 29, 31, 33, 31, 30, 25, 27, 29, 28, 29, 26, 30, 29, 29,…
## $ birthwt <int> 1360, 1490, 1490, 1180, 1200, 680, 620, 1060, 1320, 830…
## $ momage <int> 37, 34, 32, 37, 29, 19, 20, 25, 27, 32, 26, 29, 24, 26,…
## $ toxemia <fct> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1…
# skim(bebes)
bebes %>% select_if(is.numeric) %>% cor() %>% round(3)
## headcirc length gestage birthwt momage
## headcirc 1.000 0.713 0.781 0.799 0.132
## length 0.713 1.000 0.675 0.816 0.218
## gestage 0.781 0.675 1.000 0.660 0.266
## birthwt 0.799 0.816 0.660 1.000 0.155
## momage 0.132 0.218 0.266 0.155 1.000
ggpairs(bebes, progress = F)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
$$ H_{0}: \beta_{j} = 0 \;\; \forall j \qquad F = \frac{(\mathrm{RSS}_{H_{0}} - \mathrm{RSS})/(p-1)}{\mathrm{RSS}/(n-p)} \sim \mathcal{F}_{p-1,\,n-p} \;\; (\text{under } H_{0}) $$
The \(F\) is the test for the regression as a whole. The \(t\)'s are individual tests of whether each variable, when added on its own, significantly reduces the RSS.
modelo <- lm(headcirc ~ ., data = bebes) # the . uses all the variables
summary(modelo)
##
## Call:
## lm(formula = headcirc ~ ., data = bebes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0190 -0.6712 -0.0364 0.3334 8.0421
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.2097216 2.1285705 3.387 0.00103 **
## length 0.0082711 0.0653434 0.127 0.89954
## gestage 0.5261922 0.0835553 6.298 9.62e-09 ***
## birthwt 0.0042555 0.0008867 4.799 5.99e-06 ***
## momage -0.0300651 0.0222312 -1.352 0.17950
## toxemia1 -0.5160581 0.3696445 -1.396 0.16597
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.269 on 94 degrees of freedom
## Multiple R-squared: 0.7615, Adjusted R-squared: 0.7488
## F-statistic: 60.03 on 5 and 94 DF, p-value: < 2.2e-16
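As a sanity check (a minimal sketch; `modelo_nulo`, `RSS_0` and friends are names introduced here), the global F can be computed by hand from the RSS of the intercept-only model, following the formula above:
# Global F test "by hand": intercept-only model vs. the full model.
# Should reproduce the F = 60.03 on 5 and 94 df reported by summary() above.
modelo_nulo <- lm(headcirc ~ 1, data = bebes)
RSS_0 <- sum(residuals(modelo_nulo)^2)
RSS_full <- sum(residuals(modelo)^2)
n <- nrow(bebes)
p <- modelo$rank
F_global <- ((RSS_0 - RSS_full) / (p - 1)) / (RSS_full / (n - p))
c(F = F_global, p_value = pf(F_global, p - 1, n - p, lower.tail = FALSE))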
modelo_cars <- lm(dist ~ speed, data = cars)
ggplot(data = augment(modelo_cars)) +
aes(x = speed) +
geom_segment(aes(xend = speed, y = .fitted, yend = dist), color = "Tomato", alpha = .5) +
geom_point(mapping = aes(y = dist)) +
# geom_point(mapping = aes(y = .fitted), shape = 4, color = "Red") +
geom_abline(intercept = modelo_cars$coefficients[[1]],
slope = modelo_cars$coefficients[[2]],
color = "Tomato") +
labs(title = "Regresión lineal con una sola covariable")
This compares two models: one with every available variable, another with a subset:
modelo_1 <- lm(headcirc ~ gestage + birthwt + momage + length + toxemia, data = bebes)
modelo_2 <- lm(headcirc ~ gestage + birthwt, data = bebes)
comparacion <- anova(modelo_1, modelo_2)
comparacion
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
94 | 151.3779 | NA | NA | NA | NA |
97 | 157.4231 | -3 | -6.045176 | 1.251276 | 0.2957051 |
# This is the same as doing:
RSS_H <- sum(modelo_2$residuals^2)
RSS <- sum(modelo_1$residuals^2)
n <- length(modelo_1$residuals)
p <- modelo_1$rank
q <- modelo_1$rank - modelo_2$rank
F_obs <- ( (RSS_H - RSS)/RSS ) * ( (n - p)/q )
p_value <- pf(F_obs, df1 = q, df2 = n - p, lower.tail = F)
glue("q = {q}, n-p = {n-p}, F obs = {round(F_obs, 4)}, p-value = {round(p_value, 4)}")
## q = 3, n-p = 94, F obs = 1.2513, p-value = 0.2957
\[ \hat{\mathrm{Var}} [\hat{\beta}] = s^2 (X^T X)^{-1} \]
X <- model.matrix(modelo_1)
Xt_X_inv <- solve(t(X) %*% X)
var_betas <- sigma(modelo_1)^2 * Xt_X_inv
var_betas
## (Intercept) gestage birthwt momage
## (Intercept) 4.530812275 -1.102993e-01 1.228657e-03 -1.746837e-03
## gestage -0.110299335 6.981485e-03 -2.735342e-05 -2.958101e-04
## birthwt 0.001228657 -2.735342e-05 7.862980e-07 1.683570e-06
## momage -0.001746837 -2.958101e-04 1.683570e-06 4.942264e-04
## length -0.073271626 -1.352937e-03 -3.718980e-05 -1.432177e-04
## toxemia1 0.323957829 -1.581191e-02 9.620515e-05 5.322625e-05
## length toxemia1
## (Intercept) -7.327163e-02 3.239578e-01
## gestage -1.352937e-03 -1.581191e-02
## birthwt -3.718980e-05 9.620515e-05
## momage -1.432177e-04 5.322625e-05
## length 4.269765e-03 -8.247217e-05
## toxemia1 -8.247217e-05 1.366371e-01
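As a quick sanity check (a one-liner sketch), this hand-computed matrix should match R's built-in covariance matrix for the fit:
# Should be TRUE: vcov() returns s^2 (X^T X)^{-1} for an lm fit
all.equal(var_betas, vcov(modelo_1))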
Centering doesn't change the estimated coefficients. Except for \(\beta_{0}\), obviously.
modelo_cars <- lm(dist ~ speed, data = cars)
modelo_cars_centrado <- lm(dist ~ scale(speed, center = T, scale = F), data = cars)
bind_rows(
enframe(modelo_cars$coefficients, name = "names", value = "x") %>% mutate(modelo = "sin centrar"),
enframe(modelo_cars_centrado$coefficients, name = "names", value = "x") %>% mutate(modelo = "centrado")
)
names | x | modelo |
---|---|---|
(Intercept) | -17.579095 | sin centrar |
speed | 3.932409 | sin centrar |
(Intercept) | 42.980000 | centrado |
scale(speed, center = T, scale = F) | 3.932409 | centrado |
A generalization of the t-test, which only compares the means of two populations.
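A quick sketch with simulated data (the tibble `d` is made up here) showing the equivalence in the two-group case: with a single two-level factor, the one-way ANOVA F test and the pooled-variance t-test agree exactly (F = t^2, same p-value).
set.seed(1)
d <- tibble(g = factor(rep(c("a", "b"), each = 10)), y = rnorm(20))
anova(lm(y ~ g, data = d))$`Pr(>F)`[1]
t.test(y ~ g, data = d, var.equal = TRUE)$p.value # identical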
algodon <-
tribble(
~p, ~"1", ~"2", ~"3", ~"4", ~"5",
15, 7, 7, 15, 11, 9,
20, 12, 17, 12, 18, 18,
25, 14, 18, 18, 19, 19,
30, 19, 25, 22, 19, 23,
35, 7, 10, 11, 15, 11
) %>%
gather("muestra", "resistencia", -p) %>%
select(-muestra) %>% arrange(p) %>% mutate(p = factor(p))
modelo_algodon <- lm(resistencia ~ p, data = algodon)
First we want to visualize the difference in means across the levels of the factor.
ggplot(data = algodon) +
aes(x = p, y = resistencia) +
geom_boxplot(width = .6) +
labs(title = "Diferencia de medias entre niveles del factor",
subtitle = "Exploración pre-ANOVA")
ggplot(data = algodon) +
aes(x = resistencia, y = p) +
geom_density_ridges(scale = 1, quantile_lines = T, quantiles = 2) +
labs(title = "Diferencia de mediana y dist empírica entre niveles",
subtitle = "Exploración pre-ANOVA")
## Picking joint bandwidth of 1.33
anova
gives you the overall p-value for whether the factor explains Y or not:
anova(modelo_algodon)
Df | Sum Sq | Mean Sq | F value | Pr(>F) | |
---|---|---|---|---|---|
p | 4 | 475.76 | 118.94 | 14.75682 | 9.1e-06 |
Residuals | 20 | 161.20 | 8.06 | NA | NA |
summary(lm)
gives you the same overall p-value, plus a p-value for each dummy separately:
summary(modelo_algodon)
##
## Call:
## lm(formula = resistencia ~ p, data = algodon)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8 -2.6 0.4 1.4 5.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.800 1.270 7.719 2.02e-07 ***
## p20 5.600 1.796 3.119 0.005409 **
## p25 7.800 1.796 4.344 0.000315 ***
## p30 11.800 1.796 6.572 2.11e-06 ***
## p35 1.000 1.796 0.557 0.583753
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.839 on 20 degrees of freedom
## Multiple R-squared: 0.7469, Adjusted R-squared: 0.6963
## F-statistic: 14.76 on 4 and 20 DF, p-value: 9.128e-06
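To see that each dummy is a level-vs-baseline contrast, compare the group means: the intercept is the mean at p = 15, and e.g. the p35 estimate is mean(p35) - mean(p15) = 10.8 - 9.8 = 1.0.
# Group means: intercept = mean at the baseline level (p = 15),
# each dummy = that level's mean minus the baseline mean
algodon %>% group_by(p) %>% summarise(media = mean(resistencia))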
aov
doesn't seem to add anything new (although `TukeyHSD()` below does require an `aov` object):
aov(modelo_algodon)
## Call:
## aov(formula = modelo_algodon)
##
## Terms:
## p Residuals
## Sum of Squares 475.76 161.20
## Deg. of Freedom 4 20
##
## Residual standard error: 2.839014
## Estimated effects may be unbalanced
… See ANCOVA.Rmd from the class.
# Pairwise comparisons of means via Tukey for ANOVA
anova_algodon <- aov(resistencia ~ p, data = algodon)
intervalos_tukey <- TukeyHSD(anova_algodon, "p", ordered = T, conf.level = 0.95)
# plot(intervalos_tukey) # Quicker and uglier
tidy(intervalos_tukey) %>%
mutate(contiene_al_cero = conf.low <= 0 & conf.high >= 0) %>%
ggplot() +
aes(y = comparison, color = contiene_al_cero) +
geom_vline(xintercept = 0, color = "Red", linetype = "dotted") +
geom_point(aes(x = conf.low), shape = 91, size = 5) +
geom_point(aes(x = estimate)) +
geom_point(aes(x = conf.high), shape = 93, size = 5) +
geom_text(aes(label = comparison, x = conf.high + 2)) +
geom_segment(aes(x = conf.low, xend = conf.high, y = comparison, yend = comparison)) +
labs(title = "Intervalos de Tukey para los contrastes del ANOVA",
x = "Diferencia de medias estimada",
y = "Contraste") +
scale_color_manual(values = c("SeaGreen", "Red")) +
guides(color = F) +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank())
compuestos <- read_tsv("anova_ej_2.tsv") %>%
mutate_at(vars(A, B, REP), as.factor)
## Parsed with column specification:
## cols(
## A = col_integer(),
## B = col_integer(),
## TIEMPO = col_double(),
## REP = col_integer()
## )
ggplot(compuestos) +
aes(x = A, y = TIEMPO) +
geom_boxplot()
ggplot(compuestos) +
aes(x = B, y = TIEMPO) +
geom_boxplot()
compuestos %>%
mutate(grupoAB = paste0(A, "+", B)) %>%
ggplot() +
aes(x = grupoAB, y = TIEMPO) +
geom_boxplot() +
stat_summary(fun.y = mean, geom = "line", aes(group = A))
compuestos %>%
group_by(A, B) %>%
summarise(tiempo_medio = mean(TIEMPO)) %>%
ggplot() +
aes(x = A, y = tiempo_medio, color = B, group = B) +
geom_line() +
geom_point() +
labs(title = "Interacción entre dos factores",
subtitle = "Se cruzan las líneas: hay IX")
Check whether the F test for the interaction, A:B, is significant. In this case it is NOT (p-value 0.3444):
anova_compuestos <- aov(TIEMPO ~ A * B, data = compuestos)
summary(anova_compuestos)
## Df Sum Sq Mean Sq F value Pr(>F)
## A 2 6.660 3.330 79.92 4.74e-05 ***
## B 1 0.083 0.083 2.00 0.207
## A:B 2 0.107 0.053 1.28 0.344
## Residuals 6 0.250 0.042
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
IC_tukey <- TukeyHSD(anova_compuestos, c("A", "B"), ordered = T, conf.level = 0.95)
# plot(IC_tukey) # Quick but ugly
tidy(IC_tukey) %>%
mutate(contiene_al_cero = conf.low <= 0 & conf.high >= 0) %>%
ggplot() +
facet_grid(term ~ ., scales = "free") +
aes(y = comparison, color = contiene_al_cero) +
geom_point(aes(x = estimate)) +
geom_segment(aes(x = conf.low, xend = conf.high, yend = comparison)) +
geom_vline(xintercept = 0, color = "Tomato", linetype = "dotted") +
scale_color_manual(values = c("ForestGreen", "Red")) +
guides(color = F) +
labs(title = "Tukey IC para contrastes",
subtitle = "Dos factores")
ANCOVA tests whether the means of Y are equal across the levels of a factor, while controlling for the effects of one or more continuous variables that are not of primary interest (the covariates).
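In model form (a sketch of the additive ANCOVA fitted below, with the symbols named here for illustration):
\[ \mathrm{reducccol}_{i} = \beta_{0} + \beta_{1} \, \mathrm{edad}_{i} + \gamma \, \mathbb{1}\{\mathrm{tratamiento}_{i}\} + \varepsilon_{i} \]
The test of interest is \(H_{0}: \gamma = 0\), with \(\mathrm{edad}\) kept in the model as the covariate.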
colesterol <- read_csv("colesterol.txt") %>%
mutate(
tratamiento = as.logical(tratamiento),
tratamiento.f = factor(tratamiento)
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## edad = col_integer(),
## reducccol = col_double(),
## tratamiento = col_integer()
## )
ggplot(colesterol) +
aes(x = edad, y = reducccol, color = tratamiento.f) +
geom_point() +
geom_smooth(aes(group = tratamiento.f), method = "lm", se = F)
ggplot(colesterol) +
geom_boxplot(aes(x = tratamiento.f, y = reducccol)) +
coord_flip() +
labs(title = "Diferencia de medias aparente",
subtitle = "En la variable bajo análisis")
ggplot(colesterol) +
geom_boxplot(aes(x = tratamiento, y = edad)) +
coord_flip() +
labs(title = "Diferencias entre casos y controles",
subtitle = "Para una variable distinta de la evaluada")
Here we test whether the difference between the means of reducccol (y) in the two populations (treatment vs. no treatment) differs from zero, but without accounting for the age difference between the treatment and control groups.
The t-test compares means between two samples (from two subpopulations, e.g. cases and controls), but it does not account for extra variables. The tests below ignore age and are therefore erroneously significant:
reduccol_tratamiento <- colesterol %>% filter(tratamiento) %>% pull(reducccol)
reduccol_control <- colesterol %>% filter(!tratamiento) %>% pull(reducccol)
t.test(reduccol_control, reduccol_tratamiento, var.equal = T)
##
## Two Sample t-test
##
## data: reduccol_control and reduccol_tratamiento
## t = 4.0679, df = 38, p-value = 0.0002307
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5.30861 15.82639
## sample estimates:
## mean of x mean of y
## 10.5770 0.0095
t.test(reduccol_control, reduccol_tratamiento, var.equal = F)
##
## Welch Two Sample t-test
##
## data: reduccol_control and reduccol_tratamiento
## t = 4.0679, df = 37.546, p-value = 0.0002342
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5.30652 15.82848
## sample estimates:
## mean of x mean of y
## 10.5770 0.0095
This first test lets us discard the interaction hypothesis, since `edad:tratamiento.fTRUE` is clearly not significant:
modelo_colesterol_ix <- lm(reducccol ~ edad * tratamiento.f, data = colesterol) # modelo con interaccion
summary(modelo_colesterol_ix)
##
## Call:
## lm(formula = reducccol ~ edad * tratamiento.f, data = colesterol)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6496 -3.6753 0.1382 2.6847 11.1795
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.2062 6.3317 7.771 3.28e-09 ***
## edad -0.9780 0.1583 -6.179 4.01e-07 ***
## tratamiento.fTRUE 13.8116 10.7917 1.280 0.209
## edad:tratamiento.fTRUE -0.0757 0.2148 -0.352 0.727
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.496 on 36 degrees of freedom
## Multiple R-squared: 0.8023, Adjusted R-squared: 0.7858
## F-statistic: 48.7 on 3 and 36 DF, p-value: 9.36e-13
With the interaction model discarded, we test the additive model, which works out well here. Age strongly influences reducccol and, controlling for that variable, the treatment also has a significant effect!
modelo_colesterol_aditivo <- lm(reducccol ~ edad + tratamiento.f, data = colesterol) # modelo aditivo
summary(modelo_colesterol_aditivo)
##
## Call:
## lm(formula = reducccol ~ edad + tratamiento.f, data = colesterol)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9990 -3.5873 0.3339 2.6313 10.9996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.8301 4.2919 11.843 3.76e-14 ***
## edad -1.0191 0.1057 -9.641 1.23e-11 ***
## tratamiento.fTRUE 10.1195 2.5648 3.946 0.000342 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.442 on 37 degrees of freedom
## Multiple R-squared: 0.8016, Adjusted R-squared: 0.7909
## F-statistic: 74.76 on 2 and 37 DF, p-value: 1.007e-13
You can discard the interaction hypothesis and test the significance of edad at the same time, like this:
anova(
lm(reducccol ~ tratamiento.f, data = colesterol),
lm(reducccol ~ tratamiento.f + edad, data = colesterol),
lm(reducccol ~ tratamiento.f + edad + tratamiento.f * edad, data = colesterol) # same as `tratamiento.f * edad`
)
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
38 | 2564.3755 | NA | NA | NA | NA |
37 | 730.1896 | 1 | 1834.185945 | 90.7415941 | 0.0000000 |
36 | 727.6784 | 1 | 2.511214 | 0.1242358 | 0.7265403 |
housing <-
readxl::read_xls("Housing.xls") %>%
rowid_to_column("obs_id")
modelo_housing <- lm(STARTS ~ POP, data = housing)
ggplot(data = augment(modelo_housing, data = housing)) +
aes(x = obs_id, y = .std.resid) +
geom_hline(yintercept = 0, color = "Gray") +
geom_point() +
geom_smooth(se = F) +
geom_line(aes(group = 1)) +
labs(title = "Residuos estandarizados en el tiempo",
y = "Residuos estandarizados",
x = "Nro de observación")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
MASS::studres
gives you the (externally) studentized residuals for the model:
augment(modelo_housing) %>%
mutate(.stud.resid = MASS::studres(modelo_housing)) %>%
ggplot() +
aes(x = .std.resid, y = .stud.resid) +
geom_segment(aes(xend = .std.resid, yend = .std.resid), color = "Red") +
geom_abline(color = "Gray") +
geom_point() +
labs(title = "Residuos estandarizados vs. estudentizados")
\[ H_{0}: \rho = 0 \quad \text{ vs. } \quad H_{1}: \rho \neq 0 \\ \text{Durbin-Watson } d \text{ statistic: } \\ d = \frac{ \sum_{i=2}^{n} (e_{i} - e_{i-1})^2 }{ \sum_{i=1}^{n} e_{i}^2 } \\ d \approx 2(1 - \hat{\rho}) \]
\[ d = 4 \implies \hat{\rho} = -1 \\ d = 0 \implies \hat{\rho} = 1 \\ d = 2 \implies \hat{\rho} = 0 \]
# Computing the d ("DW") statistic "by hand"
n <- nrow(housing)
residuos <- modelo_housing$residuals
numerador_DW <- sum(diff(residuos, lag = 1)^2)
denominador_DW <- sum(residuos^2)
estadistico_DW <- numerador_DW / denominador_DW
estadistico_DW
## [1] 0.6208403
Compare the value of d against the rule above.
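For a rough reading, invert the approximation \(d \approx 2(1 - \hat{\rho})\):
# Approximate rho from d:
1 - estadistico_DW / 2 # ~0.69: strong positive autocorrelation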
lmtest::dwtest
takes the model and gives the p-value for testing whether the correlation between errors (\(\rho\)) differs from zero (a greater-than-zero alternative can also be tested). A significant result means there IS correlation among the errors.
Here we see that a high R^2 does not imply the absence of other problems, such as correlated errors.
paste0("R^2 del modelo housing: ", summary(modelo_housing)$r.squared %>% round(2))
## [1] "R^2 del modelo housing: 0.93"
dw_test_housing <- lmtest::dwtest(modelo_housing, alternative = "two.sided",
data = housing)
dw_test_housing
##
## Durbin-Watson test
##
## data: modelo_housing
## DW = 0.62084, p-value = 1.329e-05
## alternative hypothesis: true autocorrelation is not 0
We want to estimate how correlated each error is with the preceding one.
\[ \hat{\rho} = \frac{ \sum^{n}_{i=2} e_{i} e_{i-1} }{ \sum^{n}_{i=2} e^{2}_{i} } \]
estimar_rho_sombrero <- function(residuos) {
n <- length(residuos)
# sum of e_i * e_{i-1} over i = 2..n
numerador <- sum(head(residuos, n - 1) * tail(residuos, n - 1))
# note: this sums over all i = 1..n, a common approximation of the i = 2..n sum in the formula
denominador <- sum(residuos^2)
return (numerador/denominador)
}
rho_estimado <- estimar_rho_sombrero(modelo_housing$residuals)
\[ \text{Original model: } Y_{t} = \alpha + \beta X_{t} + \varepsilon_{t} \\ \varepsilon_{t} = \rho \varepsilon_{t-1} + u_{t} \\ \text{Transformation: } Y_{t} - \rho Y_{t-1} \\ ... \\ Y^{*}_{t} = Y_{t} - \hat{\rho} Y_{t-1} \\ X^{*}_{t} = X_{t} - \hat{\rho} X_{t-1} \\ \text{New model: } Y^{*}_{t} = \alpha^{*} + \beta^{*} X_{t}^{*} + u_{t} \\ u_{t} \sim \mathcal{N}(0, \sigma^2) \\ \alpha^{*} = \alpha (1 - \rho) \\ \beta^{*} = \beta \]
moneda <- readxl::read_xls("Moneda.xls")
modelo_moneda <- lm(gasto ~ stock, data = moneda)
lmtest::dwtest(modelo_moneda)
##
## Durbin-Watson test
##
## data: modelo_moneda
## DW = 0.32821, p-value = 2.303e-08
## alternative hypothesis: true autocorrelation is greater than 0
rho_estimado <- estimar_rho_sombrero(modelo_moneda$residuals)
moneda %<>% mutate(
`gasto*` = gasto - rho_estimado * lag(gasto),
`stock*` = stock - rho_estimado * lag(stock)
)
modelo_moneda_descorrelacionado <- lm(`gasto*` ~ `stock*`, data = moneda)
lmtest::dwtest(modelo_moneda_descorrelacionado)
##
## Durbin-Watson test
##
## data: modelo_moneda_descorrelacionado
## DW = 1.426, p-value = 0.05786
## alternative hypothesis: true autocorrelation is greater than 0
alfa_estrella <- modelo_moneda_descorrelacionado$coefficients[[1]]
beta_estrella <- modelo_moneda_descorrelacionado$coefficients[[2]]
alfa_original <- alfa_estrella/(1 - rho_estimado)
beta_original <- beta_estrella # They are the same
bind_rows(
enframe(modelo_moneda$coefficients, name = "names", value = "x") %>% mutate(modelo = "original"),
enframe(modelo_moneda_descorrelacionado$coefficients, name = "names", value = "x") %>% mutate(modelo = "transformado*"),
tibble(names = c("(Intercept)", "stock"), # recovered coefficients are on the original `stock` scale
x = c(alfa_original, beta_original),
modelo = rep("original_descorrelacionado", 2))
)
names | x | modelo |
---|---|---|
(Intercept) | -154.719162 | original |
stock | 2.300371 | original |
(Intercept) | -53.695919 | transformado* |
stock* | 2.643443 | transformado* |
(Intercept) | -215.310969 | original_descorrelacionado |
stock | 2.643443 | original_descorrelacionado |
If this comes out very significant, it's a hint that the errors are NOT normally distributed! (H0 is that the errors are normal.)
# shapiro.test(modelo_moneda$residuals)
# Better to use the standardized residuals:
shapiro.test(augment(modelo_moneda)$.std.resid)
##
## Shapiro-Wilk normality test
##
## data: augment(modelo_moneda)$.std.resid
## W = 0.92539, p-value = 0.1259
ggplot(data = augment(modelo_moneda, data = moneda)) +
aes(sample = .std.resid) +
geom_qq(color = "Red") +
geom_qq_line(color = "SeaGreen") +
labs(title = "Comparación de cuantiles",
x = "Cuantiles teóricos\n(distribución normal)",
y = "Residuos estandarizados\n(distribución empírica)")
salud <-
read_delim("salud.txt", delim = " ") %>%
rename(peso = X1, pulso = X2, fuerza = X3, t_cuarto_milla = X4, t_una_milla = Y)
## Parsed with column specification:
## cols(
## obs = col_integer(),
## X1 = col_integer(),
## X2 = col_integer(),
## X3 = col_integer(),
## X4 = col_integer(),
## Y = col_integer()
## )
modelo_salud <- lm(t_una_milla ~ peso + pulso + fuerza + t_cuarto_milla, data = salud)
salud <- augment(modelo_salud, data = salud)
ggplot(data = salud) +
aes(y = .std.resid, x = "") +
geom_boxplot() +
geom_rug(sides = "t", color = "Red") +
geom_point() +
geom_label_repel(
data = top_n(salud, 10, abs(.std.resid)),
mapping = aes(label = obs),
nudge_x = .3
) +
coord_flip() +
labs(title = "Residuos estandarizados")
\[ \bigg( X (X^T X)^{-1} X^T \bigg)_{ii} = \bigg( P_{\mathcal{C}(X)} \bigg)_{ii} = x_{i}^T (X^T X)^{-1} x_{i} = p_{ii} \text{ : leverage de la observación i} \]
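A quick sanity check of that identity (a sketch; `X_salud` and `P_salud` are names introduced here): augment()'s .hat column is the diagonal of the projection matrix, also available directly as hatvalues().
X_salud <- model.matrix(modelo_salud)
P_salud <- X_salud %*% solve(t(X_salud) %*% X_salud) %*% t(X_salud)
all.equal(unname(diag(P_salud)), unname(hatvalues(modelo_salud))) # TRUE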
p <- modelo_salud$rank
n <- nrow(salud) # number of observations (length() would count columns on a data frame)
pii_umbral <- 2 * p / n
ggplot(data = salud) +
aes(x = "", y = .hat) +
geom_boxplot(width = .5) +
geom_rug(sides = "b", color = "Red") +
geom_label_repel(mapping = aes(label = (obs)), data = top_n(salud, 10, .hat), nudge_x = .3) +
geom_hline(yintercept = pii_umbral, linetype = "dashed", color = "Red") +
geom_point() +
coord_flip() +
labs(title = "Leverage de las observaciones",
subtitle = "Y umbral de 2p/n")
\[ a_{i} = \frac{ e_{i} }{ \sqrt{e^T e}} \\ \text{Graficar: } a_{i}^2 \text{ vs. } p_{ii} \]
norma_del_vector_residuos <- sqrt(sum(modelo_salud$residuals^2))
salud %>%
mutate(a = .resid / norma_del_vector_residuos) %>%
ggplot() +
aes(x = a^2, y = .hat, color = .cooksd, size = .cooksd, alpha = .cooksd) +
geom_label(aes(label = obs), alpha = .5) +
labs(title = "Plot L-R",
subtitle = "Leverage vs. Residuos") +
scale_color_gradient(low = "Gray", high = "Red") +
guides(size = F)
Analyze the different cases:

- Obs. 28: large residual, high leverage, maximal Cook's D. A problematic observation!
- Obs. 23: high leverage, but small residual. The LS fit is probably adapting itself to this high-leverage observation, distorting the betas. Problematic!
- Obs. 30: low leverage, large residual: looks like an observation the model "understands" as far from the mean, without being an outlier.
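To back that reading with the actual numbers (a quick sketch over the augmented data):
salud %>%
filter(obs %in% c(28, 23, 30)) %>%
select(obs, .std.resid, .hat, .cooksd)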
\[ p_{ii} > 2 \frac{p}{n} \rightarrow \text{problem} \qquad \text{Cook's distance} = D_{i} = \frac{r_{i}^{2}}{p} \cdot \frac{h_{ii}}{1 - h_{ii}} \ge \mathcal{F}^{0.5}_{p, n-p} \rightarrow \text{problem} \]
p <- modelo_salud$rank
n <- nrow(salud) # again: nrow, not length
pii_umbral <- 2 * p / n
Di_umbral <- qf(p = 0.5, df1 = p, df2 = n - p)
ggplot(data = salud) +
geom_point(aes(x = .hat, y = .cooksd)) +
gghighlight(percent_rank(.cooksd) > .75, label_key = obs) +
geom_hline(yintercept = Di_umbral, color = "Tomato") +
geom_vline(xintercept = pii_umbral, color = "Tomato") +
labs(title = "Observaciones atípicas",
subtitle = "En rojo los 'umbrales' de Di y Pii problemáticos",
x = "Leverage", y = "Distancia de Cook")
ggplot(salud) +
aes(x = "", y = .cooksd) +
geom_boxplot() +
geom_hline(yintercept = Di_umbral, color = "tomato") +
coord_flip() +
labs(title = "Distancia de Cook",
subtitle = "Y umbral problemático = F(0.5, p, n-p)")
gorgojo <-
readxl::read_xls("Gorgojo.xls") %>%
mutate(x = factor(x))
ggpairs(gorgojo, progress = F)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gorgojo %>% group_by(x) %>% summarise(n(), var(y), sd(y))
x | n() | var(y) | sd(y) |
---|---|---|---|
3 | 6 | 29.86667 | 5.465040 |
6 | 6 | 102.96667 | 10.147249 |
9 | 6 | 10.96667 | 3.311596 |
12 | 6 | 17.86667 | 4.226898 |
21 | 6 | 14.80000 | 3.847077 |
In the empirical density, Y does not look normal. Also, the variances appear to differ across the levels of the factor.
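To formalize the unequal-variances impression, one option (an assumption on my part: the `car` package, which is not used elsewhere in these notes) is Levene's test, whose H0 is equal variances across groups:
# Hypothetical extra check, assuming `car` is installed:
car::leveneTest(y ~ x, data = gorgojo)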
Box-Cox transformations are used to “correct unequal variances and, mainly, to correct non-linearity”.
\[ Y^{(\lambda)} = \left \{ \begin{array}{ll} \frac{Y^{\lambda} - 1}{\lambda} \quad \text{si } \lambda \neq 0 \\ \mathrm{ln}(Y) \quad \text{si } \lambda = 0 \end{array} \right. \]
# Don't attach MASS, because it masks dplyr's select :(
modelo_gorgojo <- lm(y ~ x, data = gorgojo)
MASS::boxcox(modelo_gorgojo)
lambdas <- EnvStats::boxcox(modelo_gorgojo)
lambda_probs <- tibble(
lambda = lambdas$lambda,
prob = lambdas$objective
)
ggplot(data = lambda_probs) +
aes(x = lambda, y = prob) +
geom_point() +
geom_line() +
labs(title = "Lambda óptimo para Box-Cox",
subtitle = "Según paquete EnvStats") +
geom_label_repel(data = top_n(lambda_probs, 3, prob),
mapping = aes(label = lambda),
nudge_y = -.025)
gorgojo %<>% mutate(
`Y(lambda=-1)` = (y^(-1) - 1)/(-1),
`Y(lambda=-0.5)` = (y^(-0.5) - 1)/(-0.5),
`Y(lambda=0)` = log(y)
)
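A generic helper implementing the formula above (a sketch; `boxcox_transform` is a name made up here, not from any package), plus a check against the column just created:
boxcox_transform <- function(y, lambda) {
  # (y^lambda - 1)/lambda, with the log(y) limit at lambda = 0
  if (lambda == 0) log(y) else (y^lambda - 1) / lambda
}
all.equal(boxcox_transform(gorgojo$y, -0.5), gorgojo$`Y(lambda=-0.5)`) # TRUE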
tidy_gorgojo <- gorgojo %>% gather("Y_name", "Y_value", -x)
ggplot(tidy_gorgojo) +
facet_grid(Y_name ~ ., scales = "free") +
aes(x = x, y = Y_value) +
geom_boxplot() +
labs(title = "Transformaciones Box-Cox")
modelo <- lm(y ~ x, data = gorgojo)
modelo_lambda_0 <- lm(log(y) ~ x, data = gorgojo)
modelo_lambda_minus05 <- lm(`Y(lambda=-0.5)` ~ x, data = gorgojo)
gorgojo_comparacion <- tibble(
.std.resid = c(rstandard(modelo),
rstandard(modelo_lambda_0),
rstandard(modelo_lambda_minus05)),
.fitted = c(modelo$fitted.values,
modelo_lambda_0$fitted.values,
modelo_lambda_minus05$fitted.values),
obs = rep(1:nrow(gorgojo), times = 3),
modelo = rep(c("original", "log_y", "lambda = -0.5"), each = nrow(gorgojo))
)
ggplot(gorgojo_comparacion) +
aes(x = obs, y = .std.resid, fill = modelo, color = modelo) +
geom_hline(yintercept = 0, color = "Gray") +
geom_smooth(se = F, size = .5) +
labs(title = "Residuos Y vs. Y(Box-Cox)",
subtitle = "Lambda = -0.5 parece mejorar algo el asunto") +
scale_color_manual(values = c("Blue", "Red", "ForestGreen"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
gdp <- read_csv("rdpercentGDP.csv") %>% rename(obs_id = X1)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## year = col_integer(),
## argen = col_double(),
## germany = col_double(),
## china = col_double(),
## japan = col_double(),
## france = col_double(),
## usa = col_double(),
## uk = col_double(),
## nld = col_double(),
## finland = col_double(),
## netherlands = col_double()
## )
cor(select(gdp, -obs_id, -year)) %>%
as_tibble() %>%
mutate(country_1 = names(.)) %>%
gather("country_2", "corr", -country_1) %>%
ggplot() +
aes(y = country_1, x = country_2, fill = corr) +
geom_tile(width = .95) +
scale_fill_gradient2(low = "Blue", mid = "White", high = "Red") +
geom_text(aes(label = round(corr, 2)))
modelo_gdp <- lm(usa ~ ., data = select(gdp, -obs_id))
X_ <- model.matrix(modelo_gdp)
autovalores <- eigen(t(X_) %*% X_)$values
nro_de_condicion <- max(autovalores)/min(autovalores)
tibble(autovalor = autovalores) %>%
rowid_to_column("rank") %>%
ggplot() +
aes(x = rank, y = autovalor) +
geom_point() +
geom_line() +
labs(title = glue("Autovalores de la matriz (Xt * X)"),
subtitle = glue("Número de condición: {round(nro_de_condicion)}"))
glmnet
takes the design matrix and the dependent variable separately. We plot against log(lambda) because plain lambda is unreadable.
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded glmnet 2.0-18
X_gdp <- model.matrix(modelo_gdp)
# alpha = 0 is a ridge fit, alpha = 1 is a lasso fit
ajuste_ridge <- glmnet(X_gdp, gdp$usa, alpha = 0) %>% tidy()
ajuste_ridge %>%
filter(term != "(Intercept)") %>%
ggplot() +
aes(x = log(lambda), y = estimate, group = term, color = term) +
geom_line() +
labs(title = "Lambdas de Ridge y sus betas estimados",
color = "Covariable",
y = "Coeficientes")
Lambda = 0 means not penalizing the norm of \(\beta\), so we recover \(\beta_{\text{OLS}}\), the least squares estimator:
ajuste_ridge_lambda_0 <- glmnet(X_gdp, gdp$usa, lambda = 0)
gdp %>% select(-obs_id, -year) %>% lm(usa ~ ., data = .)
##
## Call:
## lm(formula = usa ~ ., data = .)
##
## Coefficients:
## (Intercept) argen germany china japan
## 0.400959 0.291570 0.653356 -0.204354 0.102843
## france uk nld finland netherlands
## -0.001821 0.746867 -0.396645 -0.032093 NA
ajuste_ols <- lm(usa ~ ., data = select(gdp, -obs_id))
ajuste_ridge_lambda_0 <- glmnet(model.matrix(ajuste_ols), gdp$usa, lambda = 0)
ajuste_ols$coefficients
## (Intercept) year argen germany china
## -2.889575e+01 1.470147e-02 3.174547e-01 5.669843e-01 -3.562079e-01
## japan france uk nld finland
## 8.365393e-02 8.698457e-02 6.917669e-01 -3.546900e-01 -6.828865e-04
## netherlands
## NA
coef.glmnet(ajuste_ridge_lambda_0)[, 1] # two (Intercept) rows: model.matrix() already added an intercept column to X
## (Intercept) (Intercept) year argen germany
## -35.178501898 0.000000000 0.017856095 0.325324659 0.544470286
## china japan france uk nld
## -0.387198046 0.078691566 0.104895992 0.682955470 -0.353677131
## finland netherlands
## 0.006381637 0.009237243
The lambda for Ridge can be chosen by cross-validation, under either of two criteria (`lambda.min` or `lambda.1se`):
ridge_crossval <- cv.glmnet(X_gdp, gdp$usa, nfolds = 5, alpha = 0)
glance(ridge_crossval)
lambda.min | lambda.1se |
---|---|
0.0099478 | 0.0639454 |
plot(ridge_crossval)
ridge_ols <- glmnet(X_gdp, gdp$usa, alpha = 0, lambda = 0)
ridge_1se <- glmnet(X_gdp, gdp$usa, alpha = 0, lambda = ridge_crossval$lambda.1se)
ridge_min <- glmnet(X_gdp, gdp$usa, alpha = 0, lambda = ridge_crossval$lambda.min)
ridge_coefficients <- bind_rows(
tidy(ridge_ols) %>% mutate(criterio = "OLS"),
tidy(ridge_1se) %>% mutate(criterio = "1SE"),
tidy(ridge_min) %>% mutate(criterio = "Min")
)
We choose lambda the same way as with Ridge: by CV, with either of the two criteria.
Since Lasso kills covariates (drives their betas to zero), the number of covariates it "keeps" at each lambda appears along the top of the plot.
# Alpha = 1 means using lasso
lasso_crossval <- cv.glmnet(X_gdp, gdp$usa, nfolds = 5, alpha = 1)
glance(lasso_crossval)
lambda.min | lambda.1se |
---|---|
0.0013777 | 0.0055616 |
plot(lasso_crossval)
ajuste_lasso <-
glmnet(X_gdp, gdp$usa, alpha = 1) %>%
tidy() %>%
filter(term != "(Intercept)")
ultimo_estimado <-
ajuste_lasso %>%
group_by(term) %>%
top_n(1, lambda)
ajuste_lasso %>%
ggplot() +
aes(x = log(lambda), y = estimate, group = term, color = term) +
geom_line() +
geom_point(data = ultimo_estimado, size = 3) +
geom_hline(yintercept = 0) +
labs(title = "Evolución de los betas estimados",
subtitle = "al cambiar lambda de Lasso",
color = "Covariable",
y = "Coeficientes")
lasso_1se <- glmnet(X_gdp, gdp$usa, alpha = 1, lambda = lasso_crossval$lambda.1se)
lasso_min <- glmnet(X_gdp, gdp$usa, alpha = 1, lambda = lasso_crossval$lambda.min)
lasso_ols <- glmnet(X_gdp, gdp$usa, alpha = 1, lambda = 0)
lasso_coefficients <- bind_rows(
tidy(lasso_ols) %>% mutate(criterio = "OLS"),
tidy(lasso_1se) %>% mutate(criterio = "1SE"),
tidy(lasso_min) %>% mutate(criterio = "Min")
)
bind_rows(
ridge_coefficients %>% mutate(metodo = "Ridge"),
lasso_coefficients %>% mutate(metodo = "Lasso")
) %>%
mutate(
is_ols = ifelse(lambda == 0, "OLS", "Con penalización"),
term = factor(term),
lambda = paste0(criterio, " Lambda = ", round(lambda, 3))
) %>%
filter(term != "(Intercept)") %>%
ggplot() +
facet_wrap(metodo ~ lambda, ncol = 3, scales = "free_y") +
aes(x = term, y = estimate, color = metodo, shape = is_ols) +
geom_point(size = 2.5) +
labs(title = "Coeficientes por Ridge y Lasso",
subtitle = "Lasso 'mata' muchas covariables. Ridge no.\nAmbos pueden recuperar OLS con lambda = 0.",
y = "Beta estimado", x = "Covariable",
shape = "OLS vs. Penalización") +
geom_hline(yintercept = 0) +
coord_flip() +
scale_shape_manual(values = c(16, 8)) +
theme(legend.position = "top", legend.justification = "left", legend.direction = "vertical")
library(RobStatTM)
## Loading required package: fit.models
data(wood, package='robustbase') # Loads the `wood` dataset
# lmrobdet.control() sets parameters for the fit. We trust the defaults.
wood_LS <- lm(y ~ ., data = wood)
wood_MM <- lmrobdetMM(y ~ ., data = wood, control = lmrobdet.control())
X_wood <- model.matrix(wood_MM)
P_ <- X_wood %*% solve(t(X_wood) %*% X_wood) %*% t(X_wood)
wood_LS_augmented <-
augment(wood_LS, data = wood) %>%
rowid_to_column("i")
wood_MM_augmented <-
wood %>%
mutate(.fitted = model.matrix(wood_MM) %*% wood_MM$coefficients,
.resid = y - .fitted,
i = 1:nrow(wood),
.hat = map_dbl(i, ~ P_[[.x, .x]]),
.std.resid = .resid/(sigma(wood_MM)*sqrt(1 - .hat)))
comparacion_LS_MM <-
bind_rows(
wood_LS_augmented %>% select(i, .fitted, .std.resid) %>% mutate(metodo = "LS"),
wood_MM_augmented %>% select(i, .fitted, .std.resid) %>% mutate(metodo = "MM")
)
obs_atipicas <-
comparacion_LS_MM %>%
filter(metodo == "MM") %>%
top_n(4, abs(.std.resid)) # I already checked there are 4 ;)
ggplot(data = comparacion_LS_MM) +
aes(x = .fitted, y = .std.resid, color = metodo) +
geom_point() +
geom_segment(aes(xend = .fitted, yend = 0)) +
geom_hline(yintercept = 0) +
labs(title = "Residuos estandarizados: LS vs MM") +
geom_label_repel(data = comparacion_LS_MM %>% filter(i %in% obs_atipicas$i),
mapping = aes(label = i),
alpha = .65)
comparacion_LS_MM %>%
select(i, metodo, .std.resid) %>%
spread(key = "metodo", value = ".std.resid") %>%
rename(LS_std_resid = LS, MM_std_resid = MM) %>%
filter(!(i %in% c(4, 6, 8, 19))) %>%
ggplot() +
aes(x = sort(abs(LS_std_resid)), y = sort(abs(MM_std_resid))) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
xlim(0, 2.5) +
ylim(0, 2.5) +
labs(title = "Residuos de MM mucho menores*",
subtitle = "*Para las observaciones no-outliers")
comparacion_LS_MM %>%
ggplot() +
facet_wrap(metodo ~ ., scales = "free") +
aes(sample = .std.resid) +
aes(color = i %in% obs_atipicas$i) +
geom_qq() +
geom_qq_line(color = "SeaGreen") +
labs(title = "QQ-plots para LS vs MM",
x = "Cuantiles teóricos\n(distribución normal)",
y = "Cuantiles de los residuos estandarizados\n(distribución empírica)") +
guides(color = F)
The robust fit gives weight 0 to the atypical observations. We use those weights to redo the LS fit:
pesos <- wood_MM$rweights
tibble(obs = names(pesos), peso = pesos) %>%
ggplot() +
aes(x = fct_reorder(obs, as.integer(obs)), y = peso, color = peso > 0) +
geom_hline(yintercept = 0, color = "Gray") +
geom_hline(yintercept = 1, color = "Gray") +
geom_point() +
geom_segment(aes(xend = obs, yend = 0)) +
labs(title = "Pesos de las observaciones",
subtitle = "Según el ajuste robusto",
x = "Observación") +
guides(color = F)
# Weighted Least Squares
wood_WLS <- lm(y ~ ., data = wood, weights = wood_MM$rweights)
bind_rows(
glance(wood_LS) %>% mutate(metodo = "OLS"),
glance(wood_WLS) %>% mutate(metodo = "WLS")
) %>%
select(metodo, p.value, df.residual, adj.r.squared)
metodo | p.value | df.residual | adj.r.squared |
---|---|---|---|
OLS | 0.0001282 | 14 | 0.7399743 |
WLS | 0.0000023 | 10 | 0.9307745 |
Here I try MASS's robust lm:
llamadas <- read_delim("llamadas.txt", delim = " ")
## Parsed with column specification:
## cols(
## obs = col_integer(),
## year = col_integer(),
## calls = col_double()
## )
llamadas_OLS <- lm(calls ~ year, data = llamadas)
llamadas_Rob <- MASS::rlm(calls ~ year, data = llamadas)
## Warning in rlm.default(x, y, weights, method = method, wt.method =
## wt.method, : 'rlm' failed to converge in 20 steps
comparacion_OLS_Rob <- bind_rows(
augment(llamadas_OLS) %>% mutate(metodo = "OLS"),
augment(llamadas_Rob) %>% mutate(metodo = "robusto")
) %>%
mutate(weight = rep(llamadas_Rob$w, times = 2))
comparacion_OLS_Rob %>%
ggplot() +
aes(x = year, y = calls, color = metodo) +
geom_point() +
geom_line(aes(y = .fitted, group = metodo)) +
geom_label_repel(
data = filter(comparacion_OLS_Rob, weight < 1 & metodo == "robusto"),
mapping = aes(label = round(weight, 2))
) +
labs(title = "MASS::rlm vs lm",
subtitle = "Se detallan los pesos < 1 dados por rlm a las observaciones")
The idea is to test some (or all) subsets of covariates and keep the one with the best adjusted R^2 and the fewest variables, among those whose Mallows Cp is not too different from p. The method for choosing the candidate subsets can vary: stepwise, backward, forward, or all subsets regression (ASR).
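For reference, Mallows' Cp for a submodel with \(p\) parameters, with \(\hat{\sigma}^{2}\) estimated from the full model:
\[ C_{p} = \frac{\mathrm{RSS}_{p}}{\hat{\sigma}^{2}} - n + 2p \]
A good submodel has \(C_{p} \approx p\); values far above p suggest bias from omitted variables.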
biomasa <- readxl::read_excel("Biomasa.xls")
forw <- leaps::regsubsets(BIO ~ K + SODIO + PH + SAL + ZN, data = biomasa, method = "forward")
forw_result <- summary(forw)
forw_result
## Subset selection object
## Call: regsubsets.formula(BIO ~ K + SODIO + PH + SAL + ZN, data = biomasa,
## method = "forward")
## 5 Variables (and intercept)
## Forced in Forced out
## K FALSE FALSE
## SODIO FALSE FALSE
## PH FALSE FALSE
## SAL FALSE FALSE
## ZN FALSE FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: forward
## K SODIO PH SAL ZN
## 1 ( 1 ) " " " " "*" " " " "
## 2 ( 1 ) " " "*" "*" " " " "
## 3 ( 1 ) " " "*" "*" " " "*"
## 4 ( 1 ) " " "*" "*" "*" "*"
## 5 ( 1 ) "*" "*" "*" "*" "*"
forward_selection <- tibble(
rss = forw_result$rss,
r_squared = forw_result$rsq,
r_squared_adj = forw_result$adjr2,
Cp = forw_result$cp,
p = seq_along(Cp) + 1 # parameters per model, counting the intercept (matches leaps' size below)
)
ggplot(forward_selection) +
aes(x = p, y = Cp) +
geom_point() +
geom_line() +
geom_abline(intercept = 0, slope = 1) +
labs(title = "Cp de Mallows")
forward_selection %>%
select(p, r_squared, r_squared_adj, rss) %>%
gather("name", "r2_value", -p, -rss) %>%
ggplot() +
aes(x = p, y = r2_value, color = name, group = name) +
geom_point() +
geom_line() +
theme(legend.position = "top") +
labs(title = "R2 y R2 ajustado eligiendo variables",
subtitle = "Método: forward selection",
color = "", y = "")
In this case, we compute the Cp and R^2 of every possible subset of covariates!
ASR_call <- leaps::leaps(x = select(biomasa, K, SODIO, PH, SAL, ZN), y = biomasa$BIO, method = "Cp")
ASR_result <-
tibble(
model_size = ASR_call$size,
cp = ASR_call$Cp
) %>%
rowid_to_column("i") %>%
mutate(
which = map(i, ~ ASR_call$which[.x, ]),
names = map(which, ~ c("K", "SODIO", "PH", "SAL", "ZN")[.x]),
names_peek = map_chr(names, ~ str_c(.x, collapse = ", "))
)
lm_biomasa <- function(.names) {
.formula <- paste0("BIO ~ ", paste0(.names, collapse = " + "))
return (lm(.formula, data = biomasa))
}
ASR_result %<>%
mutate(modelo = map(names, lm_biomasa),
summary = map(modelo, summary),
r_squared_adj = map_dbl(summary, ~ .x$adj.r.squared),
p_value = map_dbl(modelo, ~ glance(.x)[[1, "p.value"]]))
mejores_subconjuntos <-
ASR_result %>%
group_by(model_size) %>%
filter(p_value <= 0.05) %>% # The model must be significant
filter(abs(cp - model_size) < 2) %>% # Cp must be close to p (model_size; a bare `p` would pick up the global variable)
top_n(1, r_squared_adj) %>% # The best fit for each p
mutate(model_name = paste0("R2.adj = ", round(r_squared_adj, 3), " # ", names_peek))
ggplot(mejores_subconjuntos) +
aes(x = model_size, y = cp, color = model_name) +
geom_point(size = 3) +
geom_abline() +
theme(legend.position = "top", legend.direction = "vertical", legend.justification = "left") +
geom_text_repel(aes(label = round(r_squared_adj, 3)), nudge_y = -1.5) +
xlim(2, 7) + ylim(2, 7.5) +
geom_label_repel(aes(label = names_peek), nudge_y = 1) +
labs(title = "Los mejores modelos en ASR", color = NULL, x = "p", y = "Cp",
subtitle = "Significativos, Cp =~ p, top R2.adj")
# If we consider ALL possible subsets of variables, some have a Cp very different from p (bad!)
tibble(p = ASR_result$model_size, Cp = ASR_result$cp) %>%
ggplot() +
aes(x = p, y = Cp) +
geom_point() +
geom_abline()
back <- leaps::regsubsets(BIO ~ K + SODIO + PH + SAL + ZN, data = biomasa, method = "backward")
stepwise <- leaps::regsubsets(BIO ~ K + SODIO + PH + SAL + ZN, data = biomasa, method = "seqrep")