knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file()) # Set WD to Root
here::i_am("lab_mod/ch5-bootstrap.Rmd")
## here() starts at /Users/kittipos/Desktop/R_Programming/edx/ISLR
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(here)
theme_set(theme_bw())
I will demonstrate how to do block bootstrap using Xy dataset from ISLR’s practice questions in the chapter 5.
Xy <- get(load(here("data/5.R.RData")))
Data Frame Xy has 3 numeric variables
str(Xy)
## 'data.frame': 1000 obs. of 3 variables:
## $ X1: num 1.3 1.27 1.24 1.21 1.18 ...
## $ X2: num 0.806 0.799 0.792 0.785 0.778 ...
## $ y : num 0.299 0.318 0.337 0.356 0.375 ...
summary(Xy)
## X1 X2 y
## Min. :-0.8068 Min. :-1.1753 Min. :-0.75293
## 1st Qu.:-0.1674 1st Qu.:-0.2339 1st Qu.:-0.06136
## Median : 0.2798 Median : 0.1824 Median : 0.30452
## Mean : 0.3337 Mean : 0.1288 Mean : 0.35471
## 3rd Qu.: 0.7017 3rd Qu.: 0.4646 3rd Qu.: 0.73283
## Max. : 1.8531 Max. : 1.7658 Max. : 2.03922
Supposed we want to fit a linear regression model of y on X1 and X2.
Xy_lm.fit <- lm(y ~ X1 + X2, data = Xy)
summary(Xy_lm.fit)
##
## Call:
## lm(formula = y ~ X1 + X2, data = Xy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.44171 -0.25468 -0.01736 0.33081 1.45860
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.26583 0.01988 13.372 < 2e-16 ***
## X1 0.14533 0.02593 5.604 2.71e-08 ***
## X2 0.31337 0.02923 10.722 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5451 on 997 degrees of freedom
## Multiple R-squared: 0.1171, Adjusted R-squared: 0.1154
## F-statistic: 66.14 on 2 and 997 DF, p-value: < 2.2e-16
As you can see that \(S.E.\) of \(\beta_2\) was 0.0292267
Write a helper function to shuffles rows of Xy, fits the Linear model and return \(\beta_2\).
Xy_lm_boot_fn <- function(data, index){
lm_coef <- coef(lm(y ~ X1 + X2, data = data, subset = index))
lm_coef["X2"]
}
Let’s test the some iterations of the bootstapped \(\beta_2\).
set.seed(123)
index1 <- sample(1000, 1000, replace = TRUE)
index2 <- sample(1000, 1000, replace = TRUE)
Xy_lm_boot_fn(Xy, index1)
## X2
## 0.3113108
Xy_lm_boot_fn(Xy, index2)
## X2
## 0.324674
OK, It works!
I will use {boot} package to compute bootstrap of 1000 replicates using Xy_lm_boot_fn() as statistic argument in boot().
library(boot)
Xy_lm_boot <- boot(Xy, Xy_lm_boot_fn, R = 1000)
Xy_lm_boot
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Xy, statistic = Xy_lm_boot_fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.313367 -0.0009827422 0.03595101
The \(S.E.\) of the bootstapped \(\beta_2\) is 0.035951
Why we need to do block bootstrap ?
Let’s see plots of observation in x-axis and each variables in y-axis. I will use {ggplot2} to do this.
First I need to add rownames to column and pivot to long format.
Xy_to_long <- function(data) {
data %>%
tibble::rownames_to_column("Observations") %>%
dplyr::mutate(Observations = as.numeric(Observations)) %>%
tidyr::pivot_longer(cols = X1:y, names_to = "Variables", values_to = "Values") %>%
dplyr::mutate(Variables = factor(Variables, levels = c( "y", "X1", "X2")))
}
Xy_long <- Xy_to_long(Xy)
head(Xy_long)
And then plot using geom_line:
Xy_long %>%
ggplot(aes(Observations, Values, color = Variables, linetype = Variables)) +
geom_line(aes(group = Variables)) +
labs(title = "Auto-Correlation between Variables",
caption = "\"Xy\" dataset from ISLR")
(You can also do the same thing by using base R plot by matplot(Xy,type="l") as in ISLR’s example.)
From this plot, It can be seen that there are correlation between observations (rows). A given data point (e.g. \((X1_i, X2_i, Y_i)\)) would influence the value of next data points (e.g. \((X1_{i+1}, X2_{i+1}, Y_{i+1})\)). This is called auto-correlation.
Thus, using regular bootstap might rearrange the consecutive rows that are correlated and modify the result.
Let’s plot observation vs values after bootstapped.
set.seed(123)
## Shuffle Rows
index1 <- sample(1000, 1000, replace = TRUE)
Xy_row_shuffle1 <- Xy[index1, ]
rownames(Xy_row_shuffle1) <- 1:1000
Xy_to_long(Xy_row_shuffle1) %>%
ggplot(aes(Observations, Values, color = Variables, linetype = Variables)) +
geom_line(aes(group = Variables)) +
labs(title = "Lost Auto-Correlation after Bootstrapped",
caption = "bootstapped \"Xy\" dataset from ISLR")
I’ve created a helper function to obtain an indexes of block bootstrap with following arguments.
n_block: Total number of blockssize: Number of observation in each blockget_block_index <- function(n_block, # Number of block
size # Number of observation in each block
) {
block1 <- 1:size
step <- 0:(n_block-1)*size
step_boot <- sample(step, n_block, replace = TRUE)
step_expand <- rep(step_boot, each = size)
block_expand <- rep(block1, n_block)
block_expand + step_expand
}
Let’s test get_block_index() function.
For example: 3 blocks with 5 observations each.
set.seed(123)
get_block_index(n_block = 3,size = 5)
## [1] 11 12 13 14 15 11 12 13 14 15 11 12 13 14 15
get_block_index(n_block = 3,size = 5)
## [1] 6 7 8 9 10 11 12 13 14 15 6 7 8 9 10
Ok, Looks like it works!
I will add “block” column containing numeric vector form 1-10 to flag each block so we can keep tract of data when we performed a block bootstrap.
Xy_block10 <- Xy
# Add "block" column to keep tract
Xy_block10$block <- factor(rep(1:10, each = 100))
summary(Xy_block10)
## X1 X2 y block
## Min. :-0.8068 Min. :-1.1753 Min. :-0.75293 1 :100
## 1st Qu.:-0.1674 1st Qu.:-0.2339 1st Qu.:-0.06136 2 :100
## Median : 0.2798 Median : 0.1824 Median : 0.30452 3 :100
## Mean : 0.3337 Mean : 0.1288 Mean : 0.35471 4 :100
## 3rd Qu.: 0.7017 3rd Qu.: 0.4646 3rd Qu.: 0.73283 5 :100
## Max. : 1.8531 Max. : 1.7658 Max. : 2.03922 6 :100
## (Other):400
Now, try simulate block bootstapped dataset:
set.seed(123)
index_block10_a <- get_block_index(n_block = 10, size = 100)
index_block10_b <- get_block_index(n_block = 10, size = 100)
Xy_block10[index_block10_a, ]$block %>% unique()
## [1] 3 10 2 6 5 4 9
## Levels: 1 2 3 4 5 6 7 8 9 10
Xy_block10[index_block10_b, ]$block %>% unique()
## [1] 5 3 9 8 10 7
## Levels: 1 2 3 4 5 6 7 8 9 10
Finally, I will perform block bootstrapped of the coefficient \(\beta_2\).
Recall that \(\beta_2\) is a coefficient of X2 in the linear regression model of y on X1 and X2.
I will need to create this final wrapper function Xy_lm_boot_fn_block(). The main argument were the followings:
data: Xy datasetn_block: Total number of blocks to bootstrapsize: Number of observation in each blockR: The number of bootstrap replicatesXy_lm_boot_fn_block <- function(data,
n_block = 10, size = 100,
R,
return_raw = FALSE
) {
coeffs <- numeric(n_block)
for (i in 1:R) {
index <- get_block_index(n_block, size)
coeffs[i] <- Xy_lm_boot_fn(data = data, index = index)
}
if(return_raw) return(coeffs)
list(original = mean(coeffs),
std.err = sd(coeffs))
}
Block bootstrapped of linear model y on X1 and X2; then extract mean and standard error of bootstrapped coefficient X2 (\(\beta_2\)).
Xy_lm_boot_fn_block(Xy_block10, n_block = 10, size = 100, R = 1000)
## $original
## [1] 0.3862743
##
## $std.err
## [1] 0.3200023
Compare it with regular bootstrap
Xy_lm_boot
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Xy, statistic = Xy_lm_boot_fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.313367 -0.0009827422 0.03595101
And with no bootstrap (X2)
summary(Xy_lm.fit)
##
## Call:
## lm(formula = y ~ X1 + X2, data = Xy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.44171 -0.25468 -0.01736 0.33081 1.45860
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.26583 0.01988 13.372 < 2e-16 ***
## X1 0.14533 0.02593 5.604 2.71e-08 ***
## X2 0.31337 0.02923 10.722 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5451 on 997 degrees of freedom
## Multiple R-squared: 0.1171, Adjusted R-squared: 0.1154
## F-statistic: 66.14 on 2 and 997 DF, p-value: < 2.2e-16
Xy_boot_coeffs <- Xy_lm_boot_fn_block(Xy_block10, n_block = 10, size = 100, R = 1000,
return_raw = TRUE)
library(latex2exp)
data.frame(beta2 = Xy_boot_coeffs) %>%
ggplot(aes(beta2)) +
geom_histogram(fill = "grey", color = "black") +
geom_vline(aes(xintercept = mean(Xy_boot_coeffs)), color = "red") +
annotate("label", x = mean(Xy_boot_coeffs), y = 125,
label = "mean",
color = "red") +
labs(x = TeX("$\\hat{\\beta^*_2}$"), y = "Count",
title = TeX("Distribution of block bootstrapped $\\beta_2$"))