In this practical, we explore computer-intensive methods that approximate sampling distributions using only the observed data and resampling. We will use both simulated data and real data (built-in R datasets such as mtcars).
Suppose we observe i.i.d. data
\[
X_1, X_2, \ldots, X_n \sim F
\]
from an unknown distribution \(F\).
We want the sampling distribution of a statistic
\[
T = T(X_1, X_2, \ldots, X_n)
\]
(e.g., mean, median, regression coefficient).
Bootstrap Standard Error (SE): \[ \widehat{SE}_{boot}(T) = \sqrt{\frac{1}{B-1} \sum_{b=1}^B (T_b^* - \bar{T}^*)^2} \]
where \(T_b^*\) is the statistic recomputed on the \(b\)-th bootstrap resample and \(\bar{T}^* = \frac{1}{B} \sum_{b=1}^B T_b^*\).
Bootstrap Bias: \[ \widehat{Bias}_{boot}(T) = \bar{T}^* - T_{obs} \]
Bootstrap Confidence Intervals (e.g., Percentile CI): take the empirical \(\alpha/2\) and \(1 - \alpha/2\) quantiles of \(T_1^*, \ldots, T_B^*\).
We first simulate from a skewed distribution (the Exponential), for which the median is a non-linear statistic and standard formulae for its standard error are less straightforward.
# Simulate data from an exponential distribution
# (no seed is set here, so your exact numbers will differ from the output below)
n <- 100
lambda <- 1 # rate
x <- rexp(n, rate = lambda)
mean_obs <- mean(x)
median_obs <- median(x)
mean_obs
## [1] 1.045719
median_obs
## [1] 0.847754
B <- 2000
boot_mean <- numeric(B)
boot_median <- numeric(B)
for (b in 1:B) {
  x_star <- sample(x, size = n, replace = TRUE)
  boot_mean[b] <- mean(x_star)
  boot_median[b] <- median(x_star)
}
# Bootstrap estimates
se_mean_boot <- sd(boot_mean)
se_median_boot <- sd(boot_median)
bias_mean_boot <- mean(boot_mean) - mean_obs
bias_median_boot <- mean(boot_median) - median_obs
list(
  mean_obs = mean_obs,
  se_mean_boot = se_mean_boot,
  bias_mean_boot = bias_mean_boot,
  median_obs = median_obs,
  se_median_boot = se_median_boot,
  bias_median_boot = bias_median_boot
)
## $mean_obs
## [1] 1.045719
##
## $se_mean_boot
## [1] 0.102229
##
## $bias_mean_boot
## [1] 0.00116082
##
## $median_obs
## [1] 0.847754
##
## $se_median_boot
## [1] 0.1403908
##
## $bias_median_boot
## [1] 0.007361685
alpha <- 0.05
ci_mean <- quantile(boot_mean, probs = c(alpha/2, 1 - alpha/2))
ci_median <- quantile(boot_median, probs = c(alpha/2, 1 - alpha/2))
ci_mean
## 2.5% 97.5%
## 0.8668418 1.2564491
ci_median
## 2.5% 97.5%
## 0.5826475 1.0741226
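For reference, the same percentile interval can be reproduced with the boot package, which also offers basic, normal, and BCa intervals. A minimal sketch (med_fun is our own helper name, not a built-in):
library(boot)
med_fun <- function(data, idx) median(data[idx])  # statistic evaluated on a resample
boot_out <- boot(data = x, statistic = med_fun, R = B)
boot.ci(boot_out, type = "perc")  # compare with ci_median above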
par(mfrow = c(1,2))
hist(boot_mean, breaks = 30, main = "Bootstrap Mean",
     xlab = "Mean*", probability = TRUE)
abline(v = mean_obs, col = "red", lwd = 2)
hist(boot_median, breaks = 30, main = "Bootstrap Median",
     xlab = "Median*", probability = TRUE)
abline(v = median_obs, col = "red", lwd = 2)
par(mfrow = c(1,1))
We now use the classic mtcars dataset.
We model:
\[ mpg = \beta_0 + \beta_1 \cdot wt + \beta_2 \cdot hp + \varepsilon \]
where:
- \(mpg\) = miles per gallon (response variable)
- \(wt\) = weight of the car (in 1000 lbs)
- \(hp\) = gross horsepower
- \(\varepsilon\) = error term
This allows us to estimate:
- bootstrap SEs for each coefficient
- bias of the estimates
- confidence intervals (e.g., percentile intervals) for \(\beta_0, \beta_1, \beta_2\)
data(mtcars)
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
fit_lm <- lm(mpg ~ wt + hp, data = mtcars)
summary(fit_lm)
##
## Call:
## lm(formula = mpg ~ wt + hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.941 -1.600 -0.182 1.050 5.854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.22727 1.59879 23.285 < 2e-16 ***
## wt -3.87783 0.63273 -6.129 1.12e-06 ***
## hp -0.03177 0.00903 -3.519 0.00145 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.593 on 29 degrees of freedom
## Multiple R-squared: 0.8268, Adjusted R-squared: 0.8148
## F-statistic: 69.21 on 2 and 29 DF, p-value: 9.109e-12
coef_obs <- coef(fit_lm)
coef_obs
## (Intercept) wt hp
## 37.22727012 -3.87783074 -0.03177295
B <- 2000
n <- nrow(mtcars)
boot_coefs <- matrix(NA, nrow = B, ncol = length(coef_obs))
colnames(boot_coefs) <- names(coef_obs)
for (b in 1:B) {
  # Case (pairs) bootstrap: resample whole rows of the data frame
  idx <- sample(1:n, size = n, replace = TRUE)
  dat_star <- mtcars[idx, ]
  fit_star <- lm(mpg ~ wt + hp, data = dat_star)
  boot_coefs[b, ] <- coef(fit_star)
}
boot_se <- apply(boot_coefs, 2, sd)
boot_bias <- colMeans(boot_coefs) - coef_obs
boot_se
## (Intercept) wt hp
## 2.133899525 0.712481184 0.007534983
boot_bias
## (Intercept) wt hp
## 0.0507673972 -0.0127911937 -0.0008579728
alpha <- 0.05
boot_ci <- apply(boot_coefs, 2, quantile, probs = c(alpha/2, 1 - alpha/2))
boot_ci
## (Intercept) wt hp
## 2.5% 33.04878 -5.333170 -0.04910424
## 97.5% 41.56600 -2.488106 -0.01991454
We can compare these with the usual t-based confidence intervals from summary(lm).
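The t-based intervals are available directly via confint() (output not shown):
confint(fit_lm, level = 0.95)  # normal-theory 95% CIs for the same fit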
The jackknife is another resampling method, based on systematic leave-one-out deletion.
Given a statistic: \[ T = T(X_1, X_2, \ldots, X_n) \]
For each \(i = 1, \ldots, n\), remove \(X_i\) and recompute the statistic: \[ T^{(i)} = T(X_1, \ldots, X_{i-1}, X_{i+1}, \ldots, X_n) \]
Define the jackknife average: \[ \bar{T}^{(\cdot)} = \frac{1}{n} \sum_{i=1}^n T^{(i)} \]
Jackknife variance estimate: \[ \widehat{\text{Var}}_{JK}(T) = \frac{n-1}{n} \sum_{i=1}^n \left( T^{(i)} - \bar{T}^{(\cdot)} \right)^2 \]
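These formulas translate directly into a short generic helper. A minimal sketch (the name jackknife_se is ours, not a built-in):
# Jackknife SE of an arbitrary statistic of a numeric vector
jackknife_se <- function(x, stat) {
  n <- length(x)
  t_loo <- vapply(seq_len(n), function(i) stat(x[-i]), numeric(1))  # the T^(i)
  sqrt((n - 1) / n * sum((t_loo - mean(t_loo))^2))
}
# e.g. jackknife_se(x, median)
We now apply the jackknife in a survey sampling setting, estimating a population mean from a simple random sample.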
# Finite population of size N
N <- 500
pop_y <- rnorm(N, mean = 50, sd = 10) # e.g. income-like variable
# True population mean
mu_pop <- mean(pop_y)
mu_pop
## [1] 50.19899
# Draw a simple random sample without replacement (survey style)
n <- 50
s_idx <- sample(1:N, size = n, replace = FALSE)
y_s <- pop_y[s_idx]
# Sample mean (estimator of population mean)
t_hat <- mean(y_s)
t_hat
## [1] 48.21283
# Jackknife leave-one-out estimates
t_jack <- numeric(n)
for (i in 1:n) {
  t_jack[i] <- mean(y_s[-i])
}
t_bar_dot <- mean(t_jack)
# Jackknife variance
var_jack <- (n - 1) / n * sum((t_jack - t_bar_dot)^2)
se_jack <- sqrt(var_jack)
list(
  sample_mean = t_hat,
  jackknife_se = se_jack
)
## $sample_mean
## [1] 48.21283
##
## $jackknife_se
## [1] 1.457039
We can compare this with the classical SRSWOR variance estimator, which includes the finite population correction:
# Classical SE of mean for SRSWOR (finite population correction)
s2 <- var(y_s)
fpc <- (N - n) / (N - 1)
se_srs <- sqrt(s2 / n * fpc)
se_srs
## [1] 1.383653
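The small gap is expected. For the sample mean, the leave-one-out values satisfy \(T^{(i)} - \bar{T}^{(\cdot)} = (\bar{y} - y_i)/(n-1)\), so
\[ \widehat{\text{Var}}_{JK}(\bar{y}) = \frac{n-1}{n} \sum_{i=1}^n \frac{(y_i - \bar{y})^2}{(n-1)^2} = \frac{s^2}{n}, \]
i.e. the jackknife reproduces the classical with-replacement variance of the mean but omits the finite population correction, which is why se_jack is slightly larger than se_srs.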
Consider a finite population with two variables:
- \(Y\) (e.g., total expenditure)
- \(X\) (e.g., household size)
We want to estimate the population ratio: \[ R = \frac{\sum Y}{\sum X} \]
# Finite population with X and Y
set.seed(123)
N <- 500
X_pop <- rpois(N, lambda = 5) + 1 # e.g. household size
Y_pop <- 100 + 10 * X_pop + rnorm(N, 0, 20) # expenditure
R_pop <- sum(Y_pop) / sum(X_pop)
R_pop
## [1] 26.72235
Take an SRSWOR sample and compute the ratio estimator:
n <- 60
s_idx <- sample(1:N, n, replace = FALSE)
X_s <- X_pop[s_idx]
Y_s <- Y_pop[s_idx]
R_hat <- sum(Y_s) / sum(X_s)
R_hat
## [1] 26.32323
R_jack <- numeric(n)
for (i in 1:n) {
  X_minus_i <- X_s[-i]
  Y_minus_i <- Y_s[-i]
  R_jack[i] <- sum(Y_minus_i) / sum(X_minus_i)
}
R_bar_dot <- mean(R_jack)
var_R_jack <- (n - 1) / n * sum((R_jack - R_bar_dot)^2)
se_R_jack <- sqrt(var_R_jack)
list(
  R_hat = R_hat,
  jackknife_se_R = se_R_jack
)
## $R_hat
## [1] 26.32323
##
## $jackknife_se_R
## [1] 0.8731723
This mimics the use of the jackknife in sample survey analysis for complex, non-linear estimators.
Cross-validation (CV) is used primarily for model selection and for tuning hyperparameters. The most common approach is K-fold cross-validation: split the data into \(K\) folds, fit the model on \(K - 1\) folds, evaluate it on the held-out fold, and average the \(K\) test errors. This provides a robust estimate of out-of-sample performance and helps prevent overfitting.
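Concretely, with folds \(F_1, \ldots, F_K\), the CV estimate of test error is
\[ \text{CV}_{(K)} = \frac{1}{K} \sum_{k=1}^K \text{MSE}_k, \qquad \text{MSE}_k = \frac{1}{|F_k|} \sum_{i \in F_k} \left( y_i - \hat{y}_i^{(-k)} \right)^2, \]
where \(\hat{y}_i^{(-k)}\) denotes the prediction for observation \(i\) from the model fitted without fold \(k\).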
We simulate data from a quadratic model, but pretend we do not know the true degree.
set.seed(2024)
n <- 200
x <- runif(n, -2, 2)
y <- 1 + 2 * x - 3 * x^2 + rnorm(n, sd = 1)
plot(x, y, main = "Simulated Data (Quadratic Relationship)")
We compare polynomial degrees 1 to 5 using 10-fold CV.
K <- 10
folds <- sample(rep(1:K, length.out = n))
degrees <- 1:5
cv_errors <- numeric(length(degrees))
for (d in degrees) {
  fold_mse <- numeric(K)
  for (k in 1:K) {
    test_idx <- which(folds == k)
    train_idx <- setdiff(1:n, test_idx)
    x_train <- x[train_idx]
    y_train <- y[train_idx]
    x_test <- x[test_idx]
    y_test <- y[test_idx]
    # Fit polynomial of degree d
    fit_d <- lm(y_train ~ poly(x_train, degree = d, raw = TRUE))
    y_pred <- predict(fit_d, newdata = data.frame(x_train = x_test))
    fold_mse[k] <- mean((y_test - y_pred)^2)
  }
  cv_errors[d] <- mean(fold_mse)
}
data.frame(
  degree = degrees,
  cv_mse = cv_errors
)
## degree cv_mse
## 1 1 14.7734847
## 2 2 0.9378318
## 3 3 0.8953893
## 4 4 0.8965147
## 5 5 0.9192455
plot(degrees, cv_errors, type = "b", xlab = "Polynomial Degree",
     ylab = "10-fold CV MSE", main = "Choosing Degree via Cross-Validation")
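The CV error drops sharply from degree 1 to 2 and is nearly flat for degrees 2 to 5, so the true degree 2 performs essentially as well as the CV minimiser. The latter can be read off directly:
# Degree with the smallest CV error (degree 3 in the run shown above)
degrees[which.min(cv_errors)]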
We illustrate K-fold cross-validation (CV) for choosing whether to include a quadratic term in regression.
Model 1: \[ mpg \sim wt + hp \]
Model 2: \[ mpg \sim wt + hp + I(wt^2) \]
We split the mtcars dataset into \(K\) folds (e.g., \(K = 5\) or \(10\)).
data(mtcars)
y <- mtcars$mpg
X1 <- mtcars[, c("wt", "hp")]
X2 <- data.frame(wt = mtcars$wt,
                 hp = mtcars$hp,
                 wt2 = mtcars$wt^2)
n <- nrow(mtcars)
K <- 5
set.seed(42)
folds <- sample(rep(1:K, length.out = n))
cv_err_m1 <- numeric(K)
cv_err_m2 <- numeric(K)
for (k in 1:K) {
  test_idx <- which(folds == k)
  train_idx <- setdiff(1:n, test_idx)
  # Model 1
  fit1 <- lm(y ~ ., data = data.frame(y = y[train_idx], X1[train_idx, ]))
  pred1 <- predict(fit1, newdata = X1[test_idx, ])
  cv_err_m1[k] <- mean((y[test_idx] - pred1)^2)
  # Model 2
  fit2 <- lm(y ~ ., data = data.frame(y = y[train_idx], X2[train_idx, ]))
  pred2 <- predict(fit2, newdata = X2[test_idx, ])
  cv_err_m2[k] <- mean((y[test_idx] - pred2)^2)
}
mean_cv_m1 <- mean(cv_err_m1)
mean_cv_m2 <- mean(cv_err_m2)
c(
  mean_cv_m1 = mean_cv_m1,
  mean_cv_m2 = mean_cv_m2
)
## mean_cv_m1 mean_cv_m2
## 7.887135 5.320388
In this run the quadratic model attains the clearly lower CV error (about 5.3 vs 7.9), so cross-validation favours including the \(wt^2\) term. Together, these resampling methods form a powerful practical toolkit for modern data analysis, especially when analytic formulae are difficult or unreliable.