if (!requireNamespace("MASS", quietly = TRUE)) install.packages("MASS")
if (!requireNamespace("boot", quietly = TRUE)) install.packages("boot")
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
library(MASS)
library(boot)
library(ggplot2)
We use the built-in quakes dataset (R’s
datasets package), which records \(n = 1000\) seismic events near Fiji since
1964. It is real-world observational data (not a simulation). We
take
mag — Richter
magnitude of the eventdepth — focal
depth of the event (km)Both variables are continuous.
data(quakes)
dat <- quakes[, c("mag", "depth")]
names(dat) <- c("X", "Y")
n <- nrow(dat)
n
## [1] 1000
summary(dat)
## X Y
## Min. :4.00 Min. : 40.0
## 1st Qu.:4.30 1st Qu.: 99.0
## Median :4.60 Median :247.0
## Mean :4.62 Mean :311.4
## 3rd Qu.:4.90 3rd Qu.:543.0
## Max. :6.40 Max. :680.0
p1 <- ggplot(dat, aes(x = X)) +
geom_histogram(bins = 30, fill = "steelblue", colour = "white") +
labs(title = "Histogram of X (mag)", x = "Richter magnitude", y = "Count") +
theme_minimal()
p2 <- ggplot(dat, aes(x = Y)) +
geom_histogram(bins = 30, fill = "tomato", colour = "white") +
labs(title = "Histogram of Y (depth)", x = "Depth (km)", y = "Count") +
theme_minimal()
p3 <- ggplot(dat, aes(sample = X)) +
stat_qq(colour = "steelblue") + stat_qq_line() +
labs(title = "Q-Q plot of X (mag)",
x = "Theoretical quantiles", y = "Sample quantiles") +
theme_minimal()
p4 <- ggplot(dat, aes(sample = Y)) +
stat_qq(colour = "tomato") + stat_qq_line() +
labs(title = "Q-Q plot of Y (depth)",
x = "Theoretical quantiles", y = "Sample quantiles") +
theme_minimal()
if (requireNamespace("gridExtra", quietly = TRUE)) {
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)
} else {
print(p1); print(p2); print(p3); print(p4)
}
sw_X <- shapiro.test(dat$X)
sw_Y <- shapiro.test(dat$Y)
skewness <- function(z) mean((z - mean(z))^3) / sd(z)^3
kurtosis <- function(z) mean((z - mean(z))^4) / sd(z)^4 - 3
shape_tbl <- data.frame(
variable = c("X (mag)", "Y (depth)"),
mean = c(mean(dat$X), mean(dat$Y)),
sd = c(sd(dat$X), sd(dat$Y)),
skewness = c(skewness(dat$X), skewness(dat$Y)),
ex_kurtosis = c(kurtosis(dat$X), kurtosis(dat$Y)),
shapiro_W = c(sw_X$statistic, sw_Y$statistic),
shapiro_p = c(sw_X$p.value, sw_Y$p.value)
)
knitr::kable(shape_tbl, digits = 4,
caption = "Normality diagnostics for X and Y")
| variable | mean | sd | skewness | ex_kurtosis | shapiro_W | shapiro_p |
|---|---|---|---|---|---|---|
| X (mag) | 4.6204 | 0.4028 | 0.7674 | 0.5033 | 0.9538 | 0 |
| Y (depth) | 311.3710 | 215.5355 | 0.1979 | -1.5980 | 0.8698 | 0 |
Conclusion of Part 1. The Q-Q plots show clear departures from normality in both variables. \(X\) (magnitude) is right-skewed and somewhat discretised because magnitudes are reported to one decimal place. \(Y\) (depth) is strongly bimodal — shallow events near 50 km coexist with deep-focus events near 500 km — and is nowhere near Gaussian. Shapiro–Wilk rejects normality at any conventional level for both variables. So neither \(X\) nor \(Y\) is normally distributed.
For the means we use the Student-\(t\) interval
\[ \bar{x} \;\pm\; t_{n-1,\,0.975}\,\frac{s}{\sqrt{n}}. \]
For the Pearson correlation \(\rho\) we use Fisher’s \(z\) transformation:
\[ z = \tfrac{1}{2}\log\!\frac{1+r}{1-r}, \qquad z \sim \mathcal{N}\!\left(\tfrac{1}{2}\log\tfrac{1+\rho}{1-\rho},\;\tfrac{1}{n-3}\right), \]
so the 95% CI for \(\rho\) is obtained by inverting the \(z\)-interval \(z \pm 1.96/\sqrt{n-3}\).
ci_mean_X <- as.numeric(t.test(dat$X)$conf.int)
ci_mean_Y <- as.numeric(t.test(dat$Y)$conf.int)
ct <- cor.test(dat$X, dat$Y)
r <- unname(ct$estimate)
ci_corr <- as.numeric(ct$conf.int)
theory_tbl <- data.frame(
parameter = c("mean(X)", "mean(Y)", "cor(X,Y)"),
estimate = c(mean(dat$X), mean(dat$Y), r),
ci_lower = c(ci_mean_X[1], ci_mean_Y[1], ci_corr[1]),
ci_upper = c(ci_mean_X[2], ci_mean_Y[2], ci_corr[2])
)
knitr::kable(theory_tbl, digits = 4,
caption = "Theoretical 95% confidence intervals")
| parameter | estimate | ci_lower | ci_upper |
|---|---|---|---|
| mean(X) | 4.6204 | 4.5954 | 4.6454 |
| mean(Y) | 311.3710 | 297.9960 | 324.7460 |
| cor(X,Y) | -0.2306 | -0.2885 | -0.1711 |
boot packageThe boot package needs a statistic with signature
function(data, indices). We define one statistic per
quantity of interest, call boot() to draw the resamples,
then boot.ci(type = "perc") for the percentile CI.
stat_mean_X <- function(data, i) mean(data$X[i])
stat_mean_Y <- function(data, i) mean(data$Y[i])
stat_cor <- function(data, i) cor(data$X[i], data$Y[i])
run_boot <- function(B) {
bX <- boot(dat, stat_mean_X, R = B)
bY <- boot(dat, stat_mean_Y, R = B)
bC <- boot(dat, stat_cor, R = B)
get_perc <- function(bo) {
ci <- boot.ci(bo, type = "perc")$percent
c(estimate = bo$t0,
ci_lower = ci[4],
ci_upper = ci[5],
se = sd(bo$t))
}
rbind(
data.frame(B = B, parameter = "mean(X)", as.list(get_perc(bX))),
data.frame(B = B, parameter = "mean(Y)", as.list(get_perc(bY))),
data.frame(B = B, parameter = "cor(X,Y)", as.list(get_perc(bC)))
)
}
B_vals <- c(500, 1000, 5000)
boot_results <- do.call(rbind, lapply(B_vals, run_boot))
knitr::kable(boot_results, digits = 4,
caption = "Percentile bootstrap 95% CIs from boot::boot.ci()")
| B | parameter | estimate | ci_lower | ci_upper | se |
|---|---|---|---|---|---|
| 500 | mean(X) | 4.6204 | 4.5933 | 4.6462 | 0.0132 |
| 500 | mean(Y) | 311.3710 | 297.4225 | 325.3042 | 6.9040 |
| 500 | cor(X,Y) | -0.2306 | -0.2857 | -0.1702 | 0.0296 |
| 1000 | mean(X) | 4.6204 | 4.5957 | 4.6464 | 0.0129 |
| 1000 | mean(Y) | 311.3710 | 298.5682 | 324.5658 | 6.6771 |
| 1000 | cor(X,Y) | -0.2306 | -0.2888 | -0.1712 | 0.0301 |
| 5000 | mean(X) | 4.6204 | 4.5959 | 4.6446 | 0.0126 |
| 5000 | mean(Y) | 311.3710 | 298.2781 | 324.6758 | 6.7425 |
| 5000 | cor(X,Y) | -0.2306 | -0.2883 | -0.1730 | 0.0291 |
B <- 5000
boot_mX <- boot(dat, stat_mean_X, R = B)
boot_mY <- boot(dat, stat_mean_Y, R = B)
boot_cr <- boot(dat, stat_cor, R = B)
boot_df <- data.frame(
value = c(boot_mX$t, boot_mY$t, boot_cr$t),
parameter = rep(c("mean(X)", "mean(Y)", "cor(X,Y)"), each = B)
)
ci_df <- data.frame(
parameter = c("mean(X)", "mean(X)", "mean(Y)", "mean(Y)",
"cor(X,Y)", "cor(X,Y)"),
endpoint = c(ci_mean_X, ci_mean_Y, ci_corr)
)
ggplot(boot_df, aes(x = value)) +
geom_histogram(bins = 40, fill = "steelblue", colour = "white") +
geom_vline(data = ci_df, aes(xintercept = endpoint),
colour = "red", linetype = "dashed", linewidth = 0.7) +
facet_wrap(~ parameter, scales = "free", nrow = 1) +
labs(title = "Bootstrap sampling distributions (B = 5000)",
subtitle = "Red dashed lines: theoretical 95% CI endpoints from Part 2",
x = NULL, y = "Count") +
theme_minimal()
The red dashed lines mark the theoretical CI endpoints from Part 2; they sit right on the edges of the bootstrap histograms.
compare <- rbind(
data.frame(parameter = theory_tbl$parameter,
method = "Theoretical",
B = NA_integer_,
estimate = theory_tbl$estimate,
ci_lower = theory_tbl$ci_lower,
ci_upper = theory_tbl$ci_upper),
data.frame(parameter = boot_results$parameter,
method = "Bootstrap",
B = boot_results$B,
estimate = boot_results$estimate,
ci_lower = boot_results$ci_lower,
ci_upper = boot_results$ci_upper)
)
compare <- compare[order(compare$parameter, compare$method, compare$B), ]
rownames(compare) <- NULL
knitr::kable(compare, digits = 4,
caption = "Theoretical vs. bootstrap 95% CIs")
| parameter | method | B | estimate | ci_lower | ci_upper |
|---|---|---|---|---|---|
| cor(X,Y) | Bootstrap | 500 | -0.2306 | -0.2857 | -0.1702 |
| cor(X,Y) | Bootstrap | 1000 | -0.2306 | -0.2888 | -0.1712 |
| cor(X,Y) | Bootstrap | 5000 | -0.2306 | -0.2883 | -0.1730 |
| cor(X,Y) | Theoretical | NA | -0.2306 | -0.2885 | -0.1711 |
| mean(X) | Bootstrap | 500 | 4.6204 | 4.5933 | 4.6462 |
| mean(X) | Bootstrap | 1000 | 4.6204 | 4.5957 | 4.6464 |
| mean(X) | Bootstrap | 5000 | 4.6204 | 4.5959 | 4.6446 |
| mean(X) | Theoretical | NA | 4.6204 | 4.5954 | 4.6454 |
| mean(Y) | Bootstrap | 500 | 311.3710 | 297.4225 | 325.3042 |
| mean(Y) | Bootstrap | 1000 | 311.3710 | 298.5682 | 324.5658 |
| mean(Y) | Bootstrap | 5000 | 311.3710 | 298.2781 | 324.6758 |
| mean(Y) | Theoretical | NA | 311.3710 | 297.9960 | 324.7460 |
plot_df <- compare
plot_df$label <- ifelse(is.na(plot_df$B), "Theoretical",
paste0("Boot, B=", plot_df$B))
plot_df$label <- factor(plot_df$label,
levels = c("Theoretical", "Boot, B=500",
"Boot, B=1000", "Boot, B=5000"))
ggplot(plot_df, aes(x = label, y = estimate)) +
geom_point(size = 2) +
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
facet_wrap(~ parameter, scales = "free_y") +
labs(title = "95% CIs: theoretical vs. bootstrap at three values of B",
x = NULL, y = "Estimate ± 95% CI") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 25, hjust = 1))
Effect of \(B\). Increasing the number of bootstrap replicates \(B\) shrinks the Monte-Carlo error in the estimated quantiles but does not change the asymptotic target the bootstrap is approximating. \(B = 500\) already agrees reasonably with the theoretical intervals, but the 2.5% and 97.5% quantiles are themselves noisy because each is determined by only \(\approx 0.025 B = 12\) replicates in the tails. By \(B = 1000\) the endpoints stabilise to two decimal places, and by \(B = 5000\) they are indistinguishable from the theoretical CIs for the two means, and very close for the correlation. A bigger \(B\) does not reduce statistical uncertainty (that is fixed by \(n\)) — it only reduces simulation error.
Does the normality of \(X\) and \(Y\) matter for the bootstrap?
So normality affects what the theoretical interval is approximating, more than it affects the bootstrap. Bootstrap accuracy depends mainly on \(n\) (so that the empirical CDF is close to the true CDF) and on the smoothness of the statistic, not on the marginal shape of \(X\) and \(Y\).
We draw a small subsample and rerun both procedures.
set.seed(7)
small_idx <- sample.int(n, 50)
small_dat <- dat[small_idx, ]
small_n <- nrow(small_dat)
ci_mX_s <- as.numeric(t.test(small_dat$X)$conf.int)
ci_mY_s <- as.numeric(t.test(small_dat$Y)$conf.int)
ci_r_s <- as.numeric(cor.test(small_dat$X, small_dat$Y)$conf.int)
sbX <- boot(small_dat, stat_mean_X, R = 5000)
sbY <- boot(small_dat, stat_mean_Y, R = 5000)
sbC <- boot(small_dat, stat_cor, R = 5000)
perc <- function(bo) boot.ci(bo, type = "perc")$percent[4:5]
small_compare <- data.frame(
parameter = c("mean(X)", "mean(Y)", "cor(X,Y)"),
theory_lower = c(ci_mX_s[1], ci_mY_s[1], ci_r_s[1]),
theory_upper = c(ci_mX_s[2], ci_mY_s[2], ci_r_s[2]),
boot_lower = c(perc(sbX)[1], perc(sbY)[1], perc(sbC)[1]),
boot_upper = c(perc(sbX)[2], perc(sbY)[2], perc(sbC)[2])
)
knitr::kable(small_compare, digits = 4,
caption = "n = 50: theoretical vs. percentile-bootstrap CIs")
| parameter | theory_lower | theory_upper | boot_lower | boot_upper |
|---|---|---|---|---|
| mean(X) | 4.4689 | 4.7271 | 4.4740 | 4.7300 |
| mean(Y) | 289.3673 | 414.1527 | 290.7850 | 412.3945 |
| cor(X,Y) | -0.4545 | 0.0812 | -0.4327 | 0.0359 |
Expectations and observations for small \(n\).
boot::boot.ci()
via type = "bca" or type = "stud" — correct
for bias and skewness and are preferred at small \(n\).In short, the bootstrap is an asymptotic procedure. It is excellent at \(n = 1000\), mediocre at \(n < 100\), and increasingly unreliable as \(n\) shrinks or as the data become more skewed/heavy-tailed.