if (!requireNamespace("MASS",    quietly = TRUE)) install.packages("MASS")
if (!requireNamespace("boot",    quietly = TRUE)) install.packages("boot")
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")

library(MASS) 
library(boot)
library(ggplot2)

Part 1 — Real-world data and normality assessment

We use the built-in quakes dataset (R’s datasets package), which records \(n = 1000\) seismic events near Fiji since 1964. It is real-world observational data (not a simulation). We take

\(X\) = mag — Richter magnitude of the event
\(Y\) = depth — focal depth of the event (km)

Both variables are continuous.

data(quakes)
dat <- quakes[, c("mag", "depth")]
names(dat) <- c("X", "Y")
n <- nrow(dat)
n

## [1] 1000

summary(dat)

##        X              Y        
##  Min.   :4.00   Min.   : 40.0  
##  1st Qu.:4.30   1st Qu.: 99.0  
##  Median :4.60   Median :247.0  
##  Mean   :4.62   Mean   :311.4  
##  3rd Qu.:4.90   3rd Qu.:543.0  
##  Max.   :6.40   Max.   :680.0

Visual assessment with ggplot2

p1 <- ggplot(dat, aes(x = X)) +
  geom_histogram(bins = 30, fill = "steelblue", colour = "white") +
  labs(title = "Histogram of X (mag)", x = "Richter magnitude", y = "Count") +
  theme_minimal()

p2 <- ggplot(dat, aes(x = Y)) +
  geom_histogram(bins = 30, fill = "tomato", colour = "white") +
  labs(title = "Histogram of Y (depth)", x = "Depth (km)", y = "Count") +
  theme_minimal()

p3 <- ggplot(dat, aes(sample = X)) +
  stat_qq(colour = "steelblue") + stat_qq_line() +
  labs(title = "Q-Q plot of X (mag)",
       x = "Theoretical quantiles", y = "Sample quantiles") +
  theme_minimal()

p4 <- ggplot(dat, aes(sample = Y)) +
  stat_qq(colour = "tomato") + stat_qq_line() +
  labs(title = "Q-Q plot of Y (depth)",
       x = "Theoretical quantiles", y = "Sample quantiles") +
  theme_minimal()

if (requireNamespace("gridExtra", quietly = TRUE)) {
  gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)
} else {
  print(p1); print(p2); print(p3); print(p4)
}

Formal tests and shape measures

sw_X <- shapiro.test(dat$X)
sw_Y <- shapiro.test(dat$Y)

skewness <- function(z) mean((z - mean(z))^3) / sd(z)^3
kurtosis <- function(z) mean((z - mean(z))^4) / sd(z)^4 - 3

shape_tbl <- data.frame(
  variable    = c("X (mag)", "Y (depth)"),
  mean        = c(mean(dat$X), mean(dat$Y)),
  sd          = c(sd(dat$X),   sd(dat$Y)),
  skewness    = c(skewness(dat$X), skewness(dat$Y)),
  ex_kurtosis = c(kurtosis(dat$X), kurtosis(dat$Y)),
  shapiro_W   = c(sw_X$statistic, sw_Y$statistic),
  shapiro_p   = c(sw_X$p.value,   sw_Y$p.value)
)
knitr::kable(shape_tbl, digits = 4,
             caption = "Normality diagnostics for X and Y")

Normality diagnostics for X and Y
variable	mean	sd	skewness	ex_kurtosis	shapiro_W	shapiro_p
X (mag)	4.6204	0.4028	0.7674	0.5033	0.9538	0
Y (depth)	311.3710	215.5355	0.1979	-1.5980	0.8698	0

Conclusion of Part 1. The Q-Q plots show clear departures from normality in both variables. \(X\) (magnitude) is right-skewed and somewhat discretised because magnitudes are reported to one decimal place. \(Y\) (depth) is strongly bimodal — shallow events near 50 km coexist with deep-focus events near 500 km — and is nowhere near Gaussian. Shapiro–Wilk rejects normality at any conventional level for both variables. So neither \(X\) nor \(Y\) is normally distributed.

Part 2 — Theoretical 95% confidence intervals

For the means we use the Student-\(t\) interval

\[ \bar{x} \;\pm\; t_{n-1,\,0.975}\,\frac{s}{\sqrt{n}}. \]

For the Pearson correlation \(\rho\) we use Fisher’s \(z\) transformation:

\[ z = \tfrac{1}{2}\log\!\frac{1+r}{1-r}, \qquad z \sim \mathcal{N}\!\left(\tfrac{1}{2}\log\tfrac{1+\rho}{1-\rho},\;\tfrac{1}{n-3}\right), \]

so the 95% CI for \(\rho\) is obtained by inverting the \(z\)-interval \(z \pm 1.96/\sqrt{n-3}\).

ci_mean_X <- as.numeric(t.test(dat$X)$conf.int)
ci_mean_Y <- as.numeric(t.test(dat$Y)$conf.int)

ct      <- cor.test(dat$X, dat$Y)
r       <- unname(ct$estimate)
ci_corr <- as.numeric(ct$conf.int)

theory_tbl <- data.frame(
  parameter = c("mean(X)", "mean(Y)", "cor(X,Y)"),
  estimate  = c(mean(dat$X), mean(dat$Y), r),
  ci_lower  = c(ci_mean_X[1], ci_mean_Y[1], ci_corr[1]),
  ci_upper  = c(ci_mean_X[2], ci_mean_Y[2], ci_corr[2])
)
knitr::kable(theory_tbl, digits = 4,
             caption = "Theoretical 95% confidence intervals")

Theoretical 95% confidence intervals
parameter	estimate	ci_lower	ci_upper
mean(X)	4.6204	4.5954	4.6454
mean(Y)	311.3710	297.9960	324.7460
cor(X,Y)	-0.2306	-0.2885	-0.1711

Part 3 — Bootstrap 95% confidence intervals via the `boot` package

The boot package needs a statistic with signature function(data, indices). We define one statistic per quantity of interest, call boot() to draw the resamples, then boot.ci(type = "perc") for the percentile CI.

stat_mean_X <- function(data, i) mean(data$X[i])
stat_mean_Y <- function(data, i) mean(data$Y[i])
stat_cor    <- function(data, i) cor(data$X[i], data$Y[i])

run_boot <- function(B) {
  bX <- boot(dat, stat_mean_X, R = B)
  bY <- boot(dat, stat_mean_Y, R = B)
  bC <- boot(dat, stat_cor,    R = B)

  get_perc <- function(bo) {
    ci <- boot.ci(bo, type = "perc")$percent
    c(estimate = bo$t0,
      ci_lower = ci[4],
      ci_upper = ci[5],
      se       = sd(bo$t))
  }

  rbind(
    data.frame(B = B, parameter = "mean(X)",  as.list(get_perc(bX))),
    data.frame(B = B, parameter = "mean(Y)",  as.list(get_perc(bY))),
    data.frame(B = B, parameter = "cor(X,Y)", as.list(get_perc(bC)))
  )
}

B_vals <- c(500, 1000, 5000)
boot_results <- do.call(rbind, lapply(B_vals, run_boot))

knitr::kable(boot_results, digits = 4,
             caption = "Percentile bootstrap 95% CIs from boot::boot.ci()")

Percentile bootstrap 95% CIs from boot::boot.ci()
B	parameter	estimate	ci_lower	ci_upper	se
500	mean(X)	4.6204	4.5933	4.6462	0.0132
500	mean(Y)	311.3710	297.4225	325.3042	6.9040
500	cor(X,Y)	-0.2306	-0.2857	-0.1702	0.0296
1000	mean(X)	4.6204	4.5957	4.6464	0.0129
1000	mean(Y)	311.3710	298.5682	324.5658	6.6771
1000	cor(X,Y)	-0.2306	-0.2888	-0.1712	0.0301
5000	mean(X)	4.6204	4.5959	4.6446	0.0126
5000	mean(Y)	311.3710	298.2781	324.6758	6.7425
5000	cor(X,Y)	-0.2306	-0.2883	-0.1730	0.0291

Bootstrap sampling distributions at \(B = 5000\)

B <- 5000
boot_mX <- boot(dat, stat_mean_X, R = B)
boot_mY <- boot(dat, stat_mean_Y, R = B)
boot_cr <- boot(dat, stat_cor,    R = B)

boot_df <- data.frame(
  value     = c(boot_mX$t, boot_mY$t, boot_cr$t),
  parameter = rep(c("mean(X)", "mean(Y)", "cor(X,Y)"), each = B)
)
ci_df <- data.frame(
  parameter = c("mean(X)", "mean(X)", "mean(Y)", "mean(Y)",
                "cor(X,Y)", "cor(X,Y)"),
  endpoint  = c(ci_mean_X, ci_mean_Y, ci_corr)
)

ggplot(boot_df, aes(x = value)) +
  geom_histogram(bins = 40, fill = "steelblue", colour = "white") +
  geom_vline(data = ci_df, aes(xintercept = endpoint),
             colour = "red", linetype = "dashed", linewidth = 0.7) +
  facet_wrap(~ parameter, scales = "free", nrow = 1) +
  labs(title = "Bootstrap sampling distributions (B = 5000)",
       subtitle = "Red dashed lines: theoretical 95% CI endpoints from Part 2",
       x = NULL, y = "Count") +
  theme_minimal()

The red dashed lines mark the theoretical CI endpoints from Part 2; they sit right on the edges of the bootstrap histograms.

Part 4 — Comparing theoretical and bootstrap intervals

compare <- rbind(
  data.frame(parameter = theory_tbl$parameter,
             method    = "Theoretical",
             B         = NA_integer_,
             estimate  = theory_tbl$estimate,
             ci_lower  = theory_tbl$ci_lower,
             ci_upper  = theory_tbl$ci_upper),
  data.frame(parameter = boot_results$parameter,
             method    = "Bootstrap",
             B         = boot_results$B,
             estimate  = boot_results$estimate,
             ci_lower  = boot_results$ci_lower,
             ci_upper  = boot_results$ci_upper)
)
compare <- compare[order(compare$parameter, compare$method, compare$B), ]
rownames(compare) <- NULL
knitr::kable(compare, digits = 4,
             caption = "Theoretical vs. bootstrap 95% CIs")

Theoretical vs. bootstrap 95% CIs
parameter	method	B	estimate	ci_lower	ci_upper
cor(X,Y)	Bootstrap	500	-0.2306	-0.2857	-0.1702
cor(X,Y)	Bootstrap	1000	-0.2306	-0.2888	-0.1712
cor(X,Y)	Bootstrap	5000	-0.2306	-0.2883	-0.1730
cor(X,Y)	Theoretical	NA	-0.2306	-0.2885	-0.1711
mean(X)	Bootstrap	500	4.6204	4.5933	4.6462
mean(X)	Bootstrap	1000	4.6204	4.5957	4.6464
mean(X)	Bootstrap	5000	4.6204	4.5959	4.6446
mean(X)	Theoretical	NA	4.6204	4.5954	4.6454
mean(Y)	Bootstrap	500	311.3710	297.4225	325.3042
mean(Y)	Bootstrap	1000	311.3710	298.5682	324.5658
mean(Y)	Bootstrap	5000	311.3710	298.2781	324.6758
mean(Y)	Theoretical	NA	311.3710	297.9960	324.7460

plot_df <- compare
plot_df$label <- ifelse(is.na(plot_df$B), "Theoretical",
                        paste0("Boot, B=", plot_df$B))
plot_df$label <- factor(plot_df$label,
                        levels = c("Theoretical", "Boot, B=500",
                                   "Boot, B=1000", "Boot, B=5000"))

ggplot(plot_df, aes(x = label, y = estimate)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
  facet_wrap(~ parameter, scales = "free_y") +
  labs(title = "95% CIs: theoretical vs. bootstrap at three values of B",
       x = NULL, y = "Estimate ± 95% CI") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 25, hjust = 1))

Effect of \(B\). Increasing the number of bootstrap replicates \(B\) shrinks the Monte-Carlo error in the estimated quantiles but does not change the asymptotic target the bootstrap is approximating. \(B = 500\) already agrees reasonably with the theoretical intervals, but the 2.5% and 97.5% quantiles are themselves noisy because each is determined by only \(\approx 0.025 B = 12\) replicates in the tails. By \(B = 1000\) the endpoints stabilise to two decimal places, and by \(B = 5000\) they are indistinguishable from the theoretical CIs for the two means, and very close for the correlation. A bigger \(B\) does not reduce statistical uncertainty (that is fixed by \(n\)) — it only reduces simulation error.

Does the normality of \(X\) and \(Y\) matter for the bootstrap?

For the means, the theoretical \(t\)-CI relies on the CLT applied to \(\bar X\) and \(\bar Y\), not on the marginal normality of \(X\) and \(Y\). With \(n = 1000\) the CLT is essentially exact, and the bootstrap matches the \(t\)-interval to four decimal places even though both \(X\) and \(Y\) are strongly non-normal. So the bootstrap is robust to non-normality for the mean.
For the correlation, Fisher’s \(z\) CI is strictly derived under the assumption that \((X, Y)\) is bivariate normal. When that assumption fails (as here), the nominal 95% coverage of the theoretical CI is no longer guaranteed. The bootstrap CI for the correlation, by contrast, makes no parametric assumption — it estimates the sampling distribution from the data themselves. Both intervals look similar in this large-\(n\) example, but in general the bootstrap is the more trustworthy of the two when \(X\) and \(Y\) are non-normal.

So normality affects what the theoretical interval is approximating, more than it affects the bootstrap. Bootstrap accuracy depends mainly on \(n\) (so that the empirical CDF is close to the true CDF) and on the smoothness of the statistic, not on the marginal shape of \(X\) and \(Y\).

Part 5 — What happens for small \(n\) (\(n < 100\))?

We draw a small subsample and rerun both procedures.

set.seed(7)
small_idx <- sample.int(n, 50)
small_dat <- dat[small_idx, ]
small_n   <- nrow(small_dat)

ci_mX_s <- as.numeric(t.test(small_dat$X)$conf.int)
ci_mY_s <- as.numeric(t.test(small_dat$Y)$conf.int)
ci_r_s  <- as.numeric(cor.test(small_dat$X, small_dat$Y)$conf.int)

sbX <- boot(small_dat, stat_mean_X, R = 5000)
sbY <- boot(small_dat, stat_mean_Y, R = 5000)
sbC <- boot(small_dat, stat_cor,    R = 5000)

perc <- function(bo) boot.ci(bo, type = "perc")$percent[4:5]

small_compare <- data.frame(
  parameter    = c("mean(X)", "mean(Y)", "cor(X,Y)"),
  theory_lower = c(ci_mX_s[1], ci_mY_s[1], ci_r_s[1]),
  theory_upper = c(ci_mX_s[2], ci_mY_s[2], ci_r_s[2]),
  boot_lower   = c(perc(sbX)[1], perc(sbY)[1], perc(sbC)[1]),
  boot_upper   = c(perc(sbX)[2], perc(sbY)[2], perc(sbC)[2])
)
knitr::kable(small_compare, digits = 4,
             caption = "n = 50: theoretical vs. percentile-bootstrap CIs")

n = 50: theoretical vs. percentile-bootstrap CIs
parameter	theory_lower	theory_upper	boot_lower	boot_upper
mean(X)	4.4689	4.7271	4.4740	4.7300
mean(Y)	289.3673	414.1527	290.7850	412.3945
cor(X,Y)	-0.4545	0.0812	-0.4327	0.0359

Expectations and observations for small \(n\).

The bootstrap relies on the empirical CDF \(\hat F_n\) being a good substitute for the true CDF \(F\). For small \(n\), \(\hat F_n\) is a coarse, jagged approximation, so the bootstrap sampling distribution is itself a poor approximation of the true one. Coverage of the percentile bootstrap drops below the nominal 95%, and the intervals tend to be too narrow.
The percentile interval also inherits any bias and skewness in the statistic. For the correlation, the small-sample sampling distribution is noticeably skewed, and the simple percentile interval under-covers. More refined variants — BCa or studentized bootstrap, both available in boot::boot.ci() via type = "bca" or type = "stud" — correct for bias and skewness and are preferred at small \(n\).
When the underlying \(X\) or \(Y\) are heavy-tailed or strongly skewed, the problem is worse: rare extreme values may be absent from the original sample altogether, so no resampling scheme can recover their influence. This is where non-normality starts to matter for the bootstrap, and it is precisely the regime where the theoretical \(t\)-interval — which only assumes a finite variance and the CLT — can actually do better than the bootstrap for the mean.
Rule of thumb: the simple percentile bootstrap is reliable for the mean once \(n\) is in the low hundreds; for the correlation, prefer BCa, especially when \(|\rho|\) is large or the joint distribution is far from bivariate normal.

In short, the bootstrap is an asymptotic procedure. It is excellent at \(n = 1000\), mediocre at \(n < 100\), and increasingly unreliable as \(n\) shrinks or as the data become more skewed/heavy-tailed.

Bootstrap Confidence Intervals for Means and Correlation

Sy Cong Nguyen

May 13, 2026

Part 1 — Real-world data and normality assessment

Visual assessment with ggplot2

Formal tests and shape measures

Part 2 — Theoretical 95% confidence intervals

Part 3 — Bootstrap 95% confidence intervals via the `boot` package

Bootstrap sampling distributions at \(B = 5000\)

Part 4 — Comparing theoretical and bootstrap intervals

Part 5 — What happens for small \(n\) (\(n < 100\))?

Bootstrap Confidence Intervals for Means and Correlation

Sy Cong Nguyen

May 13, 2026

Part 1 — Real-world data and normality assessment

Visual assessment with ggplot2

Formal tests and shape measures

Part 2 — Theoretical 95% confidence intervals

Part 3 — Bootstrap 95% confidence intervals via the boot package

Bootstrap sampling distributions at \(B = 5000\)

Part 4 — Comparing theoretical and bootstrap intervals

Part 5 — What happens for small \(n\) (\(n < 100\))?

Part 3 — Bootstrap 95% confidence intervals via the `boot` package