if (!requireNamespace("MASS",    quietly = TRUE)) install.packages("MASS")
if (!requireNamespace("boot",    quietly = TRUE)) install.packages("boot")
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")

library(MASS) 
library(boot)
library(ggplot2)

Part 1 — Real-world data and normality assessment

We use the built-in quakes dataset (R’s datasets package), which records \(n = 1000\) seismic events near Fiji since 1964. It is real-world observational data (not a simulation). We take

Both variables are continuous.

data(quakes)
dat <- quakes[, c("mag", "depth")]
names(dat) <- c("X", "Y")
n <- nrow(dat)
n
## [1] 1000
summary(dat)
##        X              Y        
##  Min.   :4.00   Min.   : 40.0  
##  1st Qu.:4.30   1st Qu.: 99.0  
##  Median :4.60   Median :247.0  
##  Mean   :4.62   Mean   :311.4  
##  3rd Qu.:4.90   3rd Qu.:543.0  
##  Max.   :6.40   Max.   :680.0

Visual assessment with ggplot2

p1 <- ggplot(dat, aes(x = X)) +
  geom_histogram(bins = 30, fill = "steelblue", colour = "white") +
  labs(title = "Histogram of X (mag)", x = "Richter magnitude", y = "Count") +
  theme_minimal()

p2 <- ggplot(dat, aes(x = Y)) +
  geom_histogram(bins = 30, fill = "tomato", colour = "white") +
  labs(title = "Histogram of Y (depth)", x = "Depth (km)", y = "Count") +
  theme_minimal()

p3 <- ggplot(dat, aes(sample = X)) +
  stat_qq(colour = "steelblue") + stat_qq_line() +
  labs(title = "Q-Q plot of X (mag)",
       x = "Theoretical quantiles", y = "Sample quantiles") +
  theme_minimal()

p4 <- ggplot(dat, aes(sample = Y)) +
  stat_qq(colour = "tomato") + stat_qq_line() +
  labs(title = "Q-Q plot of Y (depth)",
       x = "Theoretical quantiles", y = "Sample quantiles") +
  theme_minimal()

if (requireNamespace("gridExtra", quietly = TRUE)) {
  gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)
} else {
  print(p1); print(p2); print(p3); print(p4)
}

Formal tests and shape measures

sw_X <- shapiro.test(dat$X)
sw_Y <- shapiro.test(dat$Y)

skewness <- function(z) mean((z - mean(z))^3) / sd(z)^3
kurtosis <- function(z) mean((z - mean(z))^4) / sd(z)^4 - 3

shape_tbl <- data.frame(
  variable    = c("X (mag)", "Y (depth)"),
  mean        = c(mean(dat$X), mean(dat$Y)),
  sd          = c(sd(dat$X),   sd(dat$Y)),
  skewness    = c(skewness(dat$X), skewness(dat$Y)),
  ex_kurtosis = c(kurtosis(dat$X), kurtosis(dat$Y)),
  shapiro_W   = c(sw_X$statistic, sw_Y$statistic),
  shapiro_p   = c(sw_X$p.value,   sw_Y$p.value)
)
knitr::kable(shape_tbl, digits = 4,
             caption = "Normality diagnostics for X and Y")
Normality diagnostics for X and Y
variable mean sd skewness ex_kurtosis shapiro_W shapiro_p
X (mag) 4.6204 0.4028 0.7674 0.5033 0.9538 0
Y (depth) 311.3710 215.5355 0.1979 -1.5980 0.8698 0

Conclusion of Part 1. The Q-Q plots show clear departures from normality in both variables. \(X\) (magnitude) is right-skewed and somewhat discretised because magnitudes are reported to one decimal place. \(Y\) (depth) is strongly bimodal — shallow events near 50 km coexist with deep-focus events near 500 km — and is nowhere near Gaussian. Shapiro–Wilk rejects normality at any conventional level for both variables. So neither \(X\) nor \(Y\) is normally distributed.


Part 2 — Theoretical 95% confidence intervals

For the means we use the Student-\(t\) interval

\[ \bar{x} \;\pm\; t_{n-1,\,0.975}\,\frac{s}{\sqrt{n}}. \]

For the Pearson correlation \(\rho\) we use Fisher’s \(z\) transformation:

\[ z = \tfrac{1}{2}\log\!\frac{1+r}{1-r}, \qquad z \sim \mathcal{N}\!\left(\tfrac{1}{2}\log\tfrac{1+\rho}{1-\rho},\;\tfrac{1}{n-3}\right), \]

so the 95% CI for \(\rho\) is obtained by inverting the \(z\)-interval \(z \pm 1.96/\sqrt{n-3}\).

ci_mean_X <- as.numeric(t.test(dat$X)$conf.int)
ci_mean_Y <- as.numeric(t.test(dat$Y)$conf.int)

ct      <- cor.test(dat$X, dat$Y)
r       <- unname(ct$estimate)
ci_corr <- as.numeric(ct$conf.int)

theory_tbl <- data.frame(
  parameter = c("mean(X)", "mean(Y)", "cor(X,Y)"),
  estimate  = c(mean(dat$X), mean(dat$Y), r),
  ci_lower  = c(ci_mean_X[1], ci_mean_Y[1], ci_corr[1]),
  ci_upper  = c(ci_mean_X[2], ci_mean_Y[2], ci_corr[2])
)
knitr::kable(theory_tbl, digits = 4,
             caption = "Theoretical 95% confidence intervals")
Theoretical 95% confidence intervals
parameter estimate ci_lower ci_upper
mean(X) 4.6204 4.5954 4.6454
mean(Y) 311.3710 297.9960 324.7460
cor(X,Y) -0.2306 -0.2885 -0.1711

Part 3 — Bootstrap 95% confidence intervals via the boot package

The boot package needs a statistic with signature function(data, indices). We define one statistic per quantity of interest, call boot() to draw the resamples, then boot.ci(type = "perc") for the percentile CI.

stat_mean_X <- function(data, i) mean(data$X[i])
stat_mean_Y <- function(data, i) mean(data$Y[i])
stat_cor    <- function(data, i) cor(data$X[i], data$Y[i])
run_boot <- function(B) {
  bX <- boot(dat, stat_mean_X, R = B)
  bY <- boot(dat, stat_mean_Y, R = B)
  bC <- boot(dat, stat_cor,    R = B)

  get_perc <- function(bo) {
    ci <- boot.ci(bo, type = "perc")$percent
    c(estimate = bo$t0,
      ci_lower = ci[4],
      ci_upper = ci[5],
      se       = sd(bo$t))
  }

  rbind(
    data.frame(B = B, parameter = "mean(X)",  as.list(get_perc(bX))),
    data.frame(B = B, parameter = "mean(Y)",  as.list(get_perc(bY))),
    data.frame(B = B, parameter = "cor(X,Y)", as.list(get_perc(bC)))
  )
}

B_vals <- c(500, 1000, 5000)
boot_results <- do.call(rbind, lapply(B_vals, run_boot))

knitr::kable(boot_results, digits = 4,
             caption = "Percentile bootstrap 95% CIs from boot::boot.ci()")
Percentile bootstrap 95% CIs from boot::boot.ci()
B parameter estimate ci_lower ci_upper se
500 mean(X) 4.6204 4.5933 4.6462 0.0132
500 mean(Y) 311.3710 297.4225 325.3042 6.9040
500 cor(X,Y) -0.2306 -0.2857 -0.1702 0.0296
1000 mean(X) 4.6204 4.5957 4.6464 0.0129
1000 mean(Y) 311.3710 298.5682 324.5658 6.6771
1000 cor(X,Y) -0.2306 -0.2888 -0.1712 0.0301
5000 mean(X) 4.6204 4.5959 4.6446 0.0126
5000 mean(Y) 311.3710 298.2781 324.6758 6.7425
5000 cor(X,Y) -0.2306 -0.2883 -0.1730 0.0291

Bootstrap sampling distributions at \(B = 5000\)

B <- 5000
boot_mX <- boot(dat, stat_mean_X, R = B)
boot_mY <- boot(dat, stat_mean_Y, R = B)
boot_cr <- boot(dat, stat_cor,    R = B)

boot_df <- data.frame(
  value     = c(boot_mX$t, boot_mY$t, boot_cr$t),
  parameter = rep(c("mean(X)", "mean(Y)", "cor(X,Y)"), each = B)
)
ci_df <- data.frame(
  parameter = c("mean(X)", "mean(X)", "mean(Y)", "mean(Y)",
                "cor(X,Y)", "cor(X,Y)"),
  endpoint  = c(ci_mean_X, ci_mean_Y, ci_corr)
)

ggplot(boot_df, aes(x = value)) +
  geom_histogram(bins = 40, fill = "steelblue", colour = "white") +
  geom_vline(data = ci_df, aes(xintercept = endpoint),
             colour = "red", linetype = "dashed", linewidth = 0.7) +
  facet_wrap(~ parameter, scales = "free", nrow = 1) +
  labs(title = "Bootstrap sampling distributions (B = 5000)",
       subtitle = "Red dashed lines: theoretical 95% CI endpoints from Part 2",
       x = NULL, y = "Count") +
  theme_minimal()

The red dashed lines mark the theoretical CI endpoints from Part 2; they sit right on the edges of the bootstrap histograms.


Part 4 — Comparing theoretical and bootstrap intervals

compare <- rbind(
  data.frame(parameter = theory_tbl$parameter,
             method    = "Theoretical",
             B         = NA_integer_,
             estimate  = theory_tbl$estimate,
             ci_lower  = theory_tbl$ci_lower,
             ci_upper  = theory_tbl$ci_upper),
  data.frame(parameter = boot_results$parameter,
             method    = "Bootstrap",
             B         = boot_results$B,
             estimate  = boot_results$estimate,
             ci_lower  = boot_results$ci_lower,
             ci_upper  = boot_results$ci_upper)
)
compare <- compare[order(compare$parameter, compare$method, compare$B), ]
rownames(compare) <- NULL
knitr::kable(compare, digits = 4,
             caption = "Theoretical vs. bootstrap 95% CIs")
Theoretical vs. bootstrap 95% CIs
parameter method B estimate ci_lower ci_upper
cor(X,Y) Bootstrap 500 -0.2306 -0.2857 -0.1702
cor(X,Y) Bootstrap 1000 -0.2306 -0.2888 -0.1712
cor(X,Y) Bootstrap 5000 -0.2306 -0.2883 -0.1730
cor(X,Y) Theoretical NA -0.2306 -0.2885 -0.1711
mean(X) Bootstrap 500 4.6204 4.5933 4.6462
mean(X) Bootstrap 1000 4.6204 4.5957 4.6464
mean(X) Bootstrap 5000 4.6204 4.5959 4.6446
mean(X) Theoretical NA 4.6204 4.5954 4.6454
mean(Y) Bootstrap 500 311.3710 297.4225 325.3042
mean(Y) Bootstrap 1000 311.3710 298.5682 324.5658
mean(Y) Bootstrap 5000 311.3710 298.2781 324.6758
mean(Y) Theoretical NA 311.3710 297.9960 324.7460
plot_df <- compare
plot_df$label <- ifelse(is.na(plot_df$B), "Theoretical",
                        paste0("Boot, B=", plot_df$B))
plot_df$label <- factor(plot_df$label,
                        levels = c("Theoretical", "Boot, B=500",
                                   "Boot, B=1000", "Boot, B=5000"))

ggplot(plot_df, aes(x = label, y = estimate)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
  facet_wrap(~ parameter, scales = "free_y") +
  labs(title = "95% CIs: theoretical vs. bootstrap at three values of B",
       x = NULL, y = "Estimate ± 95% CI") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 25, hjust = 1))

Effect of \(B\). Increasing the number of bootstrap replicates \(B\) shrinks the Monte-Carlo error in the estimated quantiles but does not change the asymptotic target the bootstrap is approximating. \(B = 500\) already agrees reasonably with the theoretical intervals, but the 2.5% and 97.5% quantiles are themselves noisy because each is determined by only \(\approx 0.025 B = 12\) replicates in the tails. By \(B = 1000\) the endpoints stabilise to two decimal places, and by \(B = 5000\) they are indistinguishable from the theoretical CIs for the two means, and very close for the correlation. A bigger \(B\) does not reduce statistical uncertainty (that is fixed by \(n\)) — it only reduces simulation error.

Does the normality of \(X\) and \(Y\) matter for the bootstrap?

So normality affects what the theoretical interval is approximating, more than it affects the bootstrap. Bootstrap accuracy depends mainly on \(n\) (so that the empirical CDF is close to the true CDF) and on the smoothness of the statistic, not on the marginal shape of \(X\) and \(Y\).


Part 5 — What happens for small \(n\) (\(n < 100\))?

We draw a small subsample and rerun both procedures.

set.seed(7)
small_idx <- sample.int(n, 50)
small_dat <- dat[small_idx, ]
small_n   <- nrow(small_dat)

ci_mX_s <- as.numeric(t.test(small_dat$X)$conf.int)
ci_mY_s <- as.numeric(t.test(small_dat$Y)$conf.int)
ci_r_s  <- as.numeric(cor.test(small_dat$X, small_dat$Y)$conf.int)

sbX <- boot(small_dat, stat_mean_X, R = 5000)
sbY <- boot(small_dat, stat_mean_Y, R = 5000)
sbC <- boot(small_dat, stat_cor,    R = 5000)

perc <- function(bo) boot.ci(bo, type = "perc")$percent[4:5]

small_compare <- data.frame(
  parameter    = c("mean(X)", "mean(Y)", "cor(X,Y)"),
  theory_lower = c(ci_mX_s[1], ci_mY_s[1], ci_r_s[1]),
  theory_upper = c(ci_mX_s[2], ci_mY_s[2], ci_r_s[2]),
  boot_lower   = c(perc(sbX)[1], perc(sbY)[1], perc(sbC)[1]),
  boot_upper   = c(perc(sbX)[2], perc(sbY)[2], perc(sbC)[2])
)
knitr::kable(small_compare, digits = 4,
             caption = "n = 50: theoretical vs. percentile-bootstrap CIs")
n = 50: theoretical vs. percentile-bootstrap CIs
parameter theory_lower theory_upper boot_lower boot_upper
mean(X) 4.4689 4.7271 4.4740 4.7300
mean(Y) 289.3673 414.1527 290.7850 412.3945
cor(X,Y) -0.4545 0.0812 -0.4327 0.0359

Expectations and observations for small \(n\).

In short, the bootstrap is an asymptotic procedure. It is excellent at \(n = 1000\), mediocre at \(n < 100\), and increasingly unreliable as \(n\) shrinks or as the data become more skewed/heavy-tailed.