Bootstrapping and Parametric CI Estimation for Correlation

Introduction

This document explores two methods for estimating the confidence interval of a correlation coefficient:

Parametric method using Fisher’s \(z\)-transformation
Bootstrapping method using both manual resampling and the boot package

Load Packages

# Install if not already installed
if (!requireNamespace("MASS", quietly = TRUE)) install.packages("MASS")
if (!requireNamespace("boot", quietly = TRUE)) install.packages("boot")

library(MASS)  # For generating bivariate normal samples
library(boot)  # For bootstrap resampling
library(ggplot2) # For visualization

Parametric CI Estimation

library(MASS)

set.seed(123)        # for reproducibility

# Simulation parameters
nSims <- 10000       # number of simulation runs
n <- 200              # sample size per simulation
rho <- 0.5           # true correlation
alpha <- 0.05        # significance level
zCrit <- qnorm(1 - alpha / 2)  # critical value for the normal distribution

coverageCount <- 0

# Covariance matrix for bivariate normal
Sigma <- matrix(c(1, rho, rho, 1), nrow = 2)

# Vectors to store sample correlations and last-run CI
rHatAll <- numeric(nSims)
rhoLowerAll <- numeric(nSims)
rhoUpperAll <- numeric(nSims)

for (i in seq_len(nSims)) {
  # 1) Simulate bivariate normal data
  data <- mvrnorm(n = n, mu = c(0, 0), Sigma = Sigma)
  x <- data[, 1]
  y <- data[, 2]
  
  # 2) Sample correlation
  rHat <- cor(x, y)
  rHatAll[i] <- rHat  # store it
  
  # 3) Fisher's z-transformation
  zHat <- atanh(rHat)
  
  # 4) Approx. standard error for z
  se <- 1 / sqrt(n - 3)
  
  # 5) Confidence bounds for z
  zLower <- zHat - zCrit * se
  zUpper <- zHat + zCrit * se
  
  # 6) Transform back to correlation scale
  rhoLower <- tanh(zLower)
  rhoUpper <- tanh(zUpper)
  rhoLowerAll[i] <- rhoLower
  rhoUpperAll[i] <- rhoUpper
  
  # 7) Check if the true rho is within this CI
  if (rho >= rhoLower && rho <= rhoUpper) {
    coverageCount <- coverageCount + 1
  }
}

# Overall coverage probability
coverageProb <- coverageCount / nSims
cat("Approximate coverage probability:", coverageProb, "\n")

## Approximate coverage probability: 0.9507

# --- Plot a histogram of all sample correlations ---
hist(rHatAll,
     main = "Distribution of Sample Correlations",
     xlab = "Sample Correlation",
     col = "lightgray", breaks = 20, xlim = c(0.2, 0.8))

# Add vertical line for the true correlation
abline(v = rho, col = "red", lwd = 2, lty = 2)

# Add lines for the *last iteration's* confidence interval
abline(v = c(rhoLowerAll[nSims], rhoUpperAll[nSims]),
       col = "blue", lwd = 2, lty = 2)

Bootstrapping for CI Estimation (Manual Approach)

set.seed(223)  

# --- Simulation settings ---
B     <- 1000    # Number of bootstrap resamples

# Covariance matrix for bivariate normal with correlation rho
Sigma <- matrix(c(1, rho, rho, 1), nrow = 2)

coverageCount <- 0
rBootLast     <- NULL  # Will store the bootstrap distribution from the final iteration
lowerLast     <- NA
upperLast     <- NA


# 1) Generate a sample from the bivariate normal
dataSim <- mvrnorm(n, mu = c(0, 0), Sigma = Sigma)

# 2) Compute the original sample correlation
rHat <- cor(dataSim[, 1], dataSim[, 2])

# 3) Nonparametric bootstrap
rBoot <- numeric(B)
for (b in seq_len(B)) {
  # Resample with replacement
  idx <- sample(seq_len(n), size = n, replace = TRUE)
  dataB <- dataSim[idx, ]
  rBoot[b] <- cor(dataB[, 1], dataB[, 2])
}

# 4) Build a percentile-based CI from the bootstrap distribution
lower <- quantile(rBoot, probs = alpha / 2)
upper <- quantile(rBoot, probs = 1 - alpha / 2)

# 5) Check if the CI captures the true rho
if (rho >= lower && rho <= upper) {
  coverageCount <- coverageCount + 1
}


# --- Estimated coverage probability ---
coverage <- coverageCount / nSims
cat("Estimated coverage probability:", coverage, "\n")

## Estimated coverage probability: 1e-04

# --- 6) Plot the histogram of bootstrap correlations from the last iteration ---
hist(rBoot,
     main = "Bootstrap Distribution of Correlation",
     xlab = "Sample Correlation", col = "lightgray", breaks = 20, xlim = c(0.2, 0.8))
abline(v = rho, col = "red", lwd = 2, lty = 2)          # true correlation
abline(v = c(lowerLast, upperLast), col = "blue", lwd = 2, lty = 2)  # CI bounds

Bootstrapping Using the `boot` Package

# Function for bootstrapping correlation
boot_corr <- function(data, indices) {
  sample_data <- data[indices, ]
  return(cor(sample_data[, 1], sample_data[, 2]))
}

# Apply bootstrapping
boot_result <- boot(data = dataSim, statistic = boot_corr, R = B)

# Compute percentile CI
boot_ci <- boot.ci(boot_result, type = "perc")
cat("Bootstrapped CI from `boot` package: [", boot_ci$percent[4], ",", boot_ci$percent[5], "]\n")

## Bootstrapped CI from `boot` package: [ 0.4210667 , 0.6285941 ]

Visualization: Comparing Parametric and Bootstrap Estimates

# Create a combined dataframe for plotting
hist(rHatAll,
     main = "Comparison of Parametric & Bootstrap Distributions",
     xlab = "Sample Correlation",
     col = adjustcolor("gray", alpha.f = 0.6), # Slight transparency
     breaks = 25, xlim = c(0.2, 0.8), freq = FALSE)

hist(rBoot, 
     col = adjustcolor("blue", alpha.f = 0.5), 
     breaks = 25, freq = FALSE, add = TRUE)  # Overlay bootstrap histogram

# Add vertical line for the true correlation
abline(v = rho, col = "red", lwd = 2, lty = 2)

# Add CI bounds for the last iteration
abline(v = c(lowerLast, upperLast), col = "blue", lwd = 2, lty = 2)

Bootstrapping and Parametric CI Estimation for Correlation

Miao Yu

2025-02-11

Introduction

Load Packages

Parametric CI Estimation

Bootstrapping for CI Estimation (Manual Approach)

Bootstrapping Using the `boot` Package

Visualization: Comparing Parametric and Bootstrap Estimates

Bootstrapping and Parametric CI Estimation for Correlation

Miao Yu

2025-02-11

Introduction

Load Packages

Parametric CI Estimation

Bootstrapping for CI Estimation (Manual Approach)

Bootstrapping Using the boot Package

Visualization: Comparing Parametric and Bootstrap Estimates

Bootstrapping Using the `boot` Package