This document explores two methods for estimating the confidence interval of a correlation coefficient:
boot
package# Install if not already installed
if (!requireNamespace("MASS", quietly = TRUE)) install.packages("MASS")
if (!requireNamespace("boot", quietly = TRUE)) install.packages("boot")
library(MASS) # For generating bivariate normal samples
library(boot) # For bootstrap resampling
library(ggplot2) # For visualization
library(MASS)
set.seed(123) # for reproducibility
# Simulation parameters
nSims <- 10000 # number of simulation runs
n <- 200 # sample size per simulation
rho <- 0.5 # true correlation
alpha <- 0.05 # significance level
zCrit <- qnorm(1 - alpha / 2) # critical value for the normal distribution
coverageCount <- 0
# Covariance matrix for bivariate normal
Sigma <- matrix(c(1, rho, rho, 1), nrow = 2)
# Vectors to store sample correlations and last-run CI
rHatAll <- numeric(nSims)
rhoLowerAll <- numeric(nSims)
rhoUpperAll <- numeric(nSims)
for (i in seq_len(nSims)) {
# 1) Simulate bivariate normal data
data <- mvrnorm(n = n, mu = c(0, 0), Sigma = Sigma)
x <- data[, 1]
y <- data[, 2]
# 2) Sample correlation
rHat <- cor(x, y)
rHatAll[i] <- rHat # store it
# 3) Fisher's z-transformation
zHat <- atanh(rHat)
# 4) Approx. standard error for z
se <- 1 / sqrt(n - 3)
# 5) Confidence bounds for z
zLower <- zHat - zCrit * se
zUpper <- zHat + zCrit * se
# 6) Transform back to correlation scale
rhoLower <- tanh(zLower)
rhoUpper <- tanh(zUpper)
rhoLowerAll[i] <- rhoLower
rhoUpperAll[i] <- rhoUpper
# 7) Check if the true rho is within this CI
if (rho >= rhoLower && rho <= rhoUpper) {
coverageCount <- coverageCount + 1
}
}
# Overall coverage probability
coverageProb <- coverageCount / nSims
cat("Approximate coverage probability:", coverageProb, "\n")
## Approximate coverage probability: 0.9507
# --- Plot a histogram of all sample correlations ---
hist(rHatAll,
main = "Distribution of Sample Correlations",
xlab = "Sample Correlation",
col = "lightgray", breaks = 20, xlim = c(0.2, 0.8))
# Add vertical line for the true correlation
abline(v = rho, col = "red", lwd = 2, lty = 2)
# Add lines for the *last iteration's* confidence interval
abline(v = c(rhoLowerAll[nSims], rhoUpperAll[nSims]),
col = "blue", lwd = 2, lty = 2)
set.seed(223)
# --- Simulation settings ---
B <- 1000 # Number of bootstrap resamples
# Covariance matrix for bivariate normal with correlation rho
Sigma <- matrix(c(1, rho, rho, 1), nrow = 2)
coverageCount <- 0
rBootLast <- NULL # Will store the bootstrap distribution from the final iteration
lowerLast <- NA
upperLast <- NA
# 1) Generate a sample from the bivariate normal
dataSim <- mvrnorm(n, mu = c(0, 0), Sigma = Sigma)
# 2) Compute the original sample correlation
rHat <- cor(dataSim[, 1], dataSim[, 2])
# 3) Nonparametric bootstrap
rBoot <- numeric(B)
for (b in seq_len(B)) {
# Resample with replacement
idx <- sample(seq_len(n), size = n, replace = TRUE)
dataB <- dataSim[idx, ]
rBoot[b] <- cor(dataB[, 1], dataB[, 2])
}
# 4) Build a percentile-based CI from the bootstrap distribution
lower <- quantile(rBoot, probs = alpha / 2)
upper <- quantile(rBoot, probs = 1 - alpha / 2)
# 5) Check if the CI captures the true rho
if (rho >= lower && rho <= upper) {
coverageCount <- coverageCount + 1
}
# --- Estimated coverage probability ---
coverage <- coverageCount / nSims
cat("Estimated coverage probability:", coverage, "\n")
## Estimated coverage probability: 1e-04
# --- 6) Plot the histogram of bootstrap correlations from the last iteration ---
hist(rBoot,
main = "Bootstrap Distribution of Correlation",
xlab = "Sample Correlation", col = "lightgray", breaks = 20, xlim = c(0.2, 0.8))
abline(v = rho, col = "red", lwd = 2, lty = 2) # true correlation
abline(v = c(lowerLast, upperLast), col = "blue", lwd = 2, lty = 2) # CI bounds
boot
Package# Function for bootstrapping correlation
boot_corr <- function(data, indices) {
sample_data <- data[indices, ]
return(cor(sample_data[, 1], sample_data[, 2]))
}
# Apply bootstrapping
boot_result <- boot(data = dataSim, statistic = boot_corr, R = B)
# Compute percentile CI
boot_ci <- boot.ci(boot_result, type = "perc")
cat("Bootstrapped CI from `boot` package: [", boot_ci$percent[4], ",", boot_ci$percent[5], "]\n")
## Bootstrapped CI from `boot` package: [ 0.4210667 , 0.6285941 ]
# Create a combined dataframe for plotting
hist(rHatAll,
main = "Comparison of Parametric & Bootstrap Distributions",
xlab = "Sample Correlation",
col = adjustcolor("gray", alpha.f = 0.6), # Slight transparency
breaks = 25, xlim = c(0.2, 0.8), freq = FALSE)
hist(rBoot,
col = adjustcolor("blue", alpha.f = 0.5),
breaks = 25, freq = FALSE, add = TRUE) # Overlay bootstrap histogram
# Add vertical line for the true correlation
abline(v = rho, col = "red", lwd = 2, lty = 2)
# Add CI bounds for the last iteration
abline(v = c(lowerLast, upperLast), col = "blue", lwd = 2, lty = 2)