Dataset for testing the functions built in this RMD file

data <- read.csv("C:/Users/zahir/OneDrive/Desktop/Rutgers Semester Folders/Spring 2025/Applied Statistical Learning/Assignment 2/NOAA+GISS 2024.csv")
head(data)
##   year X.disaster delta.temp
## 1 1980          3       0.27
## 2 1981          1       0.33
## 3 1982          3       0.13
## 4 1983          5       0.30
## 5 1984          2       0.16
## 6 1985          5       0.12

1. Building the 2nd-order pivotal confidence interval estimator

for the mean based on the bootstrap (nboot = 10,000)
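
In symbols, each bootstrap resample produces the studentized statistic t* = (mean(resample) - meanθ) / (sd(resample) / √n), and the interval inverts the empirical quantiles of t*:

[ meanθ - (sdθ/√n) · t*(1 - α/2),  meanθ - (sdθ/√n) · t*(α/2) ]

Note that the upper quantile of t* sets the lower endpoint and vice versa, which is why uq appears in the first (lower) element of bootstrap_ci below.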

pivotal_ci_interval_estim <- function(data, nboot = 10000, alpha = 0.1) {
  n <- length(data) #our number of observations in data stored into n
  meanθ <- mean(data) #our sample mean (estimate of population mean)
  sdθ <- sd(data) #our sample standard deviation
  #computing nboot studentized bootstrap statistics (replicate returns a length-nboot vector)
  boot_t_stats <- replicate(nboot, {
    resample <- sample(data, size = n, replace = TRUE)
    (mean(resample) - meanθ) / (sd(resample) / sqrt(n))
  })
  #quantiles for bootstrap pivotal interval
  lq <- quantile(boot_t_stats, alpha / 2)
  uq <- quantile(boot_t_stats, 1 - alpha / 2)
  #bootstrap pivotal CI
  bootstrap_ci <- c(
    meanθ - (sdθ / sqrt(n)) * uq,
    meanθ - (sdθ / sqrt(n)) * lq
  )
  #computing norm-based t-distrib CI
  t_factor <- qt(1 - alpha / 2, df = n - 1)
  normal_ci <- c(
    meanθ - (sdθ / sqrt(n)) * t_factor,
    meanθ + (sdθ / sqrt(n)) * t_factor
  )
  list(
    bootstrap_ci = bootstrap_ci,
    normal_ci = normal_ci
  )
}

Testing the pivotal_ci_interval_estim function

NOAA_temp_ci_result <- pivotal_ci_interval_estim(data$delta.temp, nboot = 10000)
NOAA_temp_ci_result
## $bootstrap_ci
##       95%        5% 
## 0.5032915 0.6474180 
## 
## $normal_ci
## [1] 0.5006628 0.6437816
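
As a cross-check, normal_ci uses the same mean ± qt(1 - α/2, n - 1) · s/√n construction as the classical one-sample t interval, so it should match t.test at the same 90% level (a quick verification sketch):

#stats::t.test builds the same t-based interval, so this should reproduce normal_ci above
t.test(data$delta.temp, conf.level = 0.9)$conf.int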

2. Building a sim that draws n samples

from a lognormal distribution (rlnorm), builds the central limit theorem (CLT) based confidence interval, and compares its coverage rate to that of the confidence interval from the bootstrap program above (1,000 simulation runs minimum).
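
The true mean being covered is E[X] = exp(μ + σ²/2); with rlnorm's default sdlog = 1 this reduces to exp(mu.val + 0.5). A quick Monte Carlo sanity check (illustrative only, not part of the graded functions):

#the sample mean of many lognormal draws should approach the closed-form mean
set.seed(1)
mean(rlnorm(1e6, meanlog = 3)) #close to exp(3.5)
exp(3 + 0.5) #about 33.12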

Simfunc <- function(mu.val = 3, n = 30, nsim = 1000) {
  #true mean of the lognormal distrib: exp(mu + sigma^2 / 2), with rlnorm's default sdlog = 1
  mulnorm <- exp(mu.val + 0.5)
  #running simulation nsim times with replicate (returns a 2 x nsim matrix of 0/1 coverage indicators)
  coverage_sim <- replicate(nsim, {
    samp_data <- rlnorm(n, mu.val)
    #computing both intervals for this sample and checking whether each contains the true mean
    ci <- pivotal_ci_interval_estim(samp_data)
    boot.conf <- ci$bootstrap_ci
    norm.conf <- ci$normal_ci
    c(boot = as.numeric(boot.conf[1] < mulnorm & boot.conf[2] > mulnorm),
      norm = as.numeric(norm.conf[1] < mulnorm & norm.conf[2] > mulnorm))
  })
  list(
    #coverage rates: proportion of runs in which each interval contained the true mean
    boot.coverage = mean(coverage_sim["boot", ]),
    norm.coverage = mean(coverage_sim["norm", ])
  )
}

Testing Simfunc

Simfunc()
## $boot.coverage
## [1] 0.849
## 
## $norm.coverage
## [1] 0.811

3. Building the bootstrap estimator into a simulator

which compares how well the 2nd-order pivotal bootstrap confidence interval covers the true value versus a normal-theory confidence interval, running it for the lognormal (nsim = 1000) with n = 3, 10, 30, 100 and alpha = 0.1, 0.05 (8 runs total, collected into a table).

sim_boot_v_norm <- function(mu.val = 3, n_vals = c(3, 10, 30, 100), nsim = 1000, alpha_vals = c(0.1, 0.05), nboot = 1000) {
  results_list <- vector("list", length(n_vals) * length(alpha_vals))
  index <- 1
  #true lognormal mean (rlnorm default sdlog = 1), constant across all runs
  mulnorm <- exp(mu.val + 0.5)
  for (n in n_vals) {
    for (alpha in alpha_vals) {
      # running the simulation (replicate returns a 2 x nsim matrix)
      sim_results <- replicate(nsim, {
        sample_data <- rlnorm(n, mu.val)
        ci <- pivotal_ci_interval_estim(sample_data, nboot = nboot, alpha = alpha)
        boot.conf <- ci$bootstrap_ci
        norm.conf <- ci$normal_ci
        c(boot = as.numeric(boot.conf[1] < mulnorm & boot.conf[2] > mulnorm),
          norm = as.numeric(norm.conf[1] < mulnorm & norm.conf[2] > mulnorm))
      })
      boot_coverage <- mean(sim_results["boot", ])
      norm_coverage <- mean(sim_results["norm", ])
      
      results_list[[index]] <- c(
        n = n, 
        alpha = alpha, 
        bootstrap_coverage = boot_coverage, 
        normal_coverage = norm_coverage, 
        nominal_coverage = 1 - alpha)
      index <- index + 1
    }
  }
  results_df <- as.data.frame(do.call(rbind, results_list))
  colnames(results_df) <- c("n", "alpha", "bootstrap_coverage", "normal_coverage", "nominal_coverage")
  return(results_df)
}
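
Because each cell of the table performs nsim × nboot resamples, a reduced run (hypothetical parameters, seeded for reproducibility) is a quick way to smoke-test the function before the full run below:

#hypothetical quick check: small nsim/nboot runs in seconds, though coverage estimates are noisier
set.seed(42)
sim_boot_v_norm(n_vals = c(3, 10), nsim = 100, nboot = 200)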

Testing sim_boot_v_norm

sim_boot_v_norm()
##     n alpha bootstrap_coverage normal_coverage nominal_coverage
## 1   3  0.10              0.843           0.765             0.90
## 2   3  0.05              0.998           0.830             0.95
## 3  10  0.10              0.858           0.785             0.90
## 4  10  0.05              0.904           0.829             0.95
## 5  30  0.10              0.866           0.849             0.90
## 6  30  0.05              0.915           0.871             0.95
## 7 100  0.10              0.880           0.867             0.90
## 8 100  0.05              0.942           0.925             0.95
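
For nicer rendering in the knitted document, the coverage table can be passed to knitr::kable (a minimal sketch, assuming the knitr package used by RMD is available):

#store the result so the simulation is run only once, then format it with fixed decimals
coverage_table <- sim_boot_v_norm()
knitr::kable(coverage_table, digits = 3)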

Due to the skewness in the NOAA data, the bootstrap method produced more reliable confidence intervals than the normal-theory method. Bootstrapping does not assume normality and is better suited to skewed data and smaller sample sizes. However, as the sample size increases, the central limit theorem implies that the sampling distribution of the mean approaches normality, making the normal-based method increasingly reliable, if not superior, in larger samples. While the normal method is robust when the data are symmetric, its reliability diminishes with small samples or skewed distributions, conditions under which bootstrapping tends to perform better.