Dataset testing for functions made in this RMD File
data <- read.csv("C:/Users/zahir/OneDrive/Desktop/Rutgers Semester Folders/Spring 2025/Applied Statistical Learning/Assignment 2/NOAA+GISS 2024.csv")
head(data)
## year X.disaster delta.temp
## 1 1980 3 0.27
## 2 1981 1 0.33
## 3 1982 3 0.13
## 4 1983 5 0.30
## 5 1984 2 0.16
## 6 1985 5 0.12
1. Building the 2nd order pivotal confidence interval estimator
for the mean based on the bootstrap (using 10,000 = nboot)
pivotal_ci_interval_estim <- function(data, nboot = 10000, alpha = 0.1) {
n <- length(data) #our number of observations in data stored into n
meanθ <- mean(data) #our sample mean (estimate of population mean)
sdθ <- sd(data) #our sample standard deviation
#initializing our vector to contain nboot length number of t-statistics per resample
bootvec <- numeric(nboot)
#starting our loop to run nboot times (bootstrap resampling)
boot_t_stats <- replicate(nboot, {
resample <- sample(data, size = n, replace = TRUE)
(mean(resample) - meanθ) / (sd(resample) / sqrt(n))
})
#quantiles for bootstrap pivotal interval
lq <- quantile(boot_t_stats, alpha / 2)
uq <- quantile(boot_t_stats, 1 - alpha / 2)
#bootstrap pivotal CI
bootstrap_ci <- c(
meanθ - (sdθ / sqrt(n)) * uq,
meanθ - (sdθ / sqrt(n)) * lq
)
#computing norm-based t-distrib CI
t_factor <- qt(1 - alpha / 2, df = n - 1)
normal_ci <- c(
meanθ - (sdθ / sqrt(n)) * t_factor,
meanθ + (sdθ / sqrt(n)) * t_factor
)
list(
bootstrap_ci = bootstrap_ci,
normal_ci = normal_ci
)
}
Testing pivotal_ci_interval_estim function
NOAA_temp_ci_result <- pivotal_ci_interval_estim(data$delta.temp, nboot = 10000)
NOAA_temp_ci_result
## $bootstrap_ci
## 95% 5%
## 0.5032915 0.6474180
##
## $normal_ci
## [1] 0.5006628 0.6437816
2. Building a sim that draws n samples
Testing Simfunc
Simfunc()
## $boot.coverage
## [1] 0.849
##
## $norm.coverage
## [1] 0.811
3. Building boostrap estimator into simulator
which compares how well the 2nd order Pivotal bootstrap confidence
interval covers the true value vs a normal theory confidence interval
running it for lognormal (nsim = 1000) for n = 3, 10, 30, 100, alpha =
0.1, 0.05 (for 8 runs total into a table)
sim_boot_v_norm <- function(mu.val = 3, n_vals = c(3, 10, 30, 100), nsim = 1000, alpha_vals = c(0.1, 0.05), nboot = 1000) {
results_list <- vector("list", length(n_vals) * length(alpha_vals))
index <- 1
for (n in n_vals) {
for (alpha in alpha_vals) {
mulnorm <- exp(mu.val + 0.5)
# running the simulation (replicate returns a 2 x nsim matrix)
sim_results <- replicate(nsim, {
sample_data <- rlnorm(n, mu.val)
ci <- pivotal_ci_interval_estim(sample_data, nboot = nboot, alpha = alpha)
boot.conf <- ci$bootstrap_ci
norm.conf <- ci$normal_ci
c(boot = as.numeric(boot.conf[1] < mulnorm & boot.conf[2] > mulnorm),
norm = as.numeric(norm.conf[1] < mulnorm & norm.conf[2] > mulnorm))
})
boot_coverage <- mean(sim_results["boot", ])
norm_coverage <- mean(sim_results["norm", ])
results_list[[index]] <- c(
n = n,
alpha = alpha,
bootstrap_coverage = boot_coverage,
normal_coverage = norm_coverage,
nominal_coverage = 1 - alpha)
index <- index + 1
}
}
results_df <- as.data.frame(do.call(rbind, results_list))
colnames(results_df) <- c("n", "alpha", "bootstrap_coverage", "normal_coverage", "nominal_coverage")
return(results_df)
}
Testing sim_boot_v_norm
sim_boot_v_norm()
## n alpha bootstrap_coverage normal_coverage nominal_coverage
## 1 3 0.10 0.843 0.765 0.90
## 2 3 0.05 0.998 0.830 0.95
## 3 10 0.10 0.858 0.785 0.90
## 4 10 0.05 0.904 0.829 0.95
## 5 30 0.10 0.866 0.849 0.90
## 6 30 0.05 0.915 0.871 0.95
## 7 100 0.10 0.880 0.867 0.90
## 8 100 0.05 0.942 0.925 0.95
Due to the skewness in the NOAA data, the bootstrap method produced
more reliable confidence intervals compared to the normal method.
Bootstrapping does not assume normality and is better suited to handle
skewed data or smaller sample sizes. However, as the sample size
increases, the central limit theorem implies that the sampling
distribution of the mean will approach normality, making the
normal-based method increasingly reliable—if not superior—in larger
samples. While the normal method is robust when data is symmetric, its
reliability diminishes with small samples or skewed distributions,
conditions under which bootstrapping tends to perform better.