Data Set:

Flights with X is the distance between the two location and Y is the arrival delay

library(nycflights13)

flights_clean <- flights %>%
  filter(!is.na(arr_delay),
         !is.na(distance))


X <- flights_clean$distance
Y <- flights_clean$arr_delay

Check Normality

#Histogram with density curve
ggplot(flights_clean, aes(x = distance)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 40,
                 color = "black",
                 fill = "white") +
  geom_density()

#Q-Q plot
ggplot(flights_clean, aes(sample = distance)) +
  stat_qq() +
  stat_qq_line()

# Check skewness
mean(flights_clean$distance)
## [1] 1048.371
median(flights_clean$distance)
## [1] 888

The plots shows X doesn’t have normal distribution and heavily right tail.

#Histogram with density curve
ggplot(flights_clean, aes(x = arr_delay)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 40,
                 color = "black",
                 fill = "white") +
  geom_density()

#Q-Q plot
ggplot(flights_clean, aes(sample = arr_delay)) +
  stat_qq() +
  stat_qq_line()

# Check skewness
mean(flights_clean$arr_delay)
## [1] 6.895377
median(flights_clean$arr_delay)
## [1] -5

Variable Y in this dataset also heavily right tail.

Compute the theoretical 95% confidence interval estimate for the means of X, Y and their correlation.

df <- flights_clean[, c("distance", "arr_delay")]
n <- nrow(df)

x_bar <- mean(df$distance)
s_x <- sd(df$distance)

SE_x <- s_x / sqrt(n)

CI_x <- x_bar + c(-1,1) * qnorm(0.975) * SE_x
CI_x
## [1] 1045.850 1050.892
y_bar <- mean(df$arr_delay)
s_y <- sd(df$arr_delay)

SE_y <- s_y / sqrt(n)

CI_y <- y_bar + c(-1,1) * qnorm(0.975) * SE_y
CI_y
## [1] 6.742478 7.048275
r <- cor(df$distance, df$arr_delay)
z <- 0.5 * log((1 + r)/(1 - r))
SE_z <- 1/sqrt(n - 3)
z_CI <- z + c(-1,1) * qnorm(0.975) * SE_z
r_CI <- (exp(2*z_CI) - 1)/(exp(2*z_CI) + 1)
r_CI
## [1] -0.06527959 -0.05845448

Bootstrapping method to obtain the 95% CI estimate for the means of X, Y and their correlation with B = 500, 1000, 5000.

set.seed(42)

df <- na.omit(flights_clean[, c("distance", "arr_delay")])
n <- nrow(df)

# Convert to matrix for easier resampling 
data_matrix <- as.matrix(df)

# Function to do bootstrap for a given B
bootstrap_analysis <- function(data, B, alpha = 0.05) {
  n <- nrow(data)
  
  # Storage for bootstrap statistics
  rBoot <- numeric(B)
  meanXBoot <- numeric(B)
  meanYBoot <- numeric(B)
  
  # Bootstrap loop
  for (b in 1:B) {
    # Resample with replacement 
    idx <- sample(seq_len(n), size = n, replace = TRUE)
    dataB <- data[idx, ]
    
    # Calculate statistics
    meanXBoot[b] <- mean(dataB[, 1])  # distance
    meanYBoot[b] <- mean(dataB[, 2])  # arr_delay
    rBoot[b] <- cor(dataB[, 1], dataB[, 2])
  }
  
  # Build percentile-based CIs
  ci_x <- quantile(meanXBoot, probs = c(alpha/2, 1 - alpha/2))
  ci_y <- quantile(meanYBoot, probs = c(alpha/2, 1 - alpha/2))
  ci_cor <- quantile(rBoot, probs = c(alpha/2, 1 - alpha/2))
  
  return(list(
    ci_x = ci_x,
    ci_y = ci_y,
    ci_cor = ci_cor,
    boot_dist_x = meanXBoot,
    boot_dist_y = meanYBoot,
    boot_dist_cor = rBoot
  ))
}

# Run bootstrap for B = 500, 1000, 5000
cat("=== Bootstrap with B = 500 ===\n")
## === Bootstrap with B = 500 ===
results_500 <- bootstrap_analysis(data_matrix, B = 500)
cat("Mean X CI:", results_500$ci_x, "\n")
## Mean X CI: 1045.857 1050.916
cat("Mean Y CI:", results_500$ci_y, "\n")
## Mean Y CI: 6.749913 7.058105
cat("Correlation CI:", results_500$ci_cor, "\n\n")
## Correlation CI: -0.06524722 -0.05848464
cat("=== Bootstrap with B = 1000 ===\n")
## === Bootstrap with B = 1000 ===
results_1000 <- bootstrap_analysis(data_matrix, B = 1000)
cat("Mean X CI:", results_1000$ci_x, "\n")
## Mean X CI: 1045.752 1050.968
cat("Mean Y CI:", results_1000$ci_y, "\n")
## Mean Y CI: 6.748731 7.027592
cat("Correlation CI:", results_1000$ci_cor, "\n\n")
## Correlation CI: -0.06521244 -0.05816469
cat("=== Bootstrap with B = 5000 ===\n")
## === Bootstrap with B = 5000 ===
results_5000 <- bootstrap_analysis(data_matrix, B = 5000)
cat("Mean X CI:", results_5000$ci_x, "\n")
## Mean X CI: 1045.924 1050.881
cat("Mean Y CI:", results_5000$ci_y, "\n")
## Mean Y CI: 6.751331 7.05061
cat("Correlation CI:", results_5000$ci_cor, "\n\n")
## Correlation CI: -0.0653446 -0.05827942