Flights with X is the distance between the two location and Y is the arrival delay
library(nycflights13)
flights_clean <- flights %>%
filter(!is.na(arr_delay),
!is.na(distance))
X <- flights_clean$distance
Y <- flights_clean$arr_delay
#Histogram with density curve
ggplot(flights_clean, aes(x = distance)) +
geom_histogram(aes(y = after_stat(density)),
bins = 40,
color = "black",
fill = "white") +
geom_density()
#Q-Q plot
ggplot(flights_clean, aes(sample = distance)) +
stat_qq() +
stat_qq_line()
# Check skewness
mean(flights_clean$distance)
## [1] 1048.371
median(flights_clean$distance)
## [1] 888
The plots shows X doesn’t have normal distribution and heavily right tail.
#Histogram with density curve
ggplot(flights_clean, aes(x = arr_delay)) +
geom_histogram(aes(y = after_stat(density)),
bins = 40,
color = "black",
fill = "white") +
geom_density()
#Q-Q plot
ggplot(flights_clean, aes(sample = arr_delay)) +
stat_qq() +
stat_qq_line()
# Check skewness
mean(flights_clean$arr_delay)
## [1] 6.895377
median(flights_clean$arr_delay)
## [1] -5
Variable Y in this dataset also heavily right tail.
df <- flights_clean[, c("distance", "arr_delay")]
n <- nrow(df)
x_bar <- mean(df$distance)
s_x <- sd(df$distance)
SE_x <- s_x / sqrt(n)
CI_x <- x_bar + c(-1,1) * qnorm(0.975) * SE_x
CI_x
## [1] 1045.850 1050.892
y_bar <- mean(df$arr_delay)
s_y <- sd(df$arr_delay)
SE_y <- s_y / sqrt(n)
CI_y <- y_bar + c(-1,1) * qnorm(0.975) * SE_y
CI_y
## [1] 6.742478 7.048275
r <- cor(df$distance, df$arr_delay)
z <- 0.5 * log((1 + r)/(1 - r))
SE_z <- 1/sqrt(n - 3)
z_CI <- z + c(-1,1) * qnorm(0.975) * SE_z
r_CI <- (exp(2*z_CI) - 1)/(exp(2*z_CI) + 1)
r_CI
## [1] -0.06527959 -0.05845448
set.seed(42)
df <- na.omit(flights_clean[, c("distance", "arr_delay")])
n <- nrow(df)
# Convert to matrix for easier resampling
data_matrix <- as.matrix(df)
# Function to do bootstrap for a given B
bootstrap_analysis <- function(data, B, alpha = 0.05) {
n <- nrow(data)
# Storage for bootstrap statistics
rBoot <- numeric(B)
meanXBoot <- numeric(B)
meanYBoot <- numeric(B)
# Bootstrap loop
for (b in 1:B) {
# Resample with replacement
idx <- sample(seq_len(n), size = n, replace = TRUE)
dataB <- data[idx, ]
# Calculate statistics
meanXBoot[b] <- mean(dataB[, 1]) # distance
meanYBoot[b] <- mean(dataB[, 2]) # arr_delay
rBoot[b] <- cor(dataB[, 1], dataB[, 2])
}
# Build percentile-based CIs
ci_x <- quantile(meanXBoot, probs = c(alpha/2, 1 - alpha/2))
ci_y <- quantile(meanYBoot, probs = c(alpha/2, 1 - alpha/2))
ci_cor <- quantile(rBoot, probs = c(alpha/2, 1 - alpha/2))
return(list(
ci_x = ci_x,
ci_y = ci_y,
ci_cor = ci_cor,
boot_dist_x = meanXBoot,
boot_dist_y = meanYBoot,
boot_dist_cor = rBoot
))
}
# Run bootstrap for B = 500, 1000, 5000
cat("=== Bootstrap with B = 500 ===\n")
## === Bootstrap with B = 500 ===
results_500 <- bootstrap_analysis(data_matrix, B = 500)
cat("Mean X CI:", results_500$ci_x, "\n")
## Mean X CI: 1045.857 1050.916
cat("Mean Y CI:", results_500$ci_y, "\n")
## Mean Y CI: 6.749913 7.058105
cat("Correlation CI:", results_500$ci_cor, "\n\n")
## Correlation CI: -0.06524722 -0.05848464
cat("=== Bootstrap with B = 1000 ===\n")
## === Bootstrap with B = 1000 ===
results_1000 <- bootstrap_analysis(data_matrix, B = 1000)
cat("Mean X CI:", results_1000$ci_x, "\n")
## Mean X CI: 1045.752 1050.968
cat("Mean Y CI:", results_1000$ci_y, "\n")
## Mean Y CI: 6.748731 7.027592
cat("Correlation CI:", results_1000$ci_cor, "\n\n")
## Correlation CI: -0.06521244 -0.05816469
cat("=== Bootstrap with B = 5000 ===\n")
## === Bootstrap with B = 5000 ===
results_5000 <- bootstrap_analysis(data_matrix, B = 5000)
cat("Mean X CI:", results_5000$ci_x, "\n")
## Mean X CI: 1045.924 1050.881
cat("Mean Y CI:", results_5000$ci_y, "\n")
## Mean Y CI: 6.751331 7.05061
cat("Correlation CI:", results_5000$ci_cor, "\n\n")
## Correlation CI: -0.0653446 -0.05827942