# Install if not already installed
if (!requireNamespace("MASS", quietly = TRUE)) install.packages("MASS")
if (!requireNamespace("boot", quietly = TRUE)) install.packages("boot")
library(MASS) # For generating bivariate normal samples
library(boot) # For bootstrap resampling
library(ggplot2) # For visualization
Data Source: [life-expectancy-who]https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who?resource=download
data = read.csv("LifeExpectancyData.csv")
glimpse(data)
Rows: 2,938
Columns: 22
$ Country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "A…
$ Year <int> 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008,…
$ Status <chr> "Developing", "Developing", "Developing", "Deve…
$ Life.expectancy <dbl> 65.0, 59.9, 59.9, 59.5, 59.2, 58.8, 58.6, 58.1,…
$ Adult.Mortality <int> 263, 271, 268, 272, 275, 279, 281, 287, 295, 29…
$ infant.deaths <int> 62, 64, 66, 69, 71, 74, 77, 80, 82, 84, 85, 87,…
$ Alcohol <dbl> 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.03,…
$ percentage.expenditure <dbl> 71.279624, 73.523582, 73.219243, 78.184215, 7.0…
$ Hepatitis.B <int> 65, 62, 64, 67, 68, 66, 63, 64, 63, 64, 66, 67,…
$ Measles <int> 1154, 492, 430, 2787, 3013, 1989, 2861, 1599, 1…
$ BMI <dbl> 19.1, 18.6, 18.1, 17.6, 17.2, 16.7, 16.2, 15.7,…
$ under.five.deaths <int> 83, 86, 89, 93, 97, 102, 106, 110, 113, 116, 11…
$ Polio <int> 6, 58, 62, 67, 68, 66, 63, 64, 63, 58, 58, 5, 4…
$ Total.expenditure <dbl> 8.16, 8.18, 8.13, 8.52, 7.87, 9.20, 9.42, 8.33,…
$ Diphtheria <int> 65, 62, 64, 67, 68, 66, 63, 64, 63, 58, 58, 5, …
$ HIV.AIDS <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.…
$ GDP <dbl> 584.25921, 612.69651, 631.74498, 669.95900, 63.…
$ Population <dbl> 33736494, 327582, 31731688, 3696958, 2978599, 2…
$ thinness..1.19.years <dbl> 17.2, 17.5, 17.7, 17.9, 18.2, 18.4, 18.6, 18.8,…
$ thinness.5.9.years <dbl> 17.3, 17.5, 17.7, 18.0, 18.2, 18.4, 18.7, 18.9,…
$ Income.composition.of.resources <dbl> 0.479, 0.476, 0.470, 0.463, 0.454, 0.448, 0.434…
$ Schooling <dbl> 10.1, 10.0, 9.9, 9.8, 9.5, 9.2, 8.9, 8.7, 8.4, …
Y <- data$Life.expectancy
X <- data$percentage.expenditure
df <- na.omit(data.frame(X = X, Y = Y))
# Visualize distributions
par(mfrow = c(2, 2))
hist(df$X, main = "Histogram of X", xlab = "percentage.expenditure")
hist(df$Y, main = "Histogram of Y", xlab = "Life.expectancy")
qqnorm(df$X); qqline(df$X, col = "red")
qqnorm(df$Y); qqline(df$Y, col = "red")
Comments: The histogram for X is highly right-skewed.The Q-Q plot for X shows a strong deviation from the straight line. The histogram of Y is more symmetric. The Q-Q plot for Y shows that most points fall close to the straight line, but there are deviations at both ends, suggesting some mild skewness.
n <- nrow(df)
alpha <- 0.05
# Means and standard errors
mean_X <- mean(df$X)
mean_Y <- mean(df$Y)
se_X <- sd(df$X) / n
se_Y <- sd(df$Y) / n
z_crit <- qnorm(1 - alpha/2)
# 95% CI for means
ci_X <- c(mean_X - z_crit * se_X, mean_X + z_crit * se_X)
ci_Y <- c(mean_Y - z_crit * se_Y, mean_Y + z_crit * se_Y)
# Correlation and Fisher's z CI
r <- cor(df$X, df$Y)
z <- atanh(r)
se_r <- 1 / sqrt(n - 3)
z_ci <- c(z - z_crit * se_r, z + z_crit * se_r)
r_ci <- tanh(z_ci)
# Print results
cat("95% CI for mean of X:", ci_X, "\n")
95% CI for mean of X: 738.9885 741.6539
cat("95% CI for mean of Y:", ci_Y, "\n")
95% CI for mean of Y: 69.21856 69.23131
cat("95% CI for correlation:", r_ci, "\n")
95% CI for correlation: 0.3504878 0.4123831
# Function for bootstrapping mean
boot_mean <- function(data, indices, col) {
return(mean(data[indices, col]))
}
# Function for bootstrapping correlation
boot_cor <- function(data, indices) {
d <- data[indices, ]
return(cor(d$X, d$Y))
}
# Bootstrapping for each B
for (B in c(500, 1000, 5000)) {
set.seed(123)
boot_X <- boot(data = df, statistic = function(d, i) boot_mean(d, i, "X"), R = B)
boot_Y <- boot(data = df, statistic = function(d, i) boot_mean(d, i, "Y"), R = B)
boot_corr <- boot(data = df, statistic = boot_cor, R = B)
# Percentile CIs
ci_X_boot <- boot.ci(boot_X, type = "perc")$percent[4:5]
ci_Y_boot <- boot.ci(boot_Y, type = "perc")$percent[4:5]
ci_corr_boot <- boot.ci(boot_corr, type = "perc")$percent[4:5]
cat("\nBootstrap B =", B, "\n")
cat(" 95% CI for mean of X:", ci_X_boot, "\n")
cat(" 95% CI for mean of Y:", ci_Y_boot, "\n")
cat(" 95% CI for correlation:", ci_corr_boot, "\n")
}
Bootstrap B = 500
95% CI for mean of X: 669.9998 812.4581
95% CI for mean of Y: 68.89326 69.5792
95% CI for correlation: 0.3611118 0.4024863
Bootstrap B = 1000
95% CI for mean of X: 668.6017 820.5948
95% CI for mean of Y: 68.89212 69.55466
95% CI for correlation: 0.3636607 0.4029201
Bootstrap B = 5000
95% CI for mean of X: 671.1306 815.2249
95% CI for mean of Y: 68.8787 69.56761
95% CI for correlation: 0.3628424 0.4020531
Answer: With small samples, the bootstrap CIs become less reliable. The resampled distributions may not adequately represent the true population variability.
4. Comment on your results by comparing estimates from step 2 and 3. What is the effect of B? Do you think the normality of X and Y also affects the accuracy of bootstrapping method?
Comment: The parametric CIs for the means are much narrower than the bootstrap CIs.But the bootstrap CIs for correlation are somewhat narrower.
As B increases from 500 to 5,000, the bootstrap CIs for means and correlation become more stable and consistent, but the interval widths do not change dramatically.
No. Bootstrap methods do not require normality and are robust to non-normal and skewed data distributions.