1. Find a real-world (should not be simulation) data set with at least 1,000 samples and two continue features X and Y.

California_Housing <- read.csv("https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv")

x <- California_Housing$total_bedrooms
y <- California_Housing$median_income

X <- x[!is.na(x)]
Y <- y[!is.na(x)]

c(length(X), length(Y))
## [1] 20433 20433

1. (Continue) Assess the normality of X and Y.

a. Histogram

par(mfrow=c(1,2))
hist(X, breaks=60, main="Histogram of X (Total Bedrooms)", xlab="X")
hist(Y, breaks=60, main="Histogram of Y (Median Income)", xlab="Y")

par(mfrow=c(1,1))

b. Q-Q Plot

par(mfrow=c(1,2))
qqnorm(X, main="Q-Q plot of X"); qqline(X, col="red")
qqnorm(Y, main="Q-Q plot of Y"); qqline(Y, col="red")

par(mfrow=c(1,1))

2. Compute the theoretical 95% confidence interval estimate for the means of X, Y and their correlation.

a. The theoretical 95% confidence interval (CI) estimate for the means of X, Y

n <- length(X)
zcrit <- qnorm(0.975)
tcrit <- qt(0.975,n-1)
xbar <- mean(X)
ybar <- mean(Y)

sx <- sd(X)
sy <- sd(Y)

ci_mean_X <- c(xbar - tcrit*sx/sqrt(n), xbar + tcrit*sx/sqrt(n))
ci_mean_Y <- c(ybar - tcrit*sy/sqrt(n), ybar + tcrit*sy/sqrt(n))

ci_mean_X
## [1] 532.0924 543.6487
ci_mean_Y
## [1] 3.845118 3.897205

b. CI for correlation using Fisher transform

r <- cor(X, Y)

z <- atanh(r)
se_z <- 1/sqrt(n - 3)

ci_z <- c(z - zcrit*se_z, z + zcrit*se_z)
ci_r <- tanh(ci_z)

r
## [1] -0.00772285
ci_r
## [1] -0.021432134  0.005989339

3. Use bootstrapping method to obtain the 95% CI estimate for the means of X, Y and their correlation with B = 500, 1000, 5000.

a. Bootstrap function

bootstrap_ci <- function(X, Y, B) {
  n <- length(X)
  
  boot_mean_X <- numeric(B)
  boot_mean_Y <- numeric(B)
  boot_cor <- numeric(B)
  
  for (b in 1:B) {
    idx <- sample.int(n, size=n, replace=TRUE)
    xb <- X[idx]
    yb <- Y[idx]
    
    boot_mean_X[b] <- mean(xb)
    boot_mean_Y[b] <- mean(yb)
    boot_cor[b] <- cor(xb, yb)
  }
  
  ci_X <- quantile(boot_mean_X, probs=c(0.025, 0.975), names=FALSE)
  ci_Y <- quantile(boot_mean_Y, probs=c(0.025, 0.975), names=FALSE)
  ci_r <- quantile(boot_cor, probs=c(0.025, 0.975), names=FALSE)
  
  list(ci_mean_X = ci_X,
       ci_mean_Y = ci_Y,
       ci_cor = ci_r,
       boot_mean_X = boot_mean_X,
       boot_mean_Y = boot_mean_Y,
       boot_cor = boot_cor)
}

b. Obtain the means of X, Y and their correlation with B = 500, 1000, 5000.

set.seed(123)

res_500 <- bootstrap_ci(X, Y, B=500)
res_1000 <- bootstrap_ci(X, Y, B=1000)
res_5000 <- bootstrap_ci(X, Y, B=5000)

res_500$ci_mean_X; res_500$ci_mean_Y; res_500$ci_cor
## [1] 532.6358 543.6726
## [1] 3.843074 3.897400
## [1] -0.01983703  0.00600761
res_1000$ci_mean_X; res_1000$ci_mean_Y; res_1000$ci_cor
## [1] 532.2283 543.2408
## [1] 3.847700 3.896088
## [1] -0.020527398  0.006618271
res_5000$ci_mean_X; res_5000$ci_mean_Y; res_5000$ci_cor
## [1] 532.3375 543.9655
## [1] 3.845213 3.896932
## [1] -0.020398965  0.004846469
c(B500_X_mean = res_500$ci_mean_X,
  B500_Y_mean = res_500$ci_mean_Y, 
  B500_cor = res_500$ci_cor,
  B1000_X_mean = res_1000$ci_mean_X,
  B1000_Y_mean = res_1000$ci_mean_Y,
  B1000_cor = res_1000$ci_cor,
  B5000_X_mean = res_5000$ci_mean_X,
  B5000_Y_mean = res_5000$ci_mean_Y,
  B5000_cor = res_5000$ci_cor)
##  B500_X_mean1  B500_X_mean2  B500_Y_mean1  B500_Y_mean2     B500_cor1 
## 532.635779132 543.672584789   3.843074460   3.897400311  -0.019837027 
##     B500_cor2 B1000_X_mean1 B1000_X_mean2 B1000_Y_mean1 B1000_Y_mean2 
##   0.006007610 532.228331620 543.240819997   3.847700431   3.896088054 
##    B1000_cor1    B1000_cor2 B5000_X_mean1 B5000_X_mean2 B5000_Y_mean1 
##  -0.020527398   0.006618271 532.337529976 543.965528801   3.845212596 
## B5000_Y_mean2    B5000_cor1    B5000_cor2 
##   3.896931909  -0.020398965   0.004846469

4. Comment on your results by comparing estimates from step 2 and 3. What is the effect of B? Do you think the normality of X and Y also affects the accuracy of bootstrapping method?

summary_tbl <- tibble::tibble(
  Method = c("Theoretical (CLT)",
             "Bootstrap B=500",
             "Bootstrap B=1000",
             "Bootstrap B=5000"),
  CI_mean_X_low = c(ci_mean_X[1], res_500$ci_mean_X[1], res_1000$ci_mean_X[1], res_5000$ci_mean_X[1]),
  CI_mean_X_high= c(ci_mean_X[2], res_500$ci_mean_X[2], res_1000$ci_mean_X[2], res_5000$ci_mean_X[2]),
  CI_mean_Y_low = c(ci_mean_Y[1], res_500$ci_mean_Y[1], res_1000$ci_mean_Y[1], res_5000$ci_mean_Y[1]),
  CI_mean_Y_high= c(ci_mean_Y[2], res_500$ci_mean_Y[2], res_1000$ci_mean_Y[2], res_5000$ci_mean_Y[2]),
  CI_cor_low    = c(ci_r[1], res_500$ci_cor[1], res_1000$ci_cor[1], res_5000$ci_cor[1]),
  CI_cor_high   = c(ci_r[2], res_500$ci_cor[2], res_1000$ci_cor[2], res_5000$ci_cor[2])
)

summary_tbl
## # A tibble: 4 × 7
##   Method    CI_mean_X_low CI_mean_X_high CI_mean_Y_low CI_mean_Y_high CI_cor_low
##   <chr>             <dbl>          <dbl>         <dbl>          <dbl>      <dbl>
## 1 Theoreti…          532.           544.          3.85           3.90    -0.0214
## 2 Bootstra…          533.           544.          3.84           3.90    -0.0198
## 3 Bootstra…          532.           543.          3.85           3.90    -0.0205
## 4 Bootstra…          532.           544.          3.85           3.90    -0.0204
## # ℹ 1 more variable: CI_cor_high <dbl>

Comments: B has outstanding effect on the CI result while CI mean for X and Y are close. The normality of X and Y are not affect the accuracy of bootstrapping method, because the result are close. However, the size of sample will affect the result since some of the extreme value in the origin dataset can be ignored while sampling.

5. If you sample is much smaller (n < 100), what do you expect for the bootstrapping method’s accuracy?

res_50 <- bootstrap_ci(X, Y, B=50)

res_50$ci_mean_X; res_50$ci_mean_Y; res_50$ci_cor
## [1] 532.5628 543.7908
## [1] 3.848341 3.890541
## [1] -0.018140136  0.003284906

Answer: We should expect the accuracy drop significantly.