California_Housing <- read.csv("https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv")
x <- California_Housing$total_bedrooms
y <- California_Housing$median_income
X <- x[!is.na(x)]
Y <- y[!is.na(x)]
c(length(X), length(Y))
## [1] 20433 20433
par(mfrow=c(1,2))
hist(X, breaks=60, main="Histogram of X (Total Bedrooms)", xlab="X")
hist(Y, breaks=60, main="Histogram of Y (Median Income)", xlab="Y")
par(mfrow=c(1,1))
par(mfrow=c(1,2))
qqnorm(X, main="Q-Q plot of X"); qqline(X, col="red")
qqnorm(Y, main="Q-Q plot of Y"); qqline(Y, col="red")
par(mfrow=c(1,1))
n <- length(X)
zcrit <- qnorm(0.975)
tcrit <- qt(0.975,n-1)
xbar <- mean(X)
ybar <- mean(Y)
sx <- sd(X)
sy <- sd(Y)
ci_mean_X <- c(xbar - tcrit*sx/sqrt(n), xbar + tcrit*sx/sqrt(n))
ci_mean_Y <- c(ybar - tcrit*sy/sqrt(n), ybar + tcrit*sy/sqrt(n))
ci_mean_X
## [1] 532.0924 543.6487
ci_mean_Y
## [1] 3.845118 3.897205
r <- cor(X, Y)
z <- atanh(r)
se_z <- 1/sqrt(n - 3)
ci_z <- c(z - zcrit*se_z, z + zcrit*se_z)
ci_r <- tanh(ci_z)
r
## [1] -0.00772285
ci_r
## [1] -0.021432134 0.005989339
bootstrap_ci <- function(X, Y, B) {
n <- length(X)
boot_mean_X <- numeric(B)
boot_mean_Y <- numeric(B)
boot_cor <- numeric(B)
for (b in 1:B) {
idx <- sample.int(n, size=n, replace=TRUE)
xb <- X[idx]
yb <- Y[idx]
boot_mean_X[b] <- mean(xb)
boot_mean_Y[b] <- mean(yb)
boot_cor[b] <- cor(xb, yb)
}
ci_X <- quantile(boot_mean_X, probs=c(0.025, 0.975), names=FALSE)
ci_Y <- quantile(boot_mean_Y, probs=c(0.025, 0.975), names=FALSE)
ci_r <- quantile(boot_cor, probs=c(0.025, 0.975), names=FALSE)
list(ci_mean_X = ci_X,
ci_mean_Y = ci_Y,
ci_cor = ci_r,
boot_mean_X = boot_mean_X,
boot_mean_Y = boot_mean_Y,
boot_cor = boot_cor)
}
set.seed(123)
res_500 <- bootstrap_ci(X, Y, B=500)
res_1000 <- bootstrap_ci(X, Y, B=1000)
res_5000 <- bootstrap_ci(X, Y, B=5000)
res_500$ci_mean_X; res_500$ci_mean_Y; res_500$ci_cor
## [1] 532.6358 543.6726
## [1] 3.843074 3.897400
## [1] -0.01983703 0.00600761
res_1000$ci_mean_X; res_1000$ci_mean_Y; res_1000$ci_cor
## [1] 532.2283 543.2408
## [1] 3.847700 3.896088
## [1] -0.020527398 0.006618271
res_5000$ci_mean_X; res_5000$ci_mean_Y; res_5000$ci_cor
## [1] 532.3375 543.9655
## [1] 3.845213 3.896932
## [1] -0.020398965 0.004846469
c(B500_X_mean = res_500$ci_mean_X,
B500_Y_mean = res_500$ci_mean_Y,
B500_cor = res_500$ci_cor,
B1000_X_mean = res_1000$ci_mean_X,
B1000_Y_mean = res_1000$ci_mean_Y,
B1000_cor = res_1000$ci_cor,
B5000_X_mean = res_5000$ci_mean_X,
B5000_Y_mean = res_5000$ci_mean_Y,
B5000_cor = res_5000$ci_cor)
## B500_X_mean1 B500_X_mean2 B500_Y_mean1 B500_Y_mean2 B500_cor1
## 532.635779132 543.672584789 3.843074460 3.897400311 -0.019837027
## B500_cor2 B1000_X_mean1 B1000_X_mean2 B1000_Y_mean1 B1000_Y_mean2
## 0.006007610 532.228331620 543.240819997 3.847700431 3.896088054
## B1000_cor1 B1000_cor2 B5000_X_mean1 B5000_X_mean2 B5000_Y_mean1
## -0.020527398 0.006618271 532.337529976 543.965528801 3.845212596
## B5000_Y_mean2 B5000_cor1 B5000_cor2
## 3.896931909 -0.020398965 0.004846469
res_50 <- bootstrap_ci(X, Y, B=50)
res_50$ci_mean_X; res_50$ci_mean_Y; res_50$ci_cor
## [1] 532.5628 543.7908
## [1] 3.848341 3.890541
## [1] -0.018140136 0.003284906
Answer: We should expect the accuracy drop significantly.
4. Comment on your results by comparing estimates from step 2 and 3. What is the effect of B? Do you think the normality of X and Y also affects the accuracy of bootstrapping method?
Comments: B has outstanding effect on the CI result while CI mean for X and Y are close. The normality of X and Y are not affect the accuracy of bootstrapping method, because the result are close. However, the size of sample will affect the result since some of the extreme value in the origin dataset can be ignored while sampling.