Chapter 05 (page 219): 3, 5, 6, 9
library(ISLR2)
head(Default)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
We now review k-fold cross-validation.
In Chapter 4, we used logistic regression to predict the probability of default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis. (a) Fit a logistic regression model that uses income and balance to predict default.
model_a <- glm(default ~ income + balance, data = Default, family = binomial)
summary(model_a)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(1)
n_obs <- nrow(Default)
train_indices <- sample(n_obs, n_obs/2)
train_set <- Default[train_indices, ]
validation_set <- Default[-train_indices, ]
model_b <- glm(default ~ income + balance, data = train_set, family = binomial)
pred_probs <- predict(model_b, newdata = validation_set, type = "response")
pred_labels <- rep("No", nrow(validation_set))
pred_labels[pred_probs > 0.5] <- "Yes"
validation_error <- mean(pred_labels != validation_set$default)
print(validation_error)
## [1] 0.0254
#Split 2
set.seed(2)
train_indices2 <-sample(n_obs, n_obs / 2)
model_split2 <- glm(default ~ income + balance, data = Default, family = binomial, subset = train_indices2)
pred_probs2 <- predict(model_split2, Default[-train_indices2, ])
pred_labels2 <- rep("No", n_obs /2)
pred_labels2[pred_probs2 > 0.5] <- "Yes"
error2 <- mean(pred_labels2 != Default$default[-train_indices2])
#Split3
set.seed(3)
train_indices3 <-sample(n_obs, n_obs / 2)
model_split3 <- glm(default ~ income + balance, data = Default, family = binomial, subset = train_indices3)
pred_probs3 <- predict(model_split3, Default[-train_indices3, ])
pred_labels3 <- rep("No", n_obs /2)
pred_labels3[pred_probs3 > 0.5] <- "Yes"
error3 <- mean(pred_labels3 != Default$default[-train_indices3])
#Split4
set.seed(4)
train_indices4 <-sample(n_obs, n_obs / 2)
model_split4 <- glm(default ~ income + balance, data = Default, family = binomial, subset = train_indices4)
pred_probs4 <- predict(model_split4, Default[-train_indices4, ])
pred_labels4 <- rep("No", n_obs /2)
pred_labels4[pred_probs2 > 0.5] <- "Yes"
error4 <- mean(pred_labels4 != Default$default[-train_indices4])
print(c(error2, error3, error4))
## [1] 0.0260 0.0260 0.0426
set.seed(1)
train_indices <- sample(n_obs, n_obs/2)
model_d <- glm(default ~ income + balance + student, data = Default, family=binomial, subset = train_indices)
pred_probs_d <- predict(model_d, Default[-train_indices, ], type = "response")
pred_labels_d <- rep("No", n_obs /2)
pred_labels_d[pred_probs_d > 0.5] <- "Yes"
error_d <- mean(pred_labels_d != Default$default[-train_indices])
print(error_d)
## [1] 0.026
We continue to consider the use of a logistic regression model to predict the probability of default using income and balance on the Default data set. In particular, we will now compute estimates for the standard errors of the income and balance logistic regression coefficients in two different ways: (1) using the bootstrap,
library(boot)
and (2) using the standard formula for computing the standard errors in the glm() function. Do not forget to set a random seed before beginning your analysis.
model_6a <- glm(default ~ income + balance, data = Default, family = binomial)
summary(model_6a)$coefficients[, "Std. Error"]
## (Intercept) income balance
## 4.347564e-01 4.985167e-06 2.273731e-04
model_6a <- glm(default ~ income + balance, data = Default, family = binomial)
summary(model_6a)$coefficients[, "Std. Error"]
## (Intercept) income balance
## 4.347564e-01 4.985167e-06 2.273731e-04
boot.fn <- function(data, index) {
fit <- glm(default ~ income + balance, data = data, family = binomial, subset = index)
return(coef(fit))
}
library(boot)
set.seed(1)
boot_results <- boot(data = Default, statistic = boot.fn, R = 1000)
print(boot_results)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -3.945460e-02 4.344722e-01
## t2* 2.080898e-05 1.680317e-07 4.866284e-06
## t3* 5.647103e-03 1.855765e-05 2.298949e-04
Comment on the estimated standard errors obtained using theglm() function and using your bootstrap function.
Answer: The standard errors generated by the standard mathematical formulas in glm and bootstrap function are close to one another. This tells us the statistical assumptions built for glm model is a good fit for this data set.
We will now consider the Boston housing data set, from the ISLR2 library. (a) Based on this data set, provide an estimate for the population mean of medv. Call this estimate µ.
mu_hat <- mean(Boston$medv)
print(mu_hat)
## [1] 22.53281
Answer: Standard error measures the amount that our sample mean u is expected to flucuate from the true population mean across random samples of the same size.
se_mu <- sd(Boston$medv)/sqrt(nrow(Boston))
print(se_mu)
## [1] 0.4088611
mean.fn <- function(data, index) {
return(mean(data[index]))
}
set.seed(1)
boot_mean <- boot(data = Boston$medv, statistic = mean.fn, R = 1000)
print(boot_mean)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = mean.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007650791 0.4106622
Answer: The interval constructed manually using bootstrap standard error is almost identical to the output byt the t.test function.
low_bound <- mu_hat - (2 * 04119)
upp_bound <- mu_hat + (2 * 04119)
print(c(low_bound, upp_bound))
## [1] -8215.467 8260.533
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
median_hat <- median(Boston$medv)
print(median_hat)
## [1] 21.2
Answer: there is no standard equation for standard error of median, the boostrap provides a workaround. This standard error is small, showing that this sample median is a stable estimator.
median.fn <- function(data, index) {
return(median(data[index]))
}
set.seed(1)
boot_median <- boot(data = Boston$medv, statistic = median.fn, R = 1000)
print(boot_median)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = median.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 0.02295 0.3778075
quantile_hat <- quantile(Boston$medv, 0.1)
print(quantile_hat)
## 10%
## 12.75
quantile.fn <- function(data, index) {
return(quantile(data[index], 0.1))
}
set.seed(1)
boot_quantile <- boot(data = Boston$medv, statistic = quantile.fn, R = 1000)
print(boot_quantile)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = quantile.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0339 0.4767526