Chapter 05 (page 219): 3, 5, 6, 9
Libraries:
library(ISLR)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(boot)
First, divide the dataset into k equal-sized parts (these are the folds). Hold out the first fold, fit the model on the remaining k - 1 folds, and evaluate the predictions on the held-out fold. Repeat this k times, holding out a different fold each time. Finally, average the k resulting MSEs to obtain the estimated validation error rate.
Advantages of k-fold relative to the validation set approach: every observation is eventually used for both fitting and validation, so no data are wasted by a single split, and the error estimate is more reliable because it does not depend so heavily on which observations happen to land in the training set.
Disadvantages of k-fold relative to the validation set approach: it requires more computation, since the model is trained k times instead of once.
Advantages of k-fold relative to LOOCV: much less computation when the dataset is large (k fits instead of n), and lower variance in the estimated test error, since the n LOOCV fits are highly correlated with one another.
Disadvantages of k-fold relative to LOOCV: somewhat more bias, depending on the choice of k, because each training set contains fewer observations than the n - 1 used by LOOCV. A 10-fold sketch on the Default data follows below.
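A minimal sketch of 10-fold CV on the Default data, using cv.glm from the boot package loaded above (the names cost, fit_all, and cv_out are new to this sketch, not part of the original solution):
set.seed(1)
# Misclassification rate at a 0.5 threshold: 'actual' is the observed 0/1
# response, 'prob' the predicted probability
cost <- function(actual, prob) mean(abs(actual - prob) > 0.5)
fit_all <- glm(default ~ income + balance, data = Default, family = binomial)
cv_out <- cv.glm(Default, fit_all, cost = cost, K = 10)
cv_out$delta[1]  # estimated 10-fold CV error rate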
Setting seed:
set.seed(1)
head(Default)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
lrm1 <- glm(default ~ income + balance, data = Default, family = binomial)
summary(lrm1)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
50% training and 50% validation split
train_idx <- sample(nrow(Default), nrow(Default) / 2)
train <- Default[train_idx, ]
val <- Default[-train_idx, ]
lrm1_train <- glm(default ~ income + balance, data = train, family = binomial)
summary(lrm1_train)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5830 -0.1428 -0.0573 -0.0213 3.3395
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.194e+01 6.178e-01 -19.333 < 2e-16 ***
## income 3.262e-05 7.024e-06 4.644 3.41e-06 ***
## balance 5.689e-03 3.158e-04 18.014 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1523.8 on 4999 degrees of freedom
## Residual deviance: 803.3 on 4997 degrees of freedom
## AIC: 809.3
##
## Number of Fisher Scoring iterations: 8
probs <- predict(lrm1_train, newdata = val, type = "response")
pred <- ifelse(probs > 0.5, "Yes", "No")
pred <- factor(pred, levels = c("No", "Yes"))
head(pred)
## 1 2 3 5 8 9
## No No No No No No
## Levels: No Yes
val_error <- mean(pred != val$default)
val_error
## [1] 0.0254
70% training and 30% validation split
set.seed(1)
train_idx <- sample(nrow(Default), nrow(Default) * 0.7)
train <- Default[train_idx, ]
val <- Default[-train_idx, ]
lrm1_train2 <- glm(default ~ income + balance, data = train, family = binomial)
summary(lrm1_train2)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4481 -0.1402 -0.0561 -0.0211 3.3484
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.167e+01 5.214e-01 -22.379 < 2e-16 ***
## income 2.560e-05 6.012e-06 4.258 2.06e-05 ***
## balance 5.574e-03 2.678e-04 20.816 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2030.3 on 6999 degrees of freedom
## Residual deviance: 1079.6 on 6997 degrees of freedom
## AIC: 1085.6
##
## Number of Fisher Scoring iterations: 8
probs <- predict(lrm1_train2, newdata = val, type = "response")
pred <- ifelse(probs > 0.5, "Yes", "No")
pred <- factor(pred, levels = c("No", "Yes"))
head(pred)
## 1 10 11 12 13 17
## No No No No No No
## Levels: No Yes
val_error <- mean(pred != val$default)
val_error
## [1] 0.02666667
80% training and 20% validation split
set.seed(1)
train_idx <- sample(nrow(Default), nrow(Default) * 0.8)
train <- Default[train_idx, ]
val <- Default[-train_idx, ]
lrm1_train3 <- glm(default ~ income + balance, data = train, family = binomial)
summary(lrm1_train3)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4758 -0.1413 -0.0563 -0.0210 3.4620
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.168e+01 4.893e-01 -23.879 < 2e-16 ***
## income 2.547e-05 5.631e-06 4.523 6.1e-06 ***
## balance 5.613e-03 2.531e-04 22.176 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2313.6 on 7999 degrees of freedom
## Residual deviance: 1239.2 on 7997 degrees of freedom
## AIC: 1245.2
##
## Number of Fisher Scoring iterations: 8
probs <- predict(lrm1_train3, newdata = val, type = "response")
pred <- ifelse(probs > 0.5, "Yes", "No")
pred <- factor(pred, levels = c("No", "Yes"))
head(pred)
## 1 11 12 13 17 20
## No No No No No No
## Levels: No Yes
val_error <- mean(pred != val$default)
val_error
## [1] 0.026
60% training and 40% validation split
set.seed(1)
train_idx <- sample(nrow(Default), nrow(Default) * 0.6)
train <- Default[train_idx, ]
val <- Default[-train_idx, ]
lrm1_train4 <- glm(default ~ income + balance, data = train, family = binomial)
summary(lrm1_train4)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4930 -0.1401 -0.0559 -0.0208 3.3468
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.175e+01 5.607e-01 -20.95 < 2e-16 ***
## income 2.614e-05 6.439e-06 4.06 4.9e-05 ***
## balance 5.643e-03 2.882e-04 19.58 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1793.96 on 5999 degrees of freedom
## Residual deviance: 937.47 on 5997 degrees of freedom
## AIC: 943.47
##
## Number of Fisher Scoring iterations: 8
probs <- predict(lrm1_train4, newdata = val, type = "response")
pred <- ifelse(probs > 0.5, "Yes", "No")
pred <- factor(pred, levels = c("No", "Yes"))
head(pred)
## 1 10 11 12 13 17
## No No No No No No
## Levels: No Yes
val_error <- mean(pred != val$default)
val_error
## [1] 0.025
Across the four splits the validation error stays in a narrow range (roughly 2.5% to 2.7%), so the estimate is fairly stable with respect to how the data are divided. The four code blocks above repeat the same steps; a helper function, sketched below, makes this less error-prone.
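A compact way to reproduce the four splits (a sketch; val_error_for is a hypothetical helper name, not part of the original code):
# Fit on a random training fraction and return the validation-set error rate
val_error_for <- function(frac, seed = 1) {
  set.seed(seed)
  idx <- sample(nrow(Default), floor(nrow(Default) * frac))
  fit <- glm(default ~ income + balance, data = Default[idx, ], family = binomial)
  prob <- predict(fit, newdata = Default[-idx, ], type = "response")
  pred <- ifelse(prob > 0.5, "Yes", "No")
  mean(pred != Default$default[-idx])
}
sapply(c(0.5, 0.6, 0.7, 0.8), val_error_for)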
Fitting the model with student added (reusing the 60/40 split from above):
lrm2 <- glm(default ~ income + balance + student, data = train, family = binomial)
Estimating the test error:
probs2 <- predict(lrm2, newdata = val, type = "response")
pred2 <- ifelse(probs2 > 0.5, "Yes", "No")
val_error2 <- mean(pred2 != val$default)
val_error2
## [1] 0.026
Including the student dummy variable does not reduce the test error: 0.026 with student versus 0.025 without, on the same 60/40 split.
set.seed(1)
log <- glm(default ~ income + balance, data = Default, family = binomial)
summary(log)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
boot.fn <- function(data, index) {
  # Refit the logistic regression on the bootstrap resample given by index
  re_data <- data[index, ]
  model <- glm(default ~ income + balance, data = re_data, family = binomial)
  return(coef(model)[c("income", "balance")])
}
bootstrap <- boot(data = Default, statistic = boot.fn, R = 1000)
print(bootstrap)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 2.080898e-05 1.680317e-07 4.866284e-06
## t2* 5.647103e-03 1.855765e-05 2.298949e-04
boot_coef <- bootstrap$t0  # t0 holds the original coefficient estimates, not the standard errors
boot_coef
## income balance
## 2.080898e-05 5.647103e-03
log_se <- summary(log)$coefficients[, "Std. Error"]
log_se
## (Intercept) income balance
## 4.347564e-01 4.985167e-06 2.273731e-04
The bootstrap standard errors printed by boot() (4.87e-06 for income and 2.30e-04 for balance) are very close to the formula-based standard errors from glm() (4.99e-06 and 2.27e-04), so the two approaches agree closely for this model.
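The same standard errors can also be recovered directly from the bootstrap replicates (a short sketch; boot_se is a new name introduced here). bootstrap$t is the R-by-2 matrix of coefficients from the 1000 resamples:
boot_se <- apply(bootstrap$t, 2, sd)  # column-wise SD of the replicates
names(boot_se) <- c("income", "balance")
boot_se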
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
set.seed(1)
mu_hat <- mean(Boston$medv)
mu_hat
## [1] 22.53281
sample_sd <- sd(Boston$medv)
num_obs <- length(Boston$medv)
SE_mu_hat <- sample_sd/sqrt(num_obs)
SE_mu_hat
## [1] 0.4088611
Using SE(mu_hat) = s / sqrt(n), the estimated standard error is about 0.409. This means the sample mean is a fairly precise estimate of the true population mean of medv: if we repeatedly sampled from the population and computed the mean each time, the standard deviation of those means would be about 0.4089.
boot.fn2 <- function(data, index) {
  # Statistic: mean of the bootstrap resample
  mu_hat <- mean(data[index])
  return(mu_hat)
}
boot(Boston$medv, boot.fn2, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn2, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007650791 0.4106622
The bootstrap estimate of the standard error of mu_hat is 0.4107, very similar to the analytic estimate of 0.4089 found in (b). Again, if we repeatedly sampled from the population and computed the mean each time, the standard deviation of those means would be about 0.4107.
# Approximate 95% CI: mu_hat +/- 2 * SE(mu_hat)
CI_of_mu_hat <- c(22.5328 - 2 * 0.4107, 22.5328 + 2 * 0.4107)
CI_of_mu_hat
## [1] 21.7114 23.3542
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
The bootstrap interval (21.71, 23.35) is very similar to the t.test interval (21.73, 23.34).
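The boot package can also build the interval directly from a saved boot object (a sketch; boot_mean is a new name for the object produced in the previous part, which was not assigned there):
set.seed(1)
boot_mean <- boot(Boston$medv, boot.fn2, R = 1000)
boot.ci(boot_mean, type = c("norm", "perc"))  # normal and percentile intervals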
hat_mu_med <- median(Boston$medv)
hat_mu_med
## [1] 21.2
boot.fn.median <- function(data, index){
return(median(data[index]))
}
boot_med<- boot(data = Boston$medv, statistic = boot.fn.median, R = 1000)
boot_med
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn.median, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0386 0.3770241
The bootstrap standard error of the median is 0.3770: if we repeatedly sampled from the population and computed the median each time, the standard deviation of those medians would be about 0.377. This is slightly smaller than the standard error of the mean (0.4107), suggesting the median is a more stable estimate for this dataset, likely because it is robust to outlying medv values.
quantile(Boston$medv, 0.1)
## 10%
## 12.75
boot.fn3 <- function(data, index) {
return(quantile(data[index],0.1))
}
bootstrap_quant <- boot(data = Boston$medv, statistic = boot.fn3, R = 1000)
bootstrap_quant
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn3, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0186 0.4925766
The bootstrap estimate of the standard error of the 10th percentile is 0.4926: if we repeatedly sampled from the population and computed the 10th percentile each time, the standard deviation of those estimates would be about 0.4926.
Of the three statistics, the median therefore has the smallest bootstrap standard error (0.377, versus 0.411 for the mean and 0.493 for the 10th percentile), making it the most stable of these estimates for this dataset.