Problem 3

  1. The k-fold cross-validation approach involves randomly dividing the set of observations into k groups, or folds, of approximately equal size. The first fold is treated as a validation set, and the method is fit on the remaining k-1 folds. The mean squared error is then computed on the observations in the held-out fold. This procedure is repeated k times, each time treating a different fold as the validation set, and the k-fold CV estimate is the average of the k resulting MSE values (a minimal sketch follows this list).

  2. In terms of bias and variance, the trade-offs of k-fold cross-validation are: compared to the validation set approach, k-fold CV has less variance (the estimate is averaged over k fits rather than depending on a single random split) and less bias (each model is trained on (k-1)/k of the data rather than roughly half), at the cost of more computation. Compared to LOOCV, k-fold CV has more bias but less variance, since the n LOOCV fits are trained on nearly identical data sets and are therefore highly correlated; it is also far cheaper to compute.
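
A minimal sketch of this procedure in R, on simulated data (the data frame df and its columns x and y are invented here purely for illustration; the ISLR labs use boot::cv.glm for the same purpose):

set.seed(1)
df <- data.frame(x = rnorm(100))                  # simulated predictor
df$y <- 2 * df$x + rnorm(100)                     # simulated response
k <- 5
folds <- sample(rep(1:k, length.out = nrow(df)))  # random fold assignment
cv.mse <- sapply(1:k, function(j) {
  fit <- lm(y ~ x, data = df[folds != j, ])       # fit on the other k-1 folds
  mean((df$y[folds == j] - predict(fit, df[folds == j, ]))^2)  # MSE on held-out fold
})
mean(cv.mse)                                      # the k-fold CV estimate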

Problem 5

require(ISLR)
## Loading required package: ISLR
data(Default)
set.seed(1)
fit1 <- glm(default ~ income + balance, data=Default, family=binomial)
summary(fit1)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
set.seed(1)
train <- sample(nrow(Default), nrow(Default)*0.5)
fit2 <- glm(default ~ income + balance, data=Default, family=binomial, subset=train)
prob2 <- predict(fit2, Default[-train,], type="response")
pred2 <- ifelse(prob2 > 0.5, "Yes", "No")
table(pred2, Default[-train,]$default)
##      
## pred2   No  Yes
##   No  4805  115
##   Yes   28   52
mean(Default[-train,]$default != pred2)  # test error
## [1] 0.0286

Test Error = 0.0286

Repeat 1

set.seed(2) 
train <- sample(nrow(Default), nrow(Default)*0.5)
fit2 <- glm(default ~ income + balance, data=Default, family=binomial, subset=train)
prob2 <- predict(fit2, Default[-train,], type="response")
pred2 <- ifelse(prob2 > 0.5, "Yes", "No")
mean(Default[-train,]$default != pred2)  
## [1] 0.0276

Test Error = 0.0276

Repeat 2

set.seed(3)  # Repeat 2
train <- sample(nrow(Default), nrow(Default)*0.5)
fit2 <- glm(default ~ income + balance, data=Default, family=binomial, subset=train)
prob2 <- predict(fit2, Default[-train,], type="response")
pred2 <- ifelse(prob2 > 0.5, "Yes", "No")
mean(Default[-train,]$default != pred2)  
## [1] 0.0248

Test Error = 0.0248

Repeat 3

set.seed(4)  # Repeat 3
train <- sample(nrow(Default), nrow(Default)*0.5)
fit2 <- glm(default ~ income + balance, data=Default, family=binomial, subset=train)
prob2 <- predict(fit2, Default[-train,], type="response")
pred2 <- ifelse(prob2 > 0.5, "Yes", "No")
mean(Default[-train,]$default != pred2) 
## [1] 0.0262

Test Error = 0.0262

The test error is consistent at around 2.5-2.9% across the four splits, so the variance of the validation-set estimate is not very high.
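
The four repeats above can be compacted into a single loop over seeds; this sketch reproduces the same four error rates (seeds 1 through 4):

errs <- sapply(1:4, function(s) {
  set.seed(s)
  train <- sample(nrow(Default), nrow(Default)*0.5)
  fit <- glm(default ~ income + balance, data=Default, family=binomial, subset=train)
  prob <- predict(fit, Default[-train,], type="response")
  mean(Default[-train,]$default != ifelse(prob > 0.5, "Yes", "No"))  # error per seed
})
errs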

set.seed(1)
train <- sample(nrow(Default), nrow(Default)*0.5)
fit3 <- glm(default ~ income + balance + student, data=Default, family=binomial, subset=train)
prob3 <- predict(fit3, Default[-train,], type="response")
pred3 <- ifelse(prob3 > 0.5, "Yes", "No")
mean(Default[-train,]$default != pred3) 
## [1] 0.0288

Test Error = 0.0288. Including the student variable does not lead to a significant reduction in the test error (a sketch of a more formal check follows).
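
As a sketch of a more formal check (not part of the original exercise), both models can be refit on the same training split and compared with a likelihood-ratio test via anova():

set.seed(1)
train <- sample(nrow(Default), nrow(Default)*0.5)
fit.a <- glm(default ~ income + balance, data=Default, family=binomial, subset=train)
fit.b <- glm(default ~ income + balance + student, data=Default, family=binomial, subset=train)
anova(fit.a, fit.b, test="Chisq")  # tests whether adding student improves the fit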

Problem 6

require(ISLR)
data(Default)
set.seed(1)
fit1 <- glm(default ~ income + balance, data=Default, family=binomial)
summary(fit1)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

Estimated standard errors using glm(): income = 4.985e-06, balance = 2.274e-04.
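
The same standard errors can be pulled out of the summary object programmatically:

summary(fit1)$coefficients[, "Std. Error"]  # named vector of coefficient standard errors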

set.seed(1)
boot.fn <- function(df, trainid) {
  # boot() supplies a vector of resampled row indices as the second argument;
  # refit the model on that bootstrap sample and return its coefficients
  return(coef(glm(default ~ income + balance, data=df, family=binomial, subset=trainid)))
}
boot.fn(Default, 1:nrow(Default))
##   (Intercept)        income       balance 
## -1.154047e+01  2.080898e-05  5.647103e-03
require(boot)
## Loading required package: boot
boot(Default, boot.fn, R=100)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01  9.699111e-02 4.101121e-01
## t2*  2.080898e-05  6.715005e-08 4.127740e-06
## t3*  5.647103e-03 -5.733883e-05 2.105660e-04

Standard errors using the bootstrap (R=100): income = 4.127740e-06, balance = 2.105660e-04.

With R=100, the standard errors from the two approaches are quite close: income = 4.985e-06 (GLM) vs 4.127740e-06 (bootstrap); balance = 2.274e-04 (GLM) vs 2.105660e-04 (bootstrap).
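
With more replicates the bootstrap estimates would typically stabilize further; a sketch (output not shown, and slower to run):

set.seed(1)
boot(Default, boot.fn, R=1000)  # same statistic with 1000 bootstrap replicates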

Problem 9

require(MASS)
## Loading required package: MASS
require(boot)
data(Boston)
(medv.mu <- mean(Boston$medv))
## [1] 22.53281

Estimated population mean of medv: mu hat = 22.53281.

(medv.sd <- sd(Boston$medv)/sqrt(nrow(Boston)))
## [1] 0.4088611

Estimate of the standard error = 0.4088611, using the usual formula SE(mu hat) = s / sqrt(n).

set.seed(1)
mean.fn <- function(var, id) {
  return(mean(var[id]))
}
(boot.res <- boot(Boston$medv, mean.fn, R=100))
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = mean.fn, R = 100)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 22.53281 -0.04836957   0.3815554

Bootstrap estimate of the standard error = 0.3815554, which is very close to the formula-based estimate of 0.4088611 above.

boot.res$t0 - 2*sd(boot.res$t)  # lower bound
## [1] 21.7697
boot.res$t0 + 2*sd(boot.res$t)  # upper bound
## [1] 23.29592
t.test(Boston$medv)
## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281

Approximate 95% confidence interval from the bootstrap: [21.7697, 23.29592], computed as mu hat ± 2 × SE. This is very close to the t-test interval [21.72953, 23.33608]; both are centered on the sample mean of 22.53281.
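
The boot package can also compute confidence intervals directly from the boot object; a sketch (output not shown):

boot.ci(boot.res, type=c("norm", "perc"))  # normal-approximation and percentile intervals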

(medv.median <- median(Boston$medv))
## [1] 21.2

medv.median = 21.2

set.seed(1)
median.fn <- function(var, id) {
  return(median(var[id]))
}
(boot.res <- boot(Boston$medv, median.fn, R=100))
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = median.fn, R = 100)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2  -0.026   0.3683488

Bootstrap estimate of the standard error of the median = 0.3683488. There is no simple formula for the standard error of a median, which is why the bootstrap is useful here; the estimate is comparable in size to the standard error of the mean found above.

(medv.mu10 <- quantile(Boston$medv, 0.1))
##   10% 
## 12.75

The 10th percentile estimate of medv in Boston suburbs = 12.75.

set.seed(1)
quantile10.fn <- function(var, id) {
  return(quantile(var[id], 0.1))
}
(boot.res <- boot(Boston$medv, quantile10.fn, R=100))
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = quantile10.fn, R = 100)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75 -0.0985   0.4988094

Estimated standard error = 0.4988094, which is small relative to the 10th percentile estimate of 12.75.