Question 3

(a) Description & Implementation

Question: Explain how k-fold cross-validation is implemented.

Answer:

(b) k-Fold CV vs. Validation Set & LOOCV

i. k-Fold CV vs Validation Set

Advantages:

Disadvantages:

ii. k-Fold CV vs LOOCV

Advantages:

Disadvantages:

Question 5

(a)

data = Default
glm.fit = glm(default ~ income + balance, data = data , family = binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

(b)

set.seed(123)
train = sample(dim(data)[1],round(dim(data)[1]/2),replace = FALSE)
glm.fit1 = glm(default~balance + income, data = data,subset = train,family = binomial)
prob1 = predict(glm.fit1,data[-train,],type = 'response')
class1 = rep('No',length(prob1))
class1[prob1>0.5] = 'Yes'

mean(class1 == data$default[-train])
## [1] 0.9724

Test error is about 0.0286 ~ 0.03

(C)

results = rep(0,3)
a = 1
seeds = c(10,100,50)

for (i in seeds) {
  set.seed(i)
  train = sample(dim(data)[1],round(dim(data)[1]/2),replace = FALSE)
glm.fit1 = glm(default~balance + income, data = data,subset = train,family = binomial)
prob1 = predict(glm.fit1,data[-train,],type = 'response')
class1 = rep('No',length(prob1))
class1[prob1>0.5] = 'Yes'

results[a]= mean(class1 == data$default[-train])
a = a + 1
}
results
## [1] 0.9712 0.9736 0.9774

All Test errors are about 0.029 - 0.022

(d)

set.seed(1)
train=sample(dim(data)[1],round(dim(data)[1]/2),replace=FALSE)
glm.fit1=glm(default~balance+income+student,data=data,subset=train,family=binomial)
prob1=predict(glm.fit1,data[-train,],type='response')
class1=rep('No',length(prob1))
class1[prob1>0.5]='Yes'
mean(class1==data$default[-train])
## [1] 0.974

Adding dummy variable does not reduce the test error rate

Question 6

(a)

glm.fit = glm(default ~ income + balance, data = data, family = binomial)
summary(glm.fit)$coefficients[,2]
##  (Intercept)       income      balance 
## 4.347564e-01 4.985167e-06 2.273731e-04

standard error for income and balance are 4.985e-06 and 2.274e-04, respectively

(b)

boot.fn <- function(data,index = 1:nrow(data)){
  return(coef(glm(default~income+balance,data=data,subset = index,family = binomial))[-1])
}

boot.fn(data)
##       income      balance 
## 2.080898e-05 5.647103e-03

(c)

library(boot)
set.seed(123)
boot(data,boot.fn,1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = data, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##         original       bias     std. error
## t1* 2.080898e-05 1.582518e-07 4.729534e-06
## t2* 5.647103e-03 1.296980e-05 2.217214e-04

(d)

bootstrap method can give similar results as standard formula.

Question 9.

(a)

library(MASS)
mean(Boston$medv)
## [1] 22.53281

(b)

sd(Boston$medv)/sqrt(length(Boston$medv))
## [1] 0.4088611

(c)

boot.fn2<-function(data,index){
  return(mean(data$medv[index]))
}
boot(Boston,boot.fn2,100)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.fn2, R = 100)
## 
## 
## Bootstrap Statistics :
##     original     bias    std. error
## t1* 22.53281 0.07288933   0.4182814

two results are very close as SE = .4088 and 0.4182

(d)

c(mean(Boston$medv) - 2*0.4182, mean(Boston$medv) + 2*0.4182)
## [1] 21.69641 23.36921
t.test(Boston$medv)
## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281

The two results are very close to each other

(e)

median(Boston$medv)
## [1] 21.2

(f)

boot.fnmed <- function(data,index){
  return(median(Boston$medv[index]))
}
boot(Boston,boot.fnmed,100)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.fnmed, R = 100)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2   0.056   0.3995123

(g)

quantile(Boston$medv,0.1)
##   10% 
## 12.75

(h)

boot.fnmed2<-function(data,index){
  return(quantile(Boston$medv[index],0.1))
}
boot(Boston,boot.fnmed2,100)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.fnmed2, R = 100)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75 -0.0805   0.5020109

Standard Error is slightly larger