(A.) In k-fold cross-validation, the training set is randomly divided into k groups (folds) of roughly equal size. The first fold is treated as the validation set, and the model is fit on the remaining k-1 folds; the MSE is then computed on the held-out fold. This procedure is repeated k times, with a different fold serving as the validation set each time, and the k-fold CV estimate is the average of the k resulting MSEs.
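To make the procedure concrete, here is a minimal sketch of 10-fold CV for a simple linear model on the built-in mtcars data (the dataset and model are illustrative assumptions, not part of the question):
set.seed(1)
k = 10
folds = sample(rep(1:k, length.out = nrow(mtcars)))   # random fold labels
cv.mse = rep(NA, k)
for (i in 1:k) {
  fit = lm(mpg~wt, data = mtcars[folds != i, ])       # fit on the other k-1 folds
  pred = predict(fit, mtcars[folds == i, ])           # predict the held-out fold
  cv.mse[i] = mean((mtcars$mpg[folds == i] - pred)^2)
}
mean(cv.mse)                                          # k-fold estimate of test MSE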
(B.) i. Relative to the validation set approach: the k-fold estimate of the test error has less variance, and it is more accurate because every observation is eventually used for both training and validation rather than the model being fit on only half the data. ii. Relative to LOOCV: k-fold CV has a computational advantage, since fewer model fits are required; with k = 10 only 10 models need to be fitted, whereas LOOCV requires n fits. K-fold CV has higher bias than LOOCV, because each training set contains fewer observations, but it tends to have lower variance.
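As an illustration of the computational point in ii., boot::cv.glm refits the model K times for K-fold CV but n times for LOOCV (again a sketch on mtcars, an assumption chosen only for illustration):
library(boot)
glm.fit = glm(mpg~wt, data = mtcars)       # gaussian glm, equivalent to lm
cv.glm(mtcars, glm.fit, K = 10)$delta[1]   # 10-fold CV: 10 refits
cv.glm(mtcars, glm.fit)$delta[1]           # LOOCV default: nrow(mtcars) = 32 refits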
#Start Question5
#(A.) & (B.)
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(MASS)
set.seed(1)
require(caTools)
## Loading required package: caTools
## Warning: package 'caTools' was built under R version 3.6.3
df = Default
sample_data = sample.split(df$default, SplitRatio = 0.50)
training.set = subset(df, sample_data==TRUE)
test.set = subset(df, sample_data==FALSE)
test.default = test.set$default
lr.fit = glm(default~income+balance, data = training.set, family = binomial)
lr.probs = predict(lr.fit, test.set, type = "response")
lr.preds = rep("No", length(test.set$default))
lr.preds[lr.probs>0.5] = "Yes"
table(lr.preds, test.default)
## test.default
## lr.preds No Yes
## No 4820 111
## Yes 13 56
mean(lr.preds!=test.default)
## [1] 0.0248
#Test error rate of 2.5%.
#(C.)
set.seed(12)
sample_data = sample.split(df$default, SplitRatio = 0.50)
training.set = subset(df, sample_data==TRUE)
test.set = subset(df, sample_data==FALSE)
test.default = test.set$default
lr.fit = glm(default~income+balance, data = training.set, family = binomial)
lr.probs = predict(lr.fit, test.set, type = "response")
lr.preds = rep("No", length(test.set$default))
lr.preds[lr.probs>0.5] = "Yes"
table(lr.preds,test.default)
## test.default
## lr.preds No Yes
## No 4810 113
## Yes 23 54
mean(lr.preds!=test.default)
## [1] 0.0272
#Test error rate of 2.7%.
set.seed(123)
sample_data = sample.split(df$default, SplitRatio = 0.50)
training.set = subset(df, sample_data==TRUE)
test.set = subset(df, sample_data==FALSE)
test.default = test.set$default
lr.fit = glm(default~income+balance, data = training.set, family = binomial)
lr.probs = predict(lr.fit,test.set, type = "response")
lr.preds = rep("No",length(test.set$default))
lr.preds[lr.probs>0.5] = "Yes"
table(lr.preds,test.default)
## test.default
## lr.preds No Yes
## No 4818 106
## Yes 15 61
mean(lr.preds!=test.default)
## [1] 0.0242
#Test error rate of 2.4%
#The three test error rates (2.5%, 2.7%, 2.4%) are similar to each other, so the validation estimate has fairly low variance across splits; see the sketch below.
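#A compact way to repeat the experiment is to loop over the seeds; a sketch
#assuming the same df, sample.split() and 0.5 cutoff as above:
sapply(c(1, 12, 123), function(s) {
  set.seed(s)
  idx = sample.split(df$default, SplitRatio = 0.50)
  fit = glm(default~income+balance, data = subset(df, idx), family = binomial)
  probs = predict(fit, subset(df, !idx), type = "response")
  mean(ifelse(probs > 0.5, "Yes", "No") != subset(df, !idx)$default)  # test error
})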
#(D.)
lr.fit = glm(default~income+balance+student, data=training.set, family = binomial)
lr.probs = predict(lr.fit,test.set, type = "response")
lr.preds = rep("No",length(test.set$default))
lr.preds[lr.probs>0.5] = "Yes"
contrasts(Default$student)
## Yes
## No 0
## Yes 1
table(lr.preds,test.default)
## test.default
## lr.preds No Yes
## No 4815 107
## Yes 18 60
mean(lr.preds!=test.default)
## [1] 0.025
#Test error rate of 2.5%. Adding the student dummy variable does not reduce the test error rate.
#End Question5
#Start Question6
#(A.)
set.seed(111)
lr.fit2 = glm(default~income+balance, data = df, family = binomial)
summary(lr.fit2)$coefficients[2:3,2]
## income balance
## 4.985167e-06 2.273731e-04
#(B.)
boot.fn = function(data, index){
# Build the bootstrap sample; glm() below finds these variables in the
# function environment, since no data argument is supplied.
default = data$default[index]
income = data$income[index]
balance = data$balance[index]
lr.fit2 = glm(default~income+balance, family = binomial)
# Return the formula-based SEs of the income and balance coefficients
# computed on this bootstrap sample.
return(summary(lr.fit2)$coefficients[2:3,2])
}
boot.fn(df, 1:nrow(df))
## income balance
## 4.985167e-06 2.273731e-04
#(C.)
library(boot)
boot(df, boot.fn, 100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = df, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 4.985167e-06 2.169422e-08 1.506607e-07
## t2* 2.273731e-04 4.540846e-07 1.084125e-05
#(D.)
#The standard errors are slightly lower for both the income and balance coefficients. In this case, bootstrapping reduces the standard error estimates for the coefficients.
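#Note: ISLR's version of boot.fn returns the coefficient estimates themselves
#rather than their formula SEs, so that the std. error column of boot() directly
#estimates the standard error of the coefficients. A sketch of that variant
#(boot.coef.fn is an illustrative name, not used elsewhere in this answer):
boot.coef.fn = function(data, index){
  fit = glm(default~income+balance, data = data, subset = index,
            family = binomial)
  return(coef(fit)[2:3])  # estimates for income and balance, not their SEs
}
#boot(df, boot.coef.fn, 100) would then report their bootstrap standard errors.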
#End Question6
#Start Question9
library(MASS)
set.seed(1)
attach(Boston)
#(A.)
medv.mean = mean(medv)
medv.mean
## [1] 22.53281
#(B.)
medv.err = sd(medv)/sqrt(length(medv))
medv.err
## [1] 0.4088611
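#This is the usual plug-in formula SE(xbar) = sd(medv)/sqrt(n): sd(medv) is
#about 9.197 and n = 506, giving 9.197/sqrt(506) = 0.4089.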
#(C.)
boot.fn = function(data, index) return(mean(data[index]))
library(boot)
bstrap = boot(medv, boot.fn, 1000)
bstrap
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007650791 0.4106622
#Similar to the answer from (B.) (0.4107 vs 0.4089).
#(D.)
t.test(medv)
##
## One Sample t-test
##
## data: medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
c(bstrap$t0 - 2 * 0.4107, bstrap$t0 + 2 * 0.4107)
## [1] 21.71141 23.35421
#The bootstrap interval endpoints are only about 0.02 away from the t.test endpoints.
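#The hardcoded standard error can be recomputed from the stored bootstrap
#replicates instead of being typed in manually:
boot.se = sd(bstrap$t[, 1])  # SE of the mean from the 1000 bootstrap replicates
c(bstrap$t0 - 2 * boot.se, bstrap$t0 + 2 * boot.se)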
#(E.)
medv.med = median(medv)
medv.med
## [1] 21.2
#(F.)
boot.fn = function(data, index) return(median(data[index]))
boot(medv, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0386 0.3770241
#Median of 21.2 with SE of 0.377; the standard error is small relative to the median. Unlike the mean, there is no simple closed-form formula for the standard error of the median, which is why the bootstrap is useful here.
#(G.)
medv.tenth = quantile(medv, c(0.1))
medv.tenth
## 10%
## 12.75
#(H.)
boot.fn = function(data, index) return(quantile(data[index], c(0.1)))
boot(medv, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0186 0.4925766
#Tenth percentile of 12.75 with SE of 0.493; again, a small standard error relative to the estimate.
#End Question9