(A.) In k-fold cross-validation, the training set is randomly divided into k groups (folds) of roughly equal size. The first fold is treated as the validation set, and the model is fit on the remaining k-1 folds; the MSE is then computed on the held-out fold. This procedure is repeated k times, with a different fold serving as the validation set each time, and the k-fold CV estimate is the average of the k resulting MSEs.
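To make the procedure concrete, here is a minimal sketch of 10-fold CV for a simple linear model on the built-in mtcars data (the dataset and model are illustrative assumptions, not part of the question):
set.seed(1)
k = 10
folds = sample(rep(1:k, length.out = nrow(mtcars)))   # random fold labels
cv.mse = rep(NA, k)
for (i in 1:k) {
  fit = lm(mpg~wt, data = mtcars[folds != i, ])       # fit on the other k-1 folds
  pred = predict(fit, mtcars[folds == i, ])           # predict the held-out fold
  cv.mse[i] = mean((mtcars$mpg[folds == i] - pred)^2)
}
mean(cv.mse)                                          # k-fold estimate of test MSE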
(B.) i. Relative to the validation set approach: the k-fold estimate of the test error has less variance, and it is more accurate because every observation is eventually used for both training and validation rather than the model being fit on only half the data. ii. Relative to LOOCV: k-fold CV has a computational advantage, since fewer model fits are required; with k = 10 only 10 models need to be fitted, whereas LOOCV requires n fits. K-fold CV has higher bias than LOOCV, because each training set contains fewer observations, but it tends to have lower variance.
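As an illustration of the computational point in ii., boot::cv.glm refits the model K times for K-fold CV but n times for LOOCV (again a sketch on mtcars, an assumption chosen only for illustration):
library(boot)
glm.fit = glm(mpg~wt, data = mtcars)       # gaussian glm, equivalent to lm
cv.glm(mtcars, glm.fit, K = 10)$delta[1]   # 10-fold CV: 10 refits
cv.glm(mtcars, glm.fit)$delta[1]           # LOOCV default: nrow(mtcars) = 32 refits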
#Start Question5
#(A.) & (B.)
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(MASS)
set.seed(1)
require(caTools)
## Loading required package: caTools
## Warning: package 'caTools' was built under R version 3.6.3
df = Default
sample_data = sample.split(df$default, SplitRatio = 0.50)
training.set = subset(df, sample_data==TRUE)
test.set = subset(df, sample_data==FALSE)
test.default = test.set$default
lr.fit = glm(default~income+balance, data = training.set, family = binomial)
lr.probs = predict(lr.fit, test.set, type = "response")
lr.preds = rep("No", length(test.set$default))
lr.preds[lr.probs>0.5] = "Yes"
table(lr.preds, test.default)
## test.default
## lr.preds No Yes
## No 4820 111
## Yes 13 56
mean(lr.preds!=test.default)
## [1] 0.0248
#Test error rate of 2.5%.
#(C.)
set.seed(12)
sample_data = sample.split(df$default, SplitRatio = 0.50)
training.set = subset(df, sample_data==TRUE)
test.set = subset(df, sample_data==FALSE)
test.default = test.set$default
lr.fit = glm(default~income+balance, data = training.set, family = binomial)
lr.probs = predict(lr.fit, test.set, type = "response")
lr.preds = rep("No", length(test.set$default))
lr.preds[lr.probs>0.5] = "Yes"
table(lr.preds,test.default)
## test.default
## lr.preds No Yes
## No 4810 113
## Yes 23 54
mean(lr.preds!=test.default)
## [1] 0.0272
#Test error rate of 2.7%.
set.seed(123)
sample_data = sample.split(df$default, SplitRatio = 0.50)
training.set = subset(df, sample_data==TRUE)
test.set = subset(df, sample_data==FALSE)
test.default = test.set$default
lr.fit = glm(default~income+balance, data = training.set, family = binomial)
lr.probs = predict(lr.fit,test.set, type = "response")
lr.preds = rep("No",length(test.set$default))
lr.preds[lr.probs>0.5] = "Yes"
table(lr.preds,test.default)
## test.default
## lr.preds No Yes
## No 4818 106
## Yes 15 61
mean(lr.preds!=test.default)
## [1] 0.0242
#Test error rate of 2.4%
#The three test error rates (2.5%, 2.7%, 2.4%) are similar to each other, so the validation estimate has fairly low variance across splits; see the sketch below.
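#A compact way to repeat the experiment is to loop over the seeds; a sketch
#assuming the same df, sample.split() and 0.5 cutoff as above:
sapply(c(1, 12, 123), function(s) {
  set.seed(s)
  idx = sample.split(df$default, SplitRatio = 0.50)
  fit = glm(default~income+balance, data = subset(df, idx), family = binomial)
  probs = predict(fit, subset(df, !idx), type = "response")
  mean(ifelse(probs > 0.5, "Yes", "No") != subset(df, !idx)$default)  # test error
})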
#(D.)
lr.fit = glm(default~income+balance+student, data=training.set, family = binomial)
lr.probs = predict(lr.fit,test.set, type = "response")
lr.preds = rep("No",length(test.set$default))
lr.preds[lr.probs>0.5] = "Yes"
contrasts(Default$student)
## Yes
## No 0
## Yes 1
table(lr.preds,test.default)
## test.default
## lr.preds No Yes
## No 4815 107
## Yes 18 60
mean(lr.preds!=test.default)
## [1] 0.025
#Test error rate of 2.5%. Adding the student dummy variable does not reduce the test error rate.
#End Question5
#Start Question6
#(A.)
set.seed(111)
lr.fit2 = glm(default~income+balance, data = df, family = binomial)
summary(lr.fit2)$coefficients[2:3,2]
## income balance
## 4.985167e-06 2.273731e-04
#(B.)
boot.fn = function(data, index){
# Build the bootstrap sample; glm() below finds these variables in the
# function environment, since no data argument is supplied.
default = data$default[index]
income = data$income[index]
balance = data$balance[index]
lr.fit2 = glm(default~income+balance, family = binomial)
# Return the formula-based SEs of the income and balance coefficients
# computed on this bootstrap sample.
return(summary(lr.fit2)$coefficients[2:3,2])
}
boot.fn(df, 1:nrow(df))
## income balance
## 4.985167e-06 2.273731e-04
#(C.)
library(boot)
boot(df, boot.fn, 100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = df, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 4.985167e-06 2.169422e-08 1.506607e-07
## t2* 2.273731e-04 4.540846e-07 1.084125e-05
#(D.)
#The standard errors are slightly lower for both the income and balance coefficients. In this case, bootstrapping reduces the standard error estimates for the coefficients.
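#Note: ISLR's version of boot.fn returns the coefficient estimates themselves
#rather than their formula SEs, so that the std. error column of boot() directly
#estimates the standard error of the coefficients. A sketch of that variant
#(boot.coef.fn is an illustrative name, not used elsewhere in this answer):
boot.coef.fn = function(data, index){
  fit = glm(default~income+balance, data = data, subset = index,
            family = binomial)
  return(coef(fit)[2:3])  # estimates for income and balance, not their SEs
}
#boot(df, boot.coef.fn, 100) would then report their bootstrap standard errors.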
#End Question6
#Start Question9
library(MASS)
set.seed(1)
attach(Boston)
#(A.)
medv.mean = mean(medv)
medv.mean
## [1] 22.53281
#(B.)
medv.err = sd(medv)/sqrt(length(medv))
medv.err
## [1] 0.4088611
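#This is the usual plug-in formula SE(xbar) = sd(medv)/sqrt(n): sd(medv) is
#about 9.197 and n = 506, giving 9.197/sqrt(506) = 0.4089.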
#(C.)
boot.fn = function(data, index) return(mean(data[index]))
library(boot)
bstrap = boot(medv, boot.fn, 1000)
bstrap
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007650791 0.4106622
#Similar to the answer from (B.) (0.4107 vs 0.4089).
#(D.)
t.test(medv)
##
## One Sample t-test
##
## data: medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
c(bstrap$t0 - 2 * 0.4107, bstrap$t0 + 2 * 0.4107)
## [1] 21.71141 23.35421
#The bootstrap interval endpoints are only about 0.02 away from the t.test endpoints.
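#The hardcoded standard error can be recomputed from the stored bootstrap
#replicates instead of being typed in manually:
boot.se = sd(bstrap$t[, 1])  # SE of the mean from the 1000 bootstrap replicates
c(bstrap$t0 - 2 * boot.se, bstrap$t0 + 2 * boot.se)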
#(E.)
medv.med = median(medv)
medv.med
## [1] 21.2
#(F.)
boot.fn = function(data, index) return(median(data[index]))
boot(medv, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0386 0.3770241
#Median of 21.2 with SE of 0.377; the standard error is small relative to the median. Unlike the mean, there is no simple closed-form formula for the standard error of the median, which is why the bootstrap is useful here.
#(G.)
medv.tenth = quantile(medv, c(0.1))
medv.tenth
## 10%
## 12.75
#(H.)
boot.fn = function(data, index) return(quantile(data[index], c(0.1)))
boot(medv, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0186 0.4925766
#Tenth percentile of 12.75 with SE of 0.493; again, a small standard error relative to the estimate.
#End Question9