library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
library(boot)
## Warning: package 'boot' was built under R version 4.0.4
library(MASS)

#Question 3 k-fold cross-validation

3a. Explain how k-fold cross-validation is implemented.

The data is divided into K different parts (K = 5 or K = 10). We then remove the first part, fit the model on the remaining K-1 parts, and see how good the predicitions are. This process is repeated K different times taking out a different part each time. The k-fold cross-validation estimate is computed by averaging these values.

3b. What are the advantages and disadvantages of k-fold cross-validation relative to: i. The validation set approach? ii. LOOCV?

The validation set approach is simple and easy to implement, but the MSE can be higly variable and only a subset of observations are used to fit the model. LOOCV has less bias and produces a less variable MSE. However, LOOCV has higher variance than k-fold cross-validation. K-folds is typically used more because there is evidence that it yields test error rate estimates that suffer from neither from excessively high bias, nor from very high variance.

#Question 5

data("Default")

5A.

set.seed(3)

glm_model_1 = glm(default ~ income + balance, data = Default, family = "binomial" )

5B.

train = sample(1:1000, 500)


glm_model_2 = glm(default ~ income + balance, data = Default, subset = train, family = "binomial")

preds = predict(glm_model_2, Default[-train,], type = 'response') #taking away data/observations used in train
pred_class = ifelse(preds > .05, "Yes", "No")

mean(pred_class!=Default[-train, 'default'])
## [1] 0.1197895

5C

train1 = sample(1:1000, 500)


glm_model_3 = glm(default ~ income + balance, data = Default, subset = train1, family = "binomial")

preds = predict(glm_model_3, Default[-train1,], type = 'response') #taking away data/observations used in train
pred_class = ifelse(preds > .05, "Yes", "No")

mean(pred_class!=Default[-train1, 'default'])
## [1] 0.1265263
train2 = sample(1:1000, 500)


glm_model_4 = glm(default ~ income + balance, data = Default, subset = train2, family = "binomial")

preds = predict(glm_model_3, Default[-train2,], type = 'response') #taking away data/observations used in train
pred_class = ifelse(preds > .05, "Yes", "No")

mean(pred_class!=Default[-train2, 'default'])
## [1] 0.1274737
train3 = sample(1:1000, 500)


glm_model_5 = glm(default ~ income + balance, data = Default, subset = train3, family = "binomial")

preds = predict(glm_model_5, Default[-train3,], type = 'response') #taking away data/observations used in train
pred_class = ifelse(preds > .05, "Yes", "No")

mean(pred_class!=Default[-train3, 'default'])
## [1] 0.1257895

5D

train = sample(1:1000, 500)

glm_model_6 = glm(default ~., data = Default, subset = train, family = "binomial")

preds = predict(glm_model_6, Default[-train,], type = 'response') #taking away data/observations used in train
pred_class = ifelse(preds > .05, "Yes", "No")

mean(pred_class!=Default[-train, 'default'])
## [1] 0.1057895

Adding students seems to reduce the error a but it is not a substantial difference

#Question 6

set.seed(1)

6A

glm_model_7 = glm(default ~ income + balance, data = Default, family = "binomial")

summary(glm_model_7)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

6B

boot.fn = function(data, index) {
   return(coef(glm(default ~ income + balance,data=data,subset=index, family = "binomial")))
}

set.seed(1)
boot.fn(Default, sample(1:1000, 500, replace = T))
##   (Intercept)        income       balance 
## -1.029272e+01  1.558656e-05  5.236697e-03

6C

boot(Default, boot.fn, R=1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01 -3.933672e-02 4.343265e-01
## t2*  2.080898e-05  1.671659e-07 4.867306e-06
## t3*  5.647103e-03  1.850346e-05 2.298227e-04

6D

The estimations used with the glm() and bootstrap are similar.

#Question 9

data(Boston)

9A

medv_mean = mean(Boston$medv)

medv_mean
## [1] 22.53281

9B

medv_error = sd(Boston$medv)/sqrt(length(Boston$medv))
medv_error
## [1] 0.4088611

9C

set.seed(1)

boot_fn = function(data, index) return(mean(data[index]))

boot(Boston$medv, boot_fn, 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot_fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 22.53281 0.007650791   0.4106622

The standard error in part B are similar.

9D

t.test(Boston$medv)
## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281
ci = c(medv_mean - 2 * 0.41067, medv_mean + 2 * 0.41067)
ci
## [1] 21.71147 23.35415

The estimates from my bootstrap and the t-test are similar. The confidence interval for the t-test is 21.73, 23.34 and the confidence interval using the estimate for the bootstrap is 21.72, 23.35.

9E

medv_median = median(Boston$medv)
medv_median
## [1] 21.2

9F

set.seed(1)

boot_fn_med = function(data, index) return(median(data[index]))

boot(Boston$medv, boot_fn_med, 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot_fn_med, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2 0.02295   0.3778075

The error for the median is small.

9G

medv_quant = quantile(Boston$medv, c(0.1))
medv_quant
##   10% 
## 12.75
set.seed(1)

boot_fn_quant = function(data, index) return(quantile(data[index], c(0.1)))

boot(Boston$medv, boot_fn_quant, 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot_fn_quant, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75  0.0339   0.4767526

The error from the bootstrap the 10th quantile is small.