Chapter 5

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(ggplot2)
library(ISLR2)
library(boot)

Question 3 K-cross Validation: Involves randomly dividing the set of observations into k-groups, or folds, of approximately equal size. The first fold is treated as a validation set with the method being fitted on remaining k-1 folds.

Validation: A general approach that can be applied to almost any statistical learning method. But validation approach can overestimate test error rate as training set contains only half of the observations.

LOOCV: There is more bias-reduction, but has higher variance than k-fold CV with k < n.

Question 5

summary(Default)

##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554

set.seed(1)
attach(Default)
# A
glm.reg = glm(default ~ income + balance, data = Default, family = "binomial")
# B
train <- sample(10000, 5000)
glm.fit = glm(default ~ income + balance, data = Default, family = "binomial" , subset = train)

pred.prob = predict(glm.fit, newdata = Default[-train, ], type = "response")
glm.prob = rep("No", length(pred.prob))
glm.prob[pred.prob > 0.5] = "Yes"

mean(glm.prob != Default[-train, ]$default)

## [1] 0.0254

#C

#D
train2 <- sample(10000, 5000)
glm.fit2 <- glm(default ~ income + balance + student, data = Default, family = "binomial", subset = train2)

glm.pred2 <- rep("No", length(pred.prob))
pred.prob2 <- predict(glm.fit2, newdata = Default[-train2, ], type = "response")
glm.pred2[pred.prob2 > 0.5] <- "Yes"

mean(glm.pred2 != Default[-train2, ]$default)

## [1] 0.0276

Question 6

#A
set.seed(1)
attach(Default)

## The following objects are masked from Default (pos = 3):
## 
##     balance, default, income, student

six.fit <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(six.fit)

## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

#B
boot.fn <-  function(data,index)
{ fit <-  glm(default ~ income + balance, data = data, family = "binomial", subset = index)
return(coef(fit))}

#C
boot(Default, boot.fn, 1000)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01 -3.945460e-02 4.344722e-01
## t2*  2.080898e-05  1.680317e-07 4.866284e-06
## t3*  5.647103e-03  1.855765e-05 2.298949e-04

#D: The estimated standard errors from the two functions had no significant difference between each other.

Question 9

attach(Boston)
#A
mu.hat <-  mean(medv)
mu.hat

## [1] 22.53281

#B
error.hat = sd(medv)/ sqrt(nrow(Boston))
error.hat

## [1] 0.4088611

#C
set.seed(1)
boot.fn <-  function(data,index)
  return(mean(data[index]))
boot(medv, boot.fn, 1000)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 22.53281 0.007650791   0.4106622

#The standard error for both computations are similar. 

#D
CI <- c(mu.hat - 2*0.4105522, mu.hat + 2*error.hat)
CI

## [1] 21.71170 23.35053

t.test(Boston$medv)

## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281

#E
median.medv = median(medv)
median.medv

## [1] 21.2

#F
set.seed(1)
boot.fn <-  function(data,index)
  return(median(data[index]))
boot(medv, boot.fn, 1000)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2 0.02295   0.3778075

#G
mu01 = quantile(medv, c(0.1))
mu01

##   10% 
## 12.75

#H
set.seed(1)
boot.fn <-  function(data,index)
{quantile(data[index], c(0.1))}
boot(medv, boot.fn, 1000)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75  0.0339   0.4767526

# The standard error came out to be 0.4767526 versus 0.3778075. Although the percentile are the same at a 12.75

Chapter 5

Renee Reyes

2023-03-15