library(ISLR2)

summary(Default)
##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554

5.)

a.)

glm_fitted <- glm(default ~ income + balance,
                  data = Default,
                  family = binomial)

summary(glm_fitted)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

b.)

set.seed(1)

# Split into training and validation set
training_set <- sample(nrow(Default), nrow(Default)/2)

# Fit Model to Training Set
glm_fitted1 <- glm(
  default ~ income + balance,
  data = Default,
  family = binomial,
  subset = training_set
) 

# Predict on Validation Set
validated <- setdiff(seq_len(nrow(Default)), training_set)
glm_prob1 <- predict(glm_fitted1, Default[validated, ], type = "response")

# Classify using threshold = 0.5
glm_pred1 <- ifelse(glm_prob1 > 0.5, "Yes", "No")

# Compute validation error
default_valid <- Default$default[validated]
val_error1 <- mean(glm_pred1 != default_valid)
val_error1
val_error1
## [1] 0.0254
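
For reference, a confusion table breaks this error down by class; a quick sketch using base table() on the objects defined above (no output shown):

# Cross-tabulate predicted vs. observed classes on the validation set
table(Predicted = glm_pred1, Observed = default_valid)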

c.)

We will likely see very similar error rates each time. This suggests that, for this data set, the validation set approach is fairly stable: the sample is large, and the relationship between balance and default is strong.

# Store the results in a vector
val_errors <- numeric(3)

for (i in 2:4) {
  set.seed(i)

  # New split
  training_set_i <- sample(nrow(Default), nrow(Default) / 2)
  validated_i <- setdiff(seq_len(nrow(Default)), training_set_i)

  # Fit model to training set
  glm_fitted_i <- glm(default ~ income + balance,
                      data = Default,
                      family = binomial,
                      subset = training_set_i)

  # Predict on validation set
  glm_prob_i <- predict(glm_fitted_i, Default[validated_i, ], type = "response")
  glm_pred_i <- ifelse(glm_prob_i > 0.5, "Yes", "No")

  # Compute validation error
  default_valid_i <- Default$default[validated_i]
  val_error_i <- mean(glm_pred_i != default_valid_i)

  val_errors[i - 1] <- val_error_i
  cat("Seed:", i, "- Validation Error:", val_error_i, "\n")
}
## Seed: 2 - Validation Error: 0.0238 
## Seed: 3 - Validation Error: 0.0264 
## Seed: 4 - Validation Error: 0.0256
# Print results
val_errors
## [1] 0.0238 0.0264 0.0256

d.)

It is slightly higher but still falls within the 2-3% range. There is very little change when the student variable is added. Balance appears to dominate in predicting default, so including student does not meaningfully improve the error rate.

set.seed(10)  

# Split again
training_set2 <- sample(nrow(Default), nrow(Default) / 2)
validated2 <- setdiff(seq_len(nrow(Default)), training_set2)

# Fit logistic model with student included
glm_fitted2 <- glm(default ~ income + balance + student,
                   data = Default,
                   family = binomial,
                   subset = training_set2)

# Predict on validation set
glm_prob2 <- predict(glm_fitted2, Default[validated2, ], type = "response")
glm_pred2 <- ifelse(glm_prob2 > 0.5, "Yes", "No")

# Compute validation error
default_valid2 <- Default$default[validated2]
val_error2 <- mean(glm_pred2 != default_valid2)
val_error2
## [1] 0.0286
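
Note that part (d) uses a fresh seed, so part of the difference from (b) and (c) could come from the new split rather than from student. To isolate the effect of student, the two-predictor model can be refit on the same split (a sketch reusing the objects above; no output shown):

# Refit income + balance on the identical split for a fair comparison
glm_fitted2b <- glm(default ~ income + balance,
                    data = Default,
                    family = binomial,
                    subset = training_set2)
glm_pred2b <- ifelse(predict(glm_fitted2b, Default[validated2, ],
                             type = "response") > 0.5, "Yes", "No")
mean(glm_pred2b != default_valid2)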

6.)

a.)

data("Default")

set.seed(1)
glm.fit <- glm(default ~ income + balance, data = Default, family = binomial)

summary(glm.fit)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

b.)

boot.fn <- function(data, index) {
  
  fit <- glm(default ~ income + balance, data = data[index, ], family = binomial)
  return(coef(fit))
}

boot.fn(Default, 1:nrow(Default))
##   (Intercept)        income       balance 
## -1.154047e+01  2.080898e-05  5.647103e-03
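
Before handing boot.fn() to boot(), it can be exercised on a single hand-drawn bootstrap sample (a sketch; the seed is arbitrary, no output shown):

# One bootstrap resample: row indices drawn with replacement
set.seed(2)
boot.fn(Default, sample(nrow(Default), nrow(Default), replace = TRUE))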

c.)

library(boot)

# We will do 1000
boot.out <- boot(data = Default, statistic = boot.fn, R = 1000)

boot.out
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01 -3.945460e-02 4.344722e-01
## t2*  2.080898e-05  1.680317e-07 4.866284e-06
## t3*  5.647103e-03  1.855765e-05 2.298949e-04
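
To make the comparison in part (d) concrete, the two sets of standard errors can be placed side by side (a short sketch using objects defined above; no output shown):

# Formula-based SEs from glm() vs. empirical SEs of the bootstrap replicates
summary(glm.fit)$coefficients[, "Std. Error"]
apply(boot.out$t, 2, sd)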

d.)

The coefficient estimates from the bootstrap match those from glm exactly, since both are computed on the same data. The standard errors are also very similar: the bootstrap SEs for the intercept and income are slightly lower than the formula-based ones, while the SE for balance is slightly higher.

7.)

a.)

Lag_GLM <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = "binomial")

summary(Lag_GLM)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.22122    0.06147   3.599 0.000319 ***
## Lag1        -0.03872    0.02622  -1.477 0.139672    
## Lag2         0.06025    0.02655   2.270 0.023232 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1488.2  on 1086  degrees of freedom
## AIC: 1494.2
## 
## Number of Fisher Scoring iterations: 4

b.)

Weekly_1 <- Weekly[-1,]

Lag_GLM_minus_one <- glm(Direction ~ Lag1 + Lag2, data = Weekly_1, family = "binomial")

summary(Lag_GLM_minus_one)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly_1)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.22324    0.06150   3.630 0.000283 ***
## Lag1        -0.03843    0.02622  -1.466 0.142683    
## Lag2         0.06085    0.02656   2.291 0.021971 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1494.6  on 1087  degrees of freedom
## Residual deviance: 1486.5  on 1085  degrees of freedom
## AIC: 1492.5
## 
## Number of Fisher Scoring iterations: 4

c.)

prediction1 <- ifelse(predict(Lag_GLM_minus_one, newdata = Weekly[1,], type = "response") > 0.5, "Up", "Down")
prediction1 == Weekly[1,]$Direction
## [1] FALSE

Interpretation: The held-out first observation is misclassified: the model predicted Up, but the market actually went Down.

d.)

error <- numeric(nrow(Weekly))
for (i in 1:nrow(Weekly)) {
  trainDat <- Weekly[-i, ]
  modFit <- glm(Direction ~ Lag1 + Lag2, data = trainDat, family = "binomial")
  pred <- ifelse(predict(modFit, newdata = Weekly[i, ], type = "response") > 0.5,
                 "Up", "Down")
  if (pred != Weekly[i, ]$Direction) {
    error[i] <- 1
  }
}

e.)

The LOOCV error is reported below. It estimates that we would misclassify a week's direction (predicting Up when the market goes Down, or Down when it goes Up) about 45 percent of the time.

mean(error)
## [1] 0.4499541
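
As a cross-check, cv.glm() from the already-loaded boot package computes the same LOOCV estimate; for a classification error rate it needs a misclassification cost function, as in its documentation (a sketch; refitting all 1089 models takes a moment, no output shown):

# Cost: fraction of cases where the 0/1 response disagrees with prob > 0.5
cost <- function(r, pi = 0) mean(abs(r - pi) > 0.5)
cv.glm(Weekly, Lag_GLM, cost)$delta[1]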

9.)

a.)

medvMean <- mean(Boston$medv)
medvMean
## [1] 22.53281

b.)

sdMedv <- sd(Boston$medv)
SEmedvMean <- sdMedv / sqrt(nrow(Boston))
SEmedvMean
## [1] 0.4088611

-Interpretation: This means that if we repeatedly drew samples of size 506, the sample means would typically deviate from the true mean by about 0.409 (about $409, since medv is recorded in thousands of dollars). This small SE indicates that the sample mean is a stable and precise estimator.

c.)

library(TeachingDemos)

set.seed(char2seed("Spencer"))
meanBootMedv <- function(data, index) {
  mean(data[index, ]$medv)
}
bootmedv <- boot(Boston, meanBootMedv, R = 1000)
SEmedvMeanBS <- sd(bootmedv$t)
SEmedvMeanBS
## [1] 0.4027572

-Interpretation: The bootstrap estimate (0.403) is very close to the formula-based value from part (b) (0.409). This close agreement increases our confidence in the standard error estimate.

d.)

CI <- c(mean(Boston$medv) - 2 * SEmedvMeanBS, mean(Boston$medv) + 2 * SEmedvMeanBS)
CI
## [1] 21.72729 23.33832
t.test(Boston$medv)$conf.int
## [1] 21.72953 23.33608
## attr(,"conf.level")
## [1] 0.95

-Interpretation: Both methods yield nearly identical results. The slight variation is expected given the large sample size, where the Normal approximation is quite accurate.
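
Alternatively, boot.ci() can construct an interval directly from the stored bootstrap replicates; a percentile interval is a natural choice here (a sketch; no output shown):

# 95% percentile bootstrap CI for the mean of medv
boot.ci(bootmedv, type = "perc")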

e.)

medianMedvEst <- median(Boston$medv)
medianMedvEst
## [1] 21.2

f.)

set.seed(char2seed("Spencer"))
bootMed <- function(data, index) {
  median(data[index, ]$medv)
}
bootMedianMedv <- boot(Boston, bootMed, R = 1000)
SEMedianMedvBS <- sd(bootMedianMedv$t)
SEMedianMedvBS
## [1] 0.3671808

Interpretation: This implies that across repeated samples, the sample median would typically vary by about 0.367 (about $367). This quantifies the uncertainty around our median estimate.

g.)

TenPctMedvEst <- quantile(Boston$medv, probs = 0.1)
TenPctMedvEst
##   10% 
## 12.75

-Interpretation: This value represents the threshold below which 10% of the Boston suburbs’ median home values fall.

h.)

set.seed(char2seed("Spencer"))
bootTen <- function(data, index) {
  quantile(data[index, ]$medv, probs = 0.1)
}
bootTenMedvBS <- boot(Boston, bootTen, R = 1000)
SETenPctMedvBS <- sd(bootTenMedvBS$t)
SETenPctMedvBS
## [1] 0.4944738

-Interpretation: This means that when repeatedly sampling 506 Boston suburbs, the estimated tenth percentile would typically fluctuate by roughly 0.494. It gives a measure of uncertainty associated with this percentile estimate.