Complete the following exercises from Introduction to Statistical Learning

Chapter 5: (5), (6), (7), and (9)

Lab 5

library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
library(boot)

data(Default)
default = as.data.frame(Default)
# Removing missing values (the Default data set has none)
# default <- na.omit(default)
summary(default)
##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554
dim(default)
## [1] 10000     4
seednum <- 140

Question 5

  1. Fit a logistic regression model that uses income and balance to predict default.
set.seed(seednum)
model = glm(default ~ income + balance, data = default, family = "binomial")
set.seed(seednum)
train <- sample(10000, 7000)
model <- glm(default ~ income + balance, data = default[train,], family = "binomial")

pred <- predict(model, newdata = default[-train,], type = "response")
pred_class <- ifelse(pred > 0.5, "Yes", "No")

val_err <- mean(default[-train,]$default != pred_class)
cat(c("Train Size: 7000 | Validation Error: ",val_err))
## Train Size: 7000 | Validation Error:  0.024
set.seed(seednum)
train <- sample(10000, 3000)
model <- glm(default ~ income + balance, data = default[train,], family = "binomial")

pred <- predict(model, newdata = default[-train,], type = "response")
pred_class <- ifelse(pred > 0.5, "Yes", "No")

val_err <- mean(default[-train,]$default != pred_class)
cat(c("Train Size: 3000 | Validation Error: ",val_err))
## Train Size: 3000 | Validation Error:  0.025
set.seed(seednum)
train <- sample(10000, 5000)
model <- glm(default ~ income + balance, data = default[train,], family = "binomial")

pred <- predict(model, newdata = default[-train,], type = "response")
pred_class <- ifelse(pred > 0.5, "Yes", "No")

val_err <- mean(default[-train,]$default != pred_class)
cat(c("Train Size: 5000 | Validation Error: ",val_err))
## Train Size: 5000 | Validation Error:  0.0236
set.seed(seednum)
train <- sample(10000, 9000)
model <- glm(default ~ income + balance, data = default[train,], family = "binomial")

pred <- predict(model, newdata = default[-train,], type = "response")
pred_class <- ifelse(pred > 0.5, "Yes", "No")

val_err <- mean(default[-train,]$default != pred_class)
cat(c("Train Size: 5000 | Validation Error: ",val_err))
## Train Size: 5000 | Validation Error:  0.021

The lowest validation error is obtained with the 9000/1000 training-test split.
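
The four chunks above differ only in the size of the random split; the comparison can be reproduced with a single loop over the candidate training sizes (a minimal sketch reusing seednum, default, and the model formula from above; no output shown).

# Validation-set error for several training-set sizes (sketch)
for (n_train in c(3000, 5000, 7000, 9000)) {
  set.seed(seednum)
  train <- sample(10000, n_train)
  fit <- glm(default ~ income + balance, data = default[train,], family = "binomial")
  pred <- predict(fit, newdata = default[-train,], type = "response")
  pred_class <- ifelse(pred > 0.5, "Yes", "No")
  cat("Train Size:", n_train, "| Validation Error:",
      mean(default[-train,]$default != pred_class), "\n")
}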

set.seed(seednum)
train <- sample(10000, 9000)
model <- glm(default ~ income + balance + student, data = default[train,], family = "binomial")

pred <- predict(model, newdata = default[-train,], type = "response")
pred_class <- ifelse(pred > 0.5, "Yes", "No")

val_err <- mean(default[-train,]$default != pred_class)
cat(c("Train Size: 9000 | Validation Error: ",val_err))
## Train Size: 9000 | Validation Error:  0.02

Adding the student variable reduced the validation error only slightly for this split (from 0.021 to 0.020).
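
Because a single validation split is noisy, a less split-dependent comparison of the two formulas could use k-fold cross-validation via cv.glm() from the already-loaded boot package with a misclassification cost. This is a sketch, not part of the validation-set approach used above; the number of folds (10) and the helper names cost, fit_base, and fit_student are illustrative choices.

# 10-fold CV misclassification error for the two candidate models (sketch, no output shown)
cost <- function(y, prob) mean(abs(y - prob) > 0.5)  # y is the 0/1 response, prob the predicted probability
set.seed(seednum)
fit_base <- glm(default ~ income + balance, data = default, family = "binomial")
cv.glm(default, fit_base, cost = cost, K = 10)$delta[1]
set.seed(seednum)
fit_student <- glm(default ~ income + balance + student, data = default, family = "binomial")
cv.glm(default, fit_student, cost = cost, K = 10)$delta[1]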

Question 6

set.seed(seednum)
model <- glm(default ~ income + balance, data = default, family = "binomial")
summary(model)$coefficients[,c("Estimate", "Std. Error")]
##                  Estimate   Std. Error
## (Intercept) -1.154047e+01 4.347564e-01
## income       2.080898e-05 4.985167e-06
## balance      5.647103e-03 2.273731e-04
set.seed(seednum)
boot_fn <- function(data, index) {
  modelfn <- glm(default ~ income + balance, data = data[index,], family = "binomial")
  return (coef(modelfn))
}

result1 <- boot(data = default, boot_fn, R=1000)
result1
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = default, statistic = boot_fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01 -6.371372e-02 4.433311e-01
## t2*  2.080898e-05  1.166079e-07 4.675269e-06
## t3*  5.647103e-03  3.500014e-05 2.324025e-04
  1. The bootstrap estimates and standard errors are close to those reported by glm() for all three coefficients. The bootstrap standard error for income is slightly lower than the formula-based estimate, while those for the intercept and balance are slightly higher, as the side-by-side sketch below shows.
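
To make the comparison concrete, the bootstrap standard errors can be taken column-wise from the replicate matrix result1$t and placed next to the glm() estimates (a minimal sketch reusing the model and result1 objects from above; boot_se and glm_se are illustrative names).

# Formula-based vs. bootstrap standard errors, side by side (sketch, no output shown)
boot_se <- apply(result1$t, 2, sd)
glm_se <- summary(model)$coefficients[, "Std. Error"]
round(cbind(glm_se, boot_se), 7)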

Question 7

data(Weekly)
weekly = as.data.frame(Weekly)
# Removing missing values
# weekly <- na.omit(weekly)
summary(weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
dim(weekly)
## [1] 1089    9
set.seed(seednum)
model <- glm(Direction ~ Lag1 + Lag2, data = weekly, family = "binomial")
set.seed(seednum)
dim(weekly[-1,])
## [1] 1088    9
model <- glm(Direction ~ Lag1 + Lag2, data = weekly[-1,], family = "binomial")
set.seed(seednum)
model <- glm(Direction ~ Lag1 + Lag2, data = weekly[-1,], family = "binomial")
pred <- predict(model, newdata = weekly[1,], type = "response")
true_val <- as.character(weekly$Direction[1])
pred_class <- ifelse(pred > 0.5, "Up", "Down")
cat(c("Pred: ",pred_class, " | True Val: ",true_val))
## Pred:  Up  | True Val:  Down

The prediction is incorrect.

n <- nrow(weekly)
loocv_err <- numeric(n)

for (i in 1:n) {
  model <- glm(Direction ~ Lag1 + Lag2, data = weekly[-i,], family = "binomial")
  pred <- predict(model, newdata = weekly[i,], type = "response")
  pred_class <- ifelse(pred > 0.5, "Up", "Down")
  loocv_err[i] <- as.numeric(weekly$Direction[i] != pred_class)
}

loocv_err_mean <- mean(loocv_err)
loocv_err_mean
## [1] 0.4499541

The LOOCV estimate of the test error is about 45%, so logistic regression with Lag1 and Lag2 predicts the weekly market direction only slightly better than random guessing.
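
The same estimate could also be obtained with cv.glm() from the already-loaded boot package, using a cost function that counts misclassifications at the 0.5 threshold (a minimal sketch; with the default K = n it refits the model n times, so it is slow, and the names cost and full_model are illustrative).

# LOOCV via boot::cv.glm with a misclassification cost (sketch, no output shown)
cost <- function(y, prob) mean(abs(y - prob) > 0.5)  # y is the 0/1 response, prob the predicted probability
full_model <- glm(Direction ~ Lag1 + Lag2, data = weekly, family = "binomial")
cv.glm(weekly, full_model, cost = cost)$delta[1]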

Question 9

data(Boston)
boston = as.data.frame(Boston)
summary(boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          lstat      
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
##  Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00
dim(boston)
## [1] 506  13
# mu hat: sample mean of medv
mean(Boston$medv)
## [1] 22.53281
data_sd <- sd(boston$medv)
data_n <- length(boston$medv)

data_se <- data_sd / sqrt(data_n)
data_se
## [1] 0.4088611
# Bootstrap estimate of the standard error of the sample mean
mu_hat <- function(data, indices) {
  sample_data <- data[indices, ]
  return(mean(sample_data$medv))
}

set.seed(seednum)
result <- boot(boston, statistic = mu_hat, R = 506)

se <- sd(result$t)
se
## [1] 0.4199194

The bootstrap standard error (0.420) is slightly higher than the formula-based estimate (0.409).
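
With the bootstrap standard error in hand, an approximate 95% confidence interval for the mean of medv can be formed as the estimate plus or minus two standard errors and compared against t.test() (a minimal sketch reusing the se value from above; mu is an illustrative name).

# Approximate 95% CI for mean(medv): estimate +/- 2 * bootstrap SE (sketch, no output shown)
mu <- mean(boston$medv)
c(mu - 2 * se, mu + 2 * se)
# t-based interval for comparison
t.test(boston$medv)$conf.int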

med <- median(Boston$medv)
med
## [1] 21.2
# Bootstrap estimate of the standard error of the sample median
med_hat <- function(data, indices) {
  sample_data <- data[indices, ]
  return(median(sample_data$medv))
}

set.seed(seednum)
result <- boot(boston, statistic = med_hat, R = 506)

se <- sd(result$t)
se
## [1] 0.3914139

The bootstrap standard error of the median (0.391) is close to, though slightly lower than, that of the mean (0.420). Taking the median plus or minus two bootstrap standard errors gives an approximate 95% confidence interval for the median of medv of [20.4171722, 21.9828278].

mu0_1 <- quantile(boston$medv, 0.1)
mu0_1
##   10% 
## 12.75
# Bootstrap estimate of the standard error of the tenth percentile
pct10_hat <- function(data, indices) {
  sample_data <- data[indices, ]
  return(quantile(sample_data$medv, 0.1))
}

set.seed(seednum)
result <- boot(boston, statistic = pct10_hat, R = 506)

se <- sd(result$t)
se
## [1] 0.5079199

The bootstrap standard error of the tenth percentile (0.508) is higher than those of the mean and median. Taking the estimate plus or minus two bootstrap standard errors gives an approximate 95% confidence interval for the tenth percentile of medv of [11.7341602, 13.7658398].