library(ISLR)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ recipes 1.3.0
## ✔ dials 1.4.0 ✔ rsample 1.3.0
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.2 ✔ tidyr 1.3.1
## ✔ infer 1.0.8 ✔ tune 1.3.0
## ✔ modeldata 1.4.0 ✔ workflows 1.2.0
## ✔ parsnip 1.3.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.4 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(dplyr)
set.seed(1)
train_index <- sample(1:nrow(Auto), nrow(Auto) / 2)
# Fit a linear model using horsepower to predict mpg
lm_fit1 <- lm(mpg ~ horsepower, data = Auto, subset = train_index)
mean((Auto$mpg[-train_index] - predict(lm_fit1, Auto[-train_index, ]))^2)
## [1] 23.26601
# Fit a quadratic model
lm_fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto, subset = train_index)
mean((Auto$mpg[-train_index] - predict(lm_fit2, Auto[-train_index, ]))^2)
## [1] 18.71646
# Fit a cubic model
lm_fit3 <- lm(mpg ~ poly(horsepower, 3), data = Auto, subset = train_index)
mean((Auto$mpg[-train_index] - predict(lm_fit3, Auto[-train_index, ]))^2)
## [1] 18.79401
# Use tidymodels and boot::cv.glm
library(boot)
glm_fit <- glm(mpg ~ horsepower, data = Auto)
cv.glm(Auto, glm_fit)$delta[1]
## [1] 24.23151
# Try higher degree polynomials
cv_errors <- rep(0, 5)
for (d in 1:5) {
glm_fit <- glm(mpg ~ poly(horsepower, d), data = Auto)
cv_errors[d] <- cv.glm(Auto, glm_fit)$delta[1]
}
cv_errors
## [1] 24.23151 19.24821 19.33498 19.42443 19.03321
set.seed(17)
cv_errors_kfold <- rep(0, 10)
for (d in 1:10) {
glm_fit <- glm(mpg ~ poly(horsepower, d), data = Auto)
cv_errors_kfold[d] <- cv.glm(Auto, glm_fit, K = 10)$delta[1]
}
cv_errors_kfold
## [1] 24.27207 19.26909 19.34805 19.29496 19.03198 18.89781 19.12061 19.14666
## [9] 18.87013 20.95520
# Function to return alpha statistic
alpha_fn <- function(data, index) {
X <- data$X[index]
Y <- data$Y[index]
return((var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y)))
}
# Test function
alpha_fn(Portfolio, 1:100)
## [1] 0.5758321
# Bootstrap the alpha function
set.seed(1)
boot(Portfolio, alpha_fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Portfolio, statistic = alpha_fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.5758321 -0.001596422 0.09376093