library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ recipes 1.0.10
## ✔ dials 1.2.1 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.3.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(ISLR)
Auto <- tibble(Auto)
Portfolio <- tibble(Portfolio)
set.seed(1)
Auto_split <- initial_split(Auto, strata = mpg, prop = 0.5)
Auto_split
## <Training/Testing/Total>
## <194/198/392>
Auto_train <- training(Auto_split)
Auto_test <- testing(Auto_split)
Auto_train
## # A tibble: 194 × 9
## mpg cylinders displacement horsepower weight acceleration year origin
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 15 8 350 165 3693 11.5 70 1
## 2 16 8 304 150 3433 12 70 1
## 3 14 8 440 215 4312 8.5 70 1
## 4 14 8 455 225 4425 10 70 1
## 5 10 8 307 200 4376 15 70 1
## 6 17 6 250 100 3329 15.5 71 1
## 7 14 8 400 175 4464 11.5 71 1
## 8 14 8 351 153 4154 13.5 71 1
## 9 14 8 318 150 4096 13 71 1
## 10 13 8 400 170 4746 12 71 1
## # ℹ 184 more rows
## # ℹ 1 more variable: name <fct>
Auto_test
## # A tibble: 198 × 9
## mpg cylinders displacement horsepower weight acceleration year origin
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 18 8 318 150 3436 11 70 1
## 2 17 8 302 140 3449 10.5 70 1
## 3 15 8 429 198 4341 10 70 1
## 4 14 8 454 220 4354 9 70 1
## 5 15 8 390 190 3850 8.5 70 1
## 6 15 8 383 170 3563 10 70 1
## 7 14 8 340 160 3609 8 70 1
## 8 15 8 400 150 3761 9.5 70 1
## 9 14 8 455 225 3086 10 70 1
## 10 22 6 198 95 2833 15.5 70 1
## # ℹ 188 more rows
## # ℹ 1 more variable: name <fct>
lm_spec <- linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")
lm_fit <- lm_spec %>%
fit(mpg ~ horsepower, data = Auto_train)
augment(lm_fit, new_data = Auto_test) %>%
rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 5.06
augment(lm_fit, new_data = Auto_train) %>%
rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.74
poly_rec <- recipe(mpg ~ horsepower, data = Auto_train) %>%
step_poly(horsepower, degree = 2)
poly_wf <- workflow() %>%
add_recipe(poly_rec) %>%
add_model(lm_spec)
poly_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_poly()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
poly_fit <- fit(poly_wf, data = Auto_train)
augment(poly_fit, new_data = Auto_test) %>%
rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.37
set.seed(2)
Auto_split <- initial_split(Auto)
Auto_train <- training(Auto_split)
Auto_test <- testing(Auto_split)
poly_fit <- fit(poly_wf, data = Auto_train)
augment(poly_fit, new_data = Auto_test) %>%
rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.35
poly_tuned_rec <- recipe(mpg ~ horsepower, data = Auto_train) %>%
step_poly(horsepower, degree = tune())
poly_tuned_wf <- workflow() %>%
add_recipe(poly_tuned_rec) %>%
add_model(lm_spec)
Auto_folds <- vfold_cv(Auto_train, v = 10)
Auto_folds
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [264/30]> Fold01
## 2 <split [264/30]> Fold02
## 3 <split [264/30]> Fold03
## 4 <split [264/30]> Fold04
## 5 <split [265/29]> Fold05
## 6 <split [265/29]> Fold06
## 7 <split [265/29]> Fold07
## 8 <split [265/29]> Fold08
## 9 <split [265/29]> Fold09
## 10 <split [265/29]> Fold10
degree_grid <- grid_regular(degree(range = c(1, 10)), levels = 10)
tune_res <- tune_grid(
object = poly_tuned_wf,
resamples = Auto_folds,
grid = degree_grid
)
autoplot(tune_res)

collect_metrics(tune_res)
## # A tibble: 20 × 7
## degree .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 rmse standard 4.81 10 0.172 Preprocessor01_Model1
## 2 1 rsq standard 0.621 10 0.0316 Preprocessor01_Model1
## 3 2 rmse standard 4.37 10 0.209 Preprocessor02_Model1
## 4 2 rsq standard 0.677 10 0.0436 Preprocessor02_Model1
## 5 3 rmse standard 4.40 10 0.217 Preprocessor03_Model1
## 6 3 rsq standard 0.675 10 0.0446 Preprocessor03_Model1
## 7 4 rmse standard 4.43 10 0.218 Preprocessor04_Model1
## 8 4 rsq standard 0.670 10 0.0453 Preprocessor04_Model1
## 9 5 rmse standard 4.42 10 0.203 Preprocessor05_Model1
## 10 5 rsq standard 0.674 10 0.0436 Preprocessor05_Model1
## 11 6 rmse standard 4.41 10 0.189 Preprocessor06_Model1
## 12 6 rsq standard 0.670 10 0.0423 Preprocessor06_Model1
## 13 7 rmse standard 4.40 10 0.176 Preprocessor07_Model1
## 14 7 rsq standard 0.670 10 0.0420 Preprocessor07_Model1
## 15 8 rmse standard 4.41 10 0.175 Preprocessor08_Model1
## 16 8 rsq standard 0.670 10 0.0420 Preprocessor08_Model1
## 17 9 rmse standard 4.47 10 0.207 Preprocessor09_Model1
## 18 9 rsq standard 0.663 10 0.0445 Preprocessor09_Model1
## 19 10 rmse standard 4.50 10 0.227 Preprocessor10_Model1
## 20 10 rsq standard 0.658 10 0.0465 Preprocessor10_Model1
show_best(tune_res, metric = "rmse")
## # A tibble: 5 × 7
## degree .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 rmse standard 4.37 10 0.209 Preprocessor02_Model1
## 2 3 rmse standard 4.40 10 0.217 Preprocessor03_Model1
## 3 7 rmse standard 4.40 10 0.176 Preprocessor07_Model1
## 4 6 rmse standard 4.41 10 0.189 Preprocessor06_Model1
## 5 8 rmse standard 4.41 10 0.175 Preprocessor08_Model1
select_by_one_std_err(tune_res, degree, metric = "rmse")
## # A tibble: 1 × 2
## degree .config
## <dbl> <chr>
## 1 2 Preprocessor02_Model1
best_degree <- select_by_one_std_err(tune_res, degree, metric = "rmse")
final_wf <- finalize_workflow(poly_wf, best_degree)
final_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_poly()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
final_fit <- fit(final_wf, Auto_train)
final_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_poly()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Coefficients:
## (Intercept) horsepower_poly_1 horsepower_poly_2
## 23.34 -104.85 34.39
Portfolio_boots <- bootstraps(Portfolio, times = 1000)
Portfolio_boots
## # Bootstrap sampling
## # A tibble: 1,000 × 2
## splits id
## <list> <chr>
## 1 <split [100/36]> Bootstrap0001
## 2 <split [100/39]> Bootstrap0002
## 3 <split [100/39]> Bootstrap0003
## 4 <split [100/33]> Bootstrap0004
## 5 <split [100/39]> Bootstrap0005
## 6 <split [100/34]> Bootstrap0006
## 7 <split [100/40]> Bootstrap0007
## 8 <split [100/38]> Bootstrap0008
## 9 <split [100/36]> Bootstrap0009
## 10 <split [100/41]> Bootstrap0010
## # ℹ 990 more rows
alpha.fn <- function(split) {
data <- analysis(split)
X <- data$X
Y <- data$Y
(var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))
}
alpha_res <- Portfolio_boots %>%
mutate(alpha = map_dbl(splits, alpha.fn))
alpha_res
## # Bootstrap sampling
## # A tibble: 1,000 × 3
## splits id alpha
## <list> <chr> <dbl>
## 1 <split [100/36]> Bootstrap0001 0.516
## 2 <split [100/39]> Bootstrap0002 0.687
## 3 <split [100/39]> Bootstrap0003 0.599
## 4 <split [100/33]> Bootstrap0004 0.556
## 5 <split [100/39]> Bootstrap0005 0.549
## 6 <split [100/34]> Bootstrap0006 0.619
## 7 <split [100/40]> Bootstrap0007 0.387
## 8 <split [100/38]> Bootstrap0008 0.675
## 9 <split [100/36]> Bootstrap0009 0.538
## 10 <split [100/41]> Bootstrap0010 0.407
## # ℹ 990 more rows
Auto_boots <- bootstraps(Auto)
boot.fn <- function(split) {
lm_fit <- lm_spec %>% fit(mpg ~ horsepower, data = analysis(split))
tidy(lm_fit)
}
boot_res <- Auto_boots %>%
mutate(models = map(splits, boot.fn))
boot_res %>%
unnest(cols = c(models)) %>%
group_by(term) %>%
summarise(mean = mean(estimate),
sd = sd(estimate))
## # A tibble: 2 × 3
## term mean sd
## <chr> <dbl> <dbl>
## 1 (Intercept) 39.8 0.759
## 2 horsepower -0.156 0.00593