library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.8     ✔ recipes      1.3.0
## ✔ dials        1.4.0     ✔ rsample      1.3.0
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.2     ✔ tidyr        1.3.1
## ✔ infer        1.0.8     ✔ tune         1.3.0
## ✔ modeldata    1.4.0     ✔ workflows    1.2.0
## ✔ parsnip      1.3.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.4     ✔ yardstick    1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
library(ISLR)

Auto <- tibble(Auto)
Portfolio <- tibble(Portfolio)
set.seed(1)
Auto_split <- initial_split(Auto, strata = mpg, prop = 0.5)
Auto_split
## <Training/Testing/Total>
## <194/198/392>
Auto_train <- training(Auto_split)
Auto_test <- testing(Auto_split)
Auto_train
## # A tibble: 194 × 9
##      mpg cylinders displacement horsepower weight acceleration  year origin
##    <dbl>     <dbl>        <dbl>      <dbl>  <dbl>        <dbl> <dbl>  <dbl>
##  1    15         8          350        165   3693         11.5    70      1
##  2    16         8          304        150   3433         12      70      1
##  3    14         8          440        215   4312          8.5    70      1
##  4    14         8          455        225   4425         10      70      1
##  5    10         8          307        200   4376         15      70      1
##  6    17         6          250        100   3329         15.5    71      1
##  7    14         8          400        175   4464         11.5    71      1
##  8    14         8          351        153   4154         13.5    71      1
##  9    14         8          318        150   4096         13      71      1
## 10    13         8          400        170   4746         12      71      1
## # ℹ 184 more rows
## # ℹ 1 more variable: name <fct>
Auto_test
## # A tibble: 198 × 9
##      mpg cylinders displacement horsepower weight acceleration  year origin
##    <dbl>     <dbl>        <dbl>      <dbl>  <dbl>        <dbl> <dbl>  <dbl>
##  1    18         8          318        150   3436         11      70      1
##  2    17         8          302        140   3449         10.5    70      1
##  3    15         8          429        198   4341         10      70      1
##  4    14         8          454        220   4354          9      70      1
##  5    15         8          390        190   3850          8.5    70      1
##  6    15         8          383        170   3563         10      70      1
##  7    14         8          340        160   3609          8      70      1
##  8    15         8          400        150   3761          9.5    70      1
##  9    14         8          455        225   3086         10      70      1
## 10    22         6          198         95   2833         15.5    70      1
## # ℹ 188 more rows
## # ℹ 1 more variable: name <fct>
lm_spec <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine("lm")
lm_fit <- lm_spec %>% 
  fit(mpg ~ horsepower, data = Auto_train)
augment(lm_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        5.06
augment(lm_fit, new_data = Auto_train) %>%
  rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.74
poly_rec <- recipe(mpg ~ horsepower, data = Auto_train) %>%
  step_poly(horsepower, degree = 2)

poly_wf <- workflow() %>%
  add_recipe(poly_rec) %>%
  add_model(lm_spec)

poly_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_poly()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm
poly_fit <- fit(poly_wf, data = Auto_train)
augment(poly_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.37
set.seed(2)
Auto_split <- initial_split(Auto)

Auto_train <- training(Auto_split)
Auto_test <- testing(Auto_split)

poly_fit <- fit(poly_wf, data = Auto_train)

augment(poly_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.35
poly_tuned_rec <- recipe(mpg ~ horsepower, data = Auto_train) %>%
  step_poly(horsepower, degree = tune())

poly_tuned_wf <- workflow() %>%
  add_recipe(poly_tuned_rec) %>%
  add_model(lm_spec)
poly_res <- tune_grid(
  poly_tuned_wf,
  resamples = vfold_cv(Auto_train, v = 5),
  grid = grid_regular(degree(range = c(1, 5)))
)
Auto_folds <- vfold_cv(Auto_train, v = 10)
Auto_folds
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [264/30]> Fold01
##  2 <split [264/30]> Fold02
##  3 <split [264/30]> Fold03
##  4 <split [264/30]> Fold04
##  5 <split [265/29]> Fold05
##  6 <split [265/29]> Fold06
##  7 <split [265/29]> Fold07
##  8 <split [265/29]> Fold08
##  9 <split [265/29]> Fold09
## 10 <split [265/29]> Fold10
degree_grid <- grid_regular(degree(range = c(1, 10)), levels = 10)
degree_grid <- tibble(degree = seq(1, 10))
tune_res <- tune_grid(
  object = poly_tuned_wf, 
  resamples = Auto_folds, 
  grid = degree_grid
)
autoplot(tune_res)

collect_metrics(tune_res)
## # A tibble: 20 × 7
##    degree .metric .estimator  mean     n std_err .config              
##     <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
##  1      1 rmse    standard   4.81     10  0.178  Preprocessor01_Model1
##  2      1 rsq     standard   0.632    10  0.0267 Preprocessor01_Model1
##  3      2 rmse    standard   4.34     10  0.242  Preprocessor02_Model1
##  4      2 rsq     standard   0.695    10  0.0348 Preprocessor02_Model1
##  5      3 rmse    standard   4.34     10  0.244  Preprocessor03_Model1
##  6      3 rsq     standard   0.694    10  0.0350 Preprocessor03_Model1
##  7      4 rmse    standard   4.39     10  0.253  Preprocessor04_Model1
##  8      4 rsq     standard   0.689    10  0.0348 Preprocessor04_Model1
##  9      5 rmse    standard   4.37     10  0.280  Preprocessor05_Model1
## 10      5 rsq     standard   0.694    10  0.0365 Preprocessor05_Model1
## 11      6 rmse    standard   4.35     10  0.276  Preprocessor06_Model1
## 12      6 rsq     standard   0.698    10  0.0357 Preprocessor06_Model1
## 13      7 rmse    standard   4.32     10  0.289  Preprocessor07_Model1
## 14      7 rsq     standard   0.702    10  0.0362 Preprocessor07_Model1
## 15      8 rmse    standard   4.33     10  0.292  Preprocessor08_Model1
## 16      8 rsq     standard   0.701    10  0.0364 Preprocessor08_Model1
## 17      9 rmse    standard   4.34     10  0.294  Preprocessor09_Model1
## 18      9 rsq     standard   0.699    10  0.0367 Preprocessor09_Model1
## 19     10 rmse    standard   4.34     10  0.290  Preprocessor10_Model1
## 20     10 rsq     standard   0.698    10  0.0366 Preprocessor10_Model1
show_best(tune_res, metric = "rmse")
## # A tibble: 5 × 7
##   degree .metric .estimator  mean     n std_err .config              
##    <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1      7 rmse    standard    4.32    10   0.289 Preprocessor07_Model1
## 2      8 rmse    standard    4.33    10   0.292 Preprocessor08_Model1
## 3      2 rmse    standard    4.34    10   0.242 Preprocessor02_Model1
## 4     10 rmse    standard    4.34    10   0.290 Preprocessor10_Model1
## 5      9 rmse    standard    4.34    10   0.294 Preprocessor09_Model1
select_by_one_std_err(tune_res, degree, metric = "rmse")
## # A tibble: 1 × 2
##   degree .config              
##    <int> <chr>                
## 1      2 Preprocessor02_Model1
best_degree <- select_by_one_std_err(tune_res, degree, metric = "rmse")
final_wf <- finalize_workflow(poly_wf, best_degree)

final_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_poly()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm
final_fit <- fit(final_wf, Auto_train)

final_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_poly()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
##       (Intercept)  horsepower_poly_1  horsepower_poly_2  
##             23.34            -104.85              34.39
Portfolio_boots <- bootstraps(Portfolio, times = 1000)
Portfolio_boots
## # Bootstrap sampling 
## # A tibble: 1,000 × 2
##    splits           id           
##    <list>           <chr>        
##  1 <split [100/38]> Bootstrap0001
##  2 <split [100/41]> Bootstrap0002
##  3 <split [100/34]> Bootstrap0003
##  4 <split [100/33]> Bootstrap0004
##  5 <split [100/34]> Bootstrap0005
##  6 <split [100/36]> Bootstrap0006
##  7 <split [100/34]> Bootstrap0007
##  8 <split [100/40]> Bootstrap0008
##  9 <split [100/34]> Bootstrap0009
## 10 <split [100/38]> Bootstrap0010
## # ℹ 990 more rows
alpha.fn <- function(split) {
  data <- analysis(split)
  X <- data$X
  Y <- data$Y
  
  (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))
}
alpha_res <- Portfolio_boots %>%
  mutate(alpha = map_dbl(splits, alpha.fn))

alpha_res
## # Bootstrap sampling 
## # A tibble: 1,000 × 3
##    splits           id            alpha
##    <list>           <chr>         <dbl>
##  1 <split [100/38]> Bootstrap0001 0.496
##  2 <split [100/41]> Bootstrap0002 0.448
##  3 <split [100/34]> Bootstrap0003 0.538
##  4 <split [100/33]> Bootstrap0004 0.808
##  5 <split [100/34]> Bootstrap0005 0.456
##  6 <split [100/36]> Bootstrap0006 0.659
##  7 <split [100/34]> Bootstrap0007 0.664
##  8 <split [100/40]> Bootstrap0008 0.722
##  9 <split [100/34]> Bootstrap0009 0.526
## 10 <split [100/38]> Bootstrap0010 0.552
## # ℹ 990 more rows
Auto_boots <- bootstraps(Auto)

boot.fn <- function(split) {
  lm_fit <- lm_spec %>% fit(mpg ~ horsepower, data = analysis(split))
  tidy(lm_fit)
}
boot_res <- Auto_boots %>%
  mutate(models = map(splits, boot.fn))
boot_res %>%
  unnest(cols = c(models)) %>%
  group_by(term) %>%
  summarise(mean = mean(estimate),
            sd = sd(estimate))
## # A tibble: 2 × 3
##   term          mean      sd
##   <chr>        <dbl>   <dbl>
## 1 (Intercept) 39.9   0.836  
## 2 horsepower  -0.158 0.00721