in class

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.5      ✔ recipes      1.0.10
## ✔ dials        1.2.1      ✔ rsample      1.2.1 
## ✔ dplyr        1.1.4      ✔ tibble       3.2.1 
## ✔ ggplot2      3.5.1      ✔ tidyr        1.3.1 
## ✔ infer        1.0.7      ✔ tune         1.2.1 
## ✔ modeldata    1.3.0      ✔ workflows    1.1.4 
## ✔ parsnip      1.2.1      ✔ workflowsets 1.1.0 
## ✔ purrr        1.0.2      ✔ yardstick    1.3.1

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

library(ISLR)

Auto <- tibble(Auto)
Portfolio <- tibble(Portfolio)
set.seed(1)
Auto_split <- initial_split(Auto, strata = mpg, prop = 0.5)
Auto_split

## <Training/Testing/Total>
## <194/198/392>

Auto_train <- training(Auto_split)
Auto_test <- testing(Auto_split)
Auto_train

## # A tibble: 194 × 9
##      mpg cylinders displacement horsepower weight acceleration  year origin
##    <dbl>     <dbl>        <dbl>      <dbl>  <dbl>        <dbl> <dbl>  <dbl>
##  1    15         8          350        165   3693         11.5    70      1
##  2    16         8          304        150   3433         12      70      1
##  3    14         8          440        215   4312          8.5    70      1
##  4    14         8          455        225   4425         10      70      1
##  5    10         8          307        200   4376         15      70      1
##  6    17         6          250        100   3329         15.5    71      1
##  7    14         8          400        175   4464         11.5    71      1
##  8    14         8          351        153   4154         13.5    71      1
##  9    14         8          318        150   4096         13      71      1
## 10    13         8          400        170   4746         12      71      1
## # ℹ 184 more rows
## # ℹ 1 more variable: name <fct>

Auto_test

## # A tibble: 198 × 9
##      mpg cylinders displacement horsepower weight acceleration  year origin
##    <dbl>     <dbl>        <dbl>      <dbl>  <dbl>        <dbl> <dbl>  <dbl>
##  1    18         8          318        150   3436         11      70      1
##  2    17         8          302        140   3449         10.5    70      1
##  3    15         8          429        198   4341         10      70      1
##  4    14         8          454        220   4354          9      70      1
##  5    15         8          390        190   3850          8.5    70      1
##  6    15         8          383        170   3563         10      70      1
##  7    14         8          340        160   3609          8      70      1
##  8    15         8          400        150   3761          9.5    70      1
##  9    14         8          455        225   3086         10      70      1
## 10    22         6          198         95   2833         15.5    70      1
## # ℹ 188 more rows
## # ℹ 1 more variable: name <fct>

lm_spec <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine("lm")
lm_fit <- lm_spec %>% 
  fit(mpg ~ horsepower, data = Auto_train)
augment(lm_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        5.06

augment(lm_fit, new_data = Auto_train) %>%
  rmse(truth = mpg, estimate = .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.74

poly_rec <- recipe(mpg ~ horsepower, data = Auto_train) %>%
  step_poly(horsepower, degree = 2)

poly_wf <- workflow() %>%
  add_recipe(poly_rec) %>%
  add_model(lm_spec)

poly_wf

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_poly()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm

poly_fit <- fit(poly_wf, data = Auto_train)
augment(poly_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.37

set.seed(2)
Auto_split <- initial_split(Auto)

Auto_train <- training(Auto_split)
Auto_test <- testing(Auto_split)

poly_fit <- fit(poly_wf, data = Auto_train)

augment(poly_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.35

poly_tuned_rec <- recipe(mpg ~ horsepower, data = Auto_train) %>%
  step_poly(horsepower, degree = tune())

poly_tuned_wf <- workflow() %>%
  add_recipe(poly_tuned_rec) %>%
  add_model(lm_spec)

Auto_folds <- vfold_cv(Auto_train, v = 10)
Auto_folds

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [264/30]> Fold01
##  2 <split [264/30]> Fold02
##  3 <split [264/30]> Fold03
##  4 <split [264/30]> Fold04
##  5 <split [265/29]> Fold05
##  6 <split [265/29]> Fold06
##  7 <split [265/29]> Fold07
##  8 <split [265/29]> Fold08
##  9 <split [265/29]> Fold09
## 10 <split [265/29]> Fold10

degree_grid <- grid_regular(degree(range = c(1, 10)), levels = 10)
tune_res <- tune_grid(
  object = poly_tuned_wf, 
  resamples = Auto_folds, 
  grid = degree_grid
)
autoplot(tune_res)

collect_metrics(tune_res)

## # A tibble: 20 × 7
##    degree .metric .estimator  mean     n std_err .config              
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
##  1      1 rmse    standard   4.81     10  0.172  Preprocessor01_Model1
##  2      1 rsq     standard   0.621    10  0.0316 Preprocessor01_Model1
##  3      2 rmse    standard   4.37     10  0.209  Preprocessor02_Model1
##  4      2 rsq     standard   0.677    10  0.0436 Preprocessor02_Model1
##  5      3 rmse    standard   4.40     10  0.217  Preprocessor03_Model1
##  6      3 rsq     standard   0.675    10  0.0446 Preprocessor03_Model1
##  7      4 rmse    standard   4.43     10  0.218  Preprocessor04_Model1
##  8      4 rsq     standard   0.670    10  0.0453 Preprocessor04_Model1
##  9      5 rmse    standard   4.42     10  0.203  Preprocessor05_Model1
## 10      5 rsq     standard   0.674    10  0.0436 Preprocessor05_Model1
## 11      6 rmse    standard   4.41     10  0.189  Preprocessor06_Model1
## 12      6 rsq     standard   0.670    10  0.0423 Preprocessor06_Model1
## 13      7 rmse    standard   4.40     10  0.176  Preprocessor07_Model1
## 14      7 rsq     standard   0.670    10  0.0420 Preprocessor07_Model1
## 15      8 rmse    standard   4.41     10  0.175  Preprocessor08_Model1
## 16      8 rsq     standard   0.670    10  0.0420 Preprocessor08_Model1
## 17      9 rmse    standard   4.47     10  0.207  Preprocessor09_Model1
## 18      9 rsq     standard   0.663    10  0.0445 Preprocessor09_Model1
## 19     10 rmse    standard   4.50     10  0.227  Preprocessor10_Model1
## 20     10 rsq     standard   0.658    10  0.0465 Preprocessor10_Model1

show_best(tune_res, metric = "rmse")

## # A tibble: 5 × 7
##   degree .metric .estimator  mean     n std_err .config              
##    <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1      2 rmse    standard    4.37    10   0.209 Preprocessor02_Model1
## 2      3 rmse    standard    4.40    10   0.217 Preprocessor03_Model1
## 3      7 rmse    standard    4.40    10   0.176 Preprocessor07_Model1
## 4      6 rmse    standard    4.41    10   0.189 Preprocessor06_Model1
## 5      8 rmse    standard    4.41    10   0.175 Preprocessor08_Model1

select_by_one_std_err(tune_res, degree, metric = "rmse")

## # A tibble: 1 × 2
##   degree .config              
##    <dbl> <chr>                
## 1      2 Preprocessor02_Model1

best_degree <- select_by_one_std_err(tune_res, degree, metric = "rmse")
final_wf <- finalize_workflow(poly_wf, best_degree)

final_wf

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_poly()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm

final_fit <- fit(final_wf, Auto_train)

final_fit

## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_poly()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
##       (Intercept)  horsepower_poly_1  horsepower_poly_2  
##             23.34            -104.85              34.39

Portfolio_boots <- bootstraps(Portfolio, times = 1000)
Portfolio_boots

## # Bootstrap sampling 
## # A tibble: 1,000 × 2
##    splits           id           
##    <list>           <chr>        
##  1 <split [100/36]> Bootstrap0001
##  2 <split [100/39]> Bootstrap0002
##  3 <split [100/39]> Bootstrap0003
##  4 <split [100/33]> Bootstrap0004
##  5 <split [100/39]> Bootstrap0005
##  6 <split [100/34]> Bootstrap0006
##  7 <split [100/40]> Bootstrap0007
##  8 <split [100/38]> Bootstrap0008
##  9 <split [100/36]> Bootstrap0009
## 10 <split [100/41]> Bootstrap0010
## # ℹ 990 more rows

alpha.fn <- function(split) {
  data <- analysis(split)
  X <- data$X
  Y <- data$Y
  
  (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))
}
alpha_res <- Portfolio_boots %>%
  mutate(alpha = map_dbl(splits, alpha.fn))

alpha_res

## # Bootstrap sampling 
## # A tibble: 1,000 × 3
##    splits           id            alpha
##    <list>           <chr>         <dbl>
##  1 <split [100/36]> Bootstrap0001 0.516
##  2 <split [100/39]> Bootstrap0002 0.687
##  3 <split [100/39]> Bootstrap0003 0.599
##  4 <split [100/33]> Bootstrap0004 0.556
##  5 <split [100/39]> Bootstrap0005 0.549
##  6 <split [100/34]> Bootstrap0006 0.619
##  7 <split [100/40]> Bootstrap0007 0.387
##  8 <split [100/38]> Bootstrap0008 0.675
##  9 <split [100/36]> Bootstrap0009 0.538
## 10 <split [100/41]> Bootstrap0010 0.407
## # ℹ 990 more rows

Auto_boots <- bootstraps(Auto)

boot.fn <- function(split) {
  lm_fit <- lm_spec %>% fit(mpg ~ horsepower, data = analysis(split))
  tidy(lm_fit)
}
boot_res <- Auto_boots %>%
  mutate(models = map(splits, boot.fn))
boot_res %>%
  unnest(cols = c(models)) %>%
  group_by(term) %>%
  summarise(mean = mean(estimate),
            sd = sd(estimate))

## # A tibble: 2 × 3
##   term          mean      sd
##   <chr>        <dbl>   <dbl>
## 1 (Intercept) 39.8   0.759  
## 2 horsepower  -0.156 0.00593

in class

2024-05-16