Module 3 Lesson 2 Application

Author

Jamal Rogers

Published

September 8, 2023

Let’s load the tidyverse, tidymodels, and palmerpenguins packages to begin.

library(tidyverse)
library(tidymodels)
library(palmerpenguins)

The data set

# Auto-print the `penguins` tibble to preview its first rows and column types.
penguins
# A tibble: 344 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           NA            NA                  NA          NA
 5 Adelie  Torgersen           36.7          19.3               193        3450
 6 Adelie  Torgersen           39.3          20.6               190        3650
 7 Adelie  Torgersen           38.9          17.8               181        3625
 8 Adelie  Torgersen           39.2          19.6               195        4675
 9 Adelie  Torgersen           34.1          18.1               193        3475
10 Adelie  Torgersen           42            20.2               190        4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
# Transposed overview of `penguins`: one line per column with type and values.
penguins |>
  glimpse()
Rows: 344
Columns: 8
$ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
$ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
$ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
$ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
$ sex               <fct> male, female, female, NA, female, male, female, male…
$ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
# Modelling data: keep only rows without missing values, then discard the
# `year` column so it is not offered to the model as a predictor.
datapeng <-
  penguins |>
  drop_na() |>
  select(!year)

Data Budget

# Train/test split: 80% of the rows go to training, with the split stratified
# on `sex`. The seed is fixed so the partition is reproducible.
set.seed(18)
datapeng_split <-
  datapeng |>
  initial_split(prop = 0.8, strata = sex)

# Materialise the two partitions from the split object.
datapeng_train <- datapeng_split |> training()
datapeng_test <- datapeng_split |> testing()

Recipe

# Recipe without preprocessing steps: `sex` is the outcome and every other
# column of the training data is used as a predictor.
recipe_peng <- recipe(sex ~ ., data = datapeng_train)

Cross Validation

# Cross-validation resamples of the training set, stratified on `sex`
# (number of folds left at the vfold_cv() default). Seeded for reproducibility.
set.seed(18)
samples_peng <-
  datapeng_train |>
  vfold_cv(strata = sex)

Model Specification

# Random-forest classifier fitted with the ranger engine. `mtry` and `min_n`
# are marked with tune() so their values are chosen later by grid search.
model_peng <-
  rand_forest(mtry = tune(), min_n = tune()) |>
  set_mode("classification") |>
  set_engine("ranger")

Workflow

# Bundle the model specification and the recipe into one workflow object so
# they are tuned and fitted together.
workflow_peng <-
  workflow() |>
  add_model(model_peng) |>
  add_recipe(recipe_peng)

Parallel Processing

# Register a doParallel backend before the tuning step below. Called without
# arguments, so the number of workers is left to the package defaults.
doParallel::registerDoParallel()

Create Grid and Tune

# Regular tuning grid for the two hyperparameters marked with tune() in
# `model_peng`: `mtry` and `min_n`, each spanning 2-10 with `levels = 7`.
#
# NOTE(review): only 6 predictors enter the model (8 columns minus `year` and
# the outcome `sex`), so `mtry` values above 6 are presumably not distinct
# settings -- consider `mtry(range = c(2, 6))`. Left unchanged here so the
# printed tuning results further down still correspond to this grid.
#
# The duplicate, never-referenced model specification that used to follow the
# grid was removed; the workflow uses `model_peng`.
grid <-
  grid_regular(
    mtry(range = c(2, 10)),
    min_n(range = c(2, 10)),
    levels = 7
  )


# Grid search: evaluate every row of `grid` on the cross-validation resamples.
# Out-of-sample predictions are kept (save_pred) and progress is logged
# (verbose). Seeded for reproducibility.
set.seed(18)
tune_peng <-
  workflow_peng |>
  tune_grid(
    resamples = samples_peng,
    grid = grid,
    control = control_grid(verbose = TRUE, save_pred = TRUE)
  )

Select Best Model

# Show the top candidates ranked by cross-validated accuracy.
# `metric` is passed by name: recent tune releases place it after `...` in
# show_best(), so a positional "accuracy" is rejected there. Naming it works
# with older releases as well.
tune_peng |>
  show_best(metric = "accuracy")
# A tibble: 5 × 8
   mtry min_n .metric  .estimator  mean     n std_err .config              
  <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>                
1     2     4 accuracy binary     0.911    10  0.0192 Preprocessor1_Model15
2     4     3 accuracy binary     0.907    10  0.0213 Preprocessor1_Model10
3     6     8 accuracy binary     0.907    10  0.0235 Preprocessor1_Model39
4     8     6 accuracy binary     0.907    10  0.0215 Preprocessor1_Model27
5    10     7 accuracy binary     0.907    10  0.0230 Preprocessor1_Model35
# Keep the single best hyperparameter combination by accuracy, then print it.
# `metric` is passed by name: recent tune releases place it after `...` in
# select_best(), so a positional "accuracy" is rejected there. Naming it works
# with older releases as well.
best_tune_peng <-
  tune_peng |>
  select_best(metric = "accuracy")
best_tune_peng
# A tibble: 1 × 3
   mtry min_n .config              
  <int> <int> <chr>                
1     2     4 Preprocessor1_Model15

Finalized Workflow

# Replace the tune() placeholders in the workflow with the selected
# hyperparameter values, then print the finalized workflow.
model_wf_final <-
  workflow_peng |>
  finalize_workflow(best_tune_peng)
model_wf_final
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (classification)

Main Arguments:
  mtry = 2
  min_n = 4

Computational engine: ranger 

Last Fit and Test

# Final evaluation: run last_fit() on the finalized workflow with the original
# train/test split object, then print the result.
modeleval_test <-
  model_wf_final |>
  last_fit(split = datapeng_split)
modeleval_test
# Resampling results
# Manual resampling 
# A tibble: 1 × 6
  splits           id               .metrics .notes   .predictions .workflow 
  <list>           <chr>            <list>   <list>   <list>       <list>    
1 <split [266/67]> train/test split <tibble> <tibble> <tibble>     <workflow>

Confusion Matrix

# Confusion matrix of the last-fit predictions (true `sex` against the
# predicted class), drawn as a heatmap.
modeleval_test |>
  collect_predictions() |>
  conf_mat(truth = sex, estimate = .pred_class) |>
  autoplot(type = "heatmap")