Module 3 Lesson 2 Application

Let’s load the tidyverse, tidymodels, and palmerpenguins packages to begin.

library(tidyverse)
library(tidymodels)
library(palmerpenguins)
The Data Set
penguins

# A tibble: 344 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
glimpse(penguins)

Rows: 344
Columns: 8
$ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
$ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
$ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
$ body_mass_g <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
$ sex <fct> male, female, female, NA, female, male, female, male…
$ year <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
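As the glimpse shows, a few rows contain missing values (row 4 above is missing every measurement as well as sex). A quick way to count the NAs per column, as a sketch:

# Count missing values in each column (sketch)
penguins |>
  summarise(across(everything(), \(x) sum(is.na(x))))

The next step drops those incomplete rows with drop_na() and removes year, which we won't use as a predictor.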
datapeng <- penguins |>
  drop_na() |>
  select(-year)

Data Budget
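Because sex is the outcome, the split below is stratified on it so the training and test sets keep roughly the same male/female mix. Checking the class balance first, as a sketch:

# Class balance of the outcome before splitting (sketch)
datapeng |>
  count(sex) |>
  mutate(prop = n / sum(n))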
set.seed(18)
datapeng_split <- initial_split(datapeng, prop = 0.8, strata = sex)
datapeng_train <- training(datapeng_split)
datapeng_test <- testing(datapeng_split)

Recipe
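The recipe below only declares the formula: sex is the outcome and every remaining column is a predictor, with no preprocessing steps (tree-based models with the ranger engine don't need centering, scaling, or dummy variables). Once it is defined, the assigned roles can be inspected, as a sketch:

# Inspect the variable roles the recipe assigns (sketch)
summary(recipe_peng)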
recipe_peng <-
  recipe(sex ~ ., data = datapeng_train)

Cross Validation
set.seed(18)
samples_peng <- vfold_cv(datapeng_train, strata = sex)

Model Specification
model_peng <-
  rand_forest(
    mtry = tune(),
    min_n = tune()
  ) |>
  set_engine("ranger") |>
  set_mode("classification")

Workflow
workflow_peng <-
  workflow() |>
  add_recipe(recipe_peng) |>
  add_model(model_peng)

Parallel Processing
doParallel::registerDoParallel()

Create Grid and Tune
grid <-
  grid_regular(mtry(range = c(2, 10)),
               min_n(range = c(2, 10)),
               levels = 7)
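grid_regular() crosses the 7 levels of each parameter, so the grid holds 49 candidate combinations that tune_grid() below will evaluate on every resample. A quick look, as a sketch:

# Peek at the candidate combinations (sketch)
grid
nrow(grid)  # 7 x 7 = 49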
set.seed(18)
tune_peng <- tune_grid(
  workflow_peng,
  resamples = samples_peng,
  grid = grid,
  control = control_grid(save_pred = TRUE, verbose = TRUE)
)

Select Best Model
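Before settling on a single configuration, it can help to look at all the resampled results: collect_metrics() returns the averaged performance of every candidate, and autoplot() gives a quick visual comparison. A sketch:

# All tuning results and a visual comparison (sketch)
collect_metrics(tune_peng)
autoplot(tune_peng)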
tune_peng |>
  show_best(metric = "accuracy")

# A tibble: 5 × 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 2 4 accuracy binary 0.911 10 0.0192 Preprocessor1_Model15
2 4 3 accuracy binary 0.907 10 0.0213 Preprocessor1_Model10
3 6 8 accuracy binary 0.907 10 0.0235 Preprocessor1_Model39
4 8 6 accuracy binary 0.907 10 0.0215 Preprocessor1_Model27
5 10 7 accuracy binary 0.907 10 0.0230 Preprocessor1_Model35
best_tune_peng <- tune_peng |>
  select_best(metric = "accuracy")

best_tune_peng

# A tibble: 1 × 3
mtry min_n .config
<int> <int> <chr>
1 2 4 Preprocessor1_Model15
Finalized Workflow
model_wf_final <- finalize_workflow(workflow_peng, best_tune_peng)
model_wf_final

══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps
── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (classification)
Main Arguments:
mtry = 2
min_n = 4
Computational engine: ranger
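As a sanity check, the finalized workflow can also be fit on the training set alone; printing the result shows the underlying ranger fit, including its out-of-bag prediction error. A sketch, with an illustrative object name:

# Fit the finalized workflow on the training data only (sketch)
fit_train <- fit(model_wf_final, data = datapeng_train)
fit_train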
Last Fit and Test
modeleval_test <- model_wf_final |>
  last_fit(datapeng_split)
modeleval_test

# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [266/67]> train/test split <tibble> <tibble> <tibble> <workflow>
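last_fit() refits the finalized workflow on the full training set and evaluates it once on the held-out test set. The test-set metrics can be pulled out with collect_metrics(), as a sketch:

# Test-set performance from the last fit (sketch)
collect_metrics(modeleval_test)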
Confusion Matrix
modeleval_test |>
  collect_predictions() |>
  conf_mat(sex, .pred_class) |>
  autoplot(type = "heatmap")
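If the model is going to be used on new penguins, the fitted workflow can be extracted from the last_fit() result and used with predict(). A sketch, with illustrative object names and the test set standing in for new data:

# Extract the fitted workflow and predict on new observations (sketch)
final_model <- extract_workflow(modeleval_test)
predict(final_model, new_data = head(datapeng_test))
predict(final_model, new_data = head(datapeng_test), type = "prob")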