library(tidyverse)
library(tidymodels)
library(palmerpenguins)
Module 3 Lesson 1 Application
Let’s load the tidyverse, tidymodels, and palmerpenguins packages to begin.
The data set
# Print the raw penguins tibble to preview its leading rows and column types.
penguins
# A tibble: 344 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
# Show every column with its type and leading values in a compact,
# transposed layout.
glimpse(penguins)
Rows: 344
Columns: 8
$ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
$ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
$ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
$ body_mass_g <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
$ sex <fct> male, female, female, NA, female, male, female, male…
$ year <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
# Build the modelling data: discard every row that has a missing value in
# any column, then remove the `year` column.
datapeng <- penguins |>
  drop_na() |>
  select(-year)

Data Budget
# Fix the RNG seed so the train/test partition is reproducible.
set.seed(18)

# Put 80% of the rows in the training partition and the rest in the test
# partition, stratifying on `sex` so both keep a similar class balance.
datapeng_split <- initial_split(datapeng, prop = 0.8, strata = sex)
datapeng_train <- training(datapeng_split)
datapeng_test <- testing(datapeng_split)

Cross Validation
# Re-seed so the fold assignment is reproducible on its own.
set.seed(18)

# Cross-validation folds of the training partition, stratified on `sex`.
# `v = 10` spells out the fold count (the resampled metrics report n = 10).
samples_peng <- vfold_cv(datapeng_train, v = 10, strata = sex)

Model Specification
# Model specification: logistic regression, "glm" engine, classification mode.
model_peng <-
  logistic_reg() |>
  set_engine("glm") |>
  set_mode("classification")

Workflow
# Bundle the preprocessing formula with the model specification: `sex` is
# the outcome and every other column of the fitted data is a predictor.
workflow_peng <-
  workflow() |>
  add_formula(sex ~ .) |>
  add_model(model_peng)

Fit Resamples and collect metrics
# Fit the workflow on each cross-validation resample and keep the
# per-resample performance estimates.
fit_peng <- workflow_peng |>
  fit_resamples(resamples = samples_peng)
# Summarise the resampled metrics across the folds.
collect_metrics(fit_peng)
# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.910 10 0.0147 Preprocessor1_Model1
2 roc_auc binary 0.970 10 0.0109 Preprocessor1_Model1
Final Fit and Performance
# Final evaluation: fit the workflow on the training partition of the
# split and score it once on the held-out test partition.
final_peng <- workflow_peng |>
  last_fit(split = datapeng_split)
# Test-set metrics from the final fit.
collect_metrics(final_peng)
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.925 Preprocessor1_Model1
2 roc_auc binary 0.985 Preprocessor1_Model1
Confusion Matrix
# Cross-tabulate the true `sex` against the predicted class from the final
# fit and draw the counts as a heatmap.
final_peng |>
  collect_predictions() |>
  conf_mat(truth = sex, estimate = .pred_class) |>
  autoplot(type = "heatmap")