Tidymodels

Author

Sergio Uribe

Published

September 26, 2023

Modified

September 26, 2023

Packages

pacman::p_load(tidymodels, 
               kableExtra, 
               skimr, 
               tidyverse)

Load the penguins dataset

# Load the 7-column penguins data shipped with modeldata (attached by
# tidymodels). Naming the package avoids depending on search-path order:
# R >= 4.5.0 also ships a `penguins` dataset in `datasets`, with different
# column names (e.g. `bill_len` instead of `bill_length_mm`).
data("penguins", package = "modeldata")

Exploratory Data Analysis

# Peek at the top of the dataset (head() shows six rows by default)
penguins %>% head()
# A tibble: 6 × 7
  species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
  <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
1 Adelie  Torgersen           39.1          18.7               181        3750
2 Adelie  Torgersen           39.5          17.4               186        3800
3 Adelie  Torgersen           40.3          18                 195        3250
4 Adelie  Torgersen           NA            NA                  NA          NA
5 Adelie  Torgersen           36.7          19.3               193        3450
6 Adelie  Torgersen           39.3          20.6               190        3650
# ℹ 1 more variable: sex <fct>

Data Preprocessing

# Keep only complete cases: any row containing an NA is dropped
penguins <- penguins %>% na.omit()

Splitting the Data

# Hold out 25% of the rows for testing; the remaining 75% are used for training.
# The seed is fixed first so the random partition is reproducible.
set.seed(123)
data_split <- penguins %>% initial_split(prop = 0.75)
train_data <- data_split %>% training()
test_data <- data_split %>% testing()

Recipe

# Declare variable roles on the training data: `species` is the outcome and
# island, bill_length_mm and bill_depth_mm are the predictors.
# No preprocessing steps are added to the recipe here.
rec <- recipe(
  species ~ island + bill_length_mm + bill_depth_mm,
  data = train_data
)

Model Specification

# Define the model: a random forest classifier fitted with the ranger engine.
# `mtry` (number of predictors sampled at each split) is one of parsnip's main
# rand_forest() arguments, so it is set there rather than passed through
# set_engine(); this keeps it visible in the model spec and tunable.
# mtry = 3 equals the number of predictors in the formula, so every predictor
# is a candidate at each split.
model_spec <- rand_forest(mtry = 3) %>%
  set_engine("ranger") %>%
  set_mode("classification")

Workflow

# Bundle the recipe and the model specification into one workflow object:
# start from an empty workflow, attach the recipe, then attach the model.
workflow <- add_model(
  add_recipe(workflow(), rec),
  model_spec
)

Model Training

# Train the workflow (recipe + model) on the training set
trained_model <- fit(workflow, data = train_data)

Model Evaluation

# Predict the class of every test row, then attach the original test columns
# so each prediction sits next to the observed species
predictions <- bind_cols(
  predict(trained_model, new_data = test_data),
  test_data
)

Calculate the accuracy of the model on the test data

# Compute the default metric set for the predicted vs observed species,
# then keep only the accuracy row
accuracy <- filter(
  metrics(predictions, truth = species, estimate = .pred_class),
  .metric == "accuracy"
)

View the accuracy of the model

# Display the one-row accuracy tibble
accuracy %>% print()
# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy multiclass     0.976

Confusion Matrix

# Cross-tabulate observed species against predicted class
conf_mat <- conf_mat(predictions, truth = species, estimate = .pred_class)

# Draw the confusion matrix as a heatmap with a title and a minimal theme
conf_mat %>%
  autoplot(type = "heatmap") +
  ggtitle("Confusion Matrix") +
  theme_minimal()

Visualizing predictions

# Scatter predicted class (x) against observed species (y), coloured by the
# observed species. Points are jittered so overlapping observations stay
# visible; points away from the diagonal cells are misclassifications.
ggplot(predictions, aes(.pred_class, species, colour = species)) +
  geom_jitter(alpha = 0.5, width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(
    title = "Predicted vs Observed Species",
    x = "Predicted Species",
    y = "Observed Species"
  )