Tidymodels

Author

Sergio Uribe

Published

September 26, 2023

Modified

September 26, 2023

Packages

pacman::p_load(tidymodels, 
               kableExtra, 
               skimr, 
               tidyverse)

Load the penguins dataset

# Load `penguins` explicitly from modeldata (attached by tidymodels). Naming
# the package avoids picking up a different `penguins` object, e.g. the one
# shipped in base R's datasets package (R >= 4.5.0), whose columns are named
# differently from the ones used in the rest of this analysis.
data("penguins", package = "modeldata")

Exploratory Data Analysis

# Peek at the top of the data (head() returns six rows by default)
penguins %>%
  head()

# A tibble: 6 × 7
  species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
  <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
1 Adelie  Torgersen           39.1          18.7               181        3750
2 Adelie  Torgersen           39.5          17.4               186        3800
3 Adelie  Torgersen           40.3          18                 195        3250
4 Adelie  Torgersen           NA            NA                  NA          NA
5 Adelie  Torgersen           36.7          19.3               193        3450
6 Adelie  Torgersen           39.3          20.6               190        3650
# ℹ 1 more variable: sex <fct>

Data Preprocessing

# Keep complete cases only: any row containing an NA is dropped
penguins <- penguins %>%
  na.omit()

Splitting the Data

# Reproducible random partition: 75% of rows for training, 25% for testing.
# The seed is fixed immediately before the split so the partition is stable.
set.seed(123)
data_split <- penguins %>%
  initial_split(prop = 0.75)
train_data <- data_split %>% training()
test_data <- data_split %>% testing()

Recipe

# Declare the modelling roles: species is the outcome; island, bill_length_mm
# and bill_depth_mm are the predictors. No preprocessing steps are added here;
# the recipe only records the formula and the training-data template.
rec <- recipe(
  species ~ island + bill_length_mm + bill_depth_mm,
  data = train_data
)

Model Specification

# Define the model to be used (Random Forest in this case).
# `mtry` (the number of predictors sampled at each split) is a main argument
# of rand_forest(), so it is set there rather than forwarded as an
# engine-specific option through set_engine(). The value 3 equals the number
# of predictors in the model formula (island, bill_length_mm, bill_depth_mm).
model_spec <- rand_forest(mtry = 3) %>%
  set_engine("ranger") %>%
  set_mode("classification")

Workflow

# Bundle the recipe (preprocessor) and the model specification into a single
# workflow object so they are fitted and used for prediction together
workflow <- workflow(preprocessor = rec, spec = model_spec)

Model Training

# Train the workflow (recipe + random forest) on the training set
trained_model <- fit(workflow, data = train_data)

Model Evaluation

# Predict the class of each test row, then attach the original test columns
# so that .pred_class sits alongside the observed species
predictions <- bind_cols(
  predict(trained_model, new_data = test_data),
  test_data
)

Calculate the accuracy of the model on the test data

# Compute the default classification metrics on the test predictions and
# keep only the accuracy row
accuracy <- filter(
  metrics(predictions, truth = species, estimate = .pred_class),
  .metric == "accuracy"
)

View the accuracy of the model

# Display the one-row accuracy tibble
accuracy %>%
  print()

# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy multiclass     0.976

Confusion Matrix

# Cross-tabulate predicted vs. observed species on the test set.
# The result is stored as `species_conf_mat` so that it does not mask the
# yardstick function conf_mat() that creates it.
species_conf_mat <- predictions %>%
  conf_mat(truth = species, estimate = .pred_class)

# Plot the confusion matrix
autoplot(species_conf_mat, type = "heatmap") +
  ggtitle("Confusion Matrix") +
  theme_minimal()

Visualizing predictions

# Jittered scatter of predicted (x) against observed (y) species, coloured by
# the observed species; points off the diagonal are misclassifications
ggplot(
  predictions,
  aes(x = .pred_class, y = species, color = species)
) +
  geom_jitter(alpha = 0.5, width = 0.2, height = 0.2) +
  labs(
    title = "Predicted vs Observed Species",
    x = "Predicted Species",
    y = "Observed Species"
  ) +
  theme_minimal()

---
title: "Tidymodels"
author: "Sergio Uribe"
date: 2023-09-26
date-modified: last-modified
format:
  html:
    toc: true
    toc-expand: 3
    code-fold: false
    code-tools: true
editor: visual
execute:
  echo: true
  warning: false
  message: false
---

## Packages

```{r}
pacman::p_load(tidymodels, 
               kableExtra, 
               skimr, 
               tidyverse)
```

# Load the penguins dataset

```{r}
# Load `penguins` explicitly from modeldata (attached by tidymodels) so that a
# different `penguins` object (e.g. datasets::penguins in R >= 4.5.0, which
# uses other column names) is never picked up instead.
data("penguins", package = "modeldata")
```

# Exploratory Data Analysis

```{r}
# View the first few rows of the dataset
head(penguins)
```

# Data Preprocessing

```{r}
# Remove rows with missing values for simplicity
penguins <- na.omit(penguins)
```

# Splitting the Data

```{r}
# Split the data into a training set (75%) and a testing set (25%)
set.seed(123)
data_split <- initial_split(penguins, prop = 0.75)
train_data <- training(data_split)
test_data <- testing(data_split)
```

# Recipe

```{r}
# Specify the preprocessing steps needed for the model
# In this case, we are using island, bill_length_mm, and bill_depth_mm as predictors
rec <- recipe(species ~ island + bill_length_mm + bill_depth_mm, data = train_data)
```

# Model Specification

```{r}
# Define the model to be used (Random Forest in this case).
# `mtry` is a main argument of rand_forest(), so it is set there rather than
# forwarded as an engine-specific option through set_engine().
model_spec <- rand_forest(mtry = 3) %>%
  set_engine("ranger") %>%
  set_mode("classification")
```

# Workflow

```{r}
# Combine the recipe and model specification into a workflow
workflow <- workflow() %>%
  add_recipe(rec) %>%
  add_model(model_spec)
```

# Model Training

```{r}
# Fit the model to the training data
trained_model <- workflow %>%
  fit(data = train_data)
```

# Model Evaluation

```{r}
# Use the model to make predictions on the test data
predictions <- trained_model %>%
  predict(new_data = test_data) %>%
  bind_cols(test_data)
```

# Calculate the accuracy of the model on the test data

```{r}
accuracy <- predictions %>%
  metrics(truth = species, estimate = .pred_class) %>%
  filter(.metric == "accuracy")
```

# View the accuracy of the model

```{r}
print(accuracy)
```

# Confusion Matrix

```{r}
# Stored as `species_conf_mat` so the result does not mask yardstick::conf_mat()
species_conf_mat <- predictions %>%
  conf_mat(truth = species, estimate = .pred_class)

# Plot the confusion matrix
autoplot(species_conf_mat, type = "heatmap") +
  ggtitle("Confusion Matrix") +
  theme_minimal()
```

# Visualizing predictions

```{r}
predictions %>%
  ggplot(aes(x = .pred_class, 
             y = species,
             color = species)) +
  geom_jitter(width = 0.2, height = 0.2, alpha = 0.5) +
  labs(x = "Predicted Species",
       y = "Observed Species",
       title = "Predicted vs Observed Species") +
  theme_minimal()
```