# View the first few rows of the dataset
# (the comment and the call must be on separate lines, otherwise the call
# is treated as part of the comment and never runs)
head(penguins)
# A tibble: 6 × 7
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
# ℹ 1 more variable: sex <fct>
Data Preprocessing
# Remove rows with missing values for simplicity.
# na.omit() drops every row that has an NA in any column, and the result
# overwrites the original `penguins` object.
penguins <- na.omit(penguins)
Splitting the Data
# Split the data into a training set (75%) and a testing set (25%)

# Fix the random seed first so the random split is reproducible
set.seed(123)

# Create the split object, then extract the two partitions from it
data_split <- initial_split(penguins, prop = 0.75)
train_data <- training(data_split)
test_data <- testing(data_split)
Recipe
# Specify the preprocessing steps needed for the model.
# In this case, we are using island, bill_length_mm, and bill_depth_mm as
# predictors of species; no further preprocessing steps are added.
rec <- recipe(
  species ~ island + bill_length_mm + bill_depth_mm,
  data = train_data
)
Model Specification
# Define the model to be used (Random Forest in this case).
# mtry is a main argument of rand_forest(), so it is set there instead of
# being passed as an engine-specific option through set_engine(); parsnip
# maps it to the ranger engine's own argument when the model is fitted.
model_spec <- rand_forest(mtry = 3) %>%
  set_engine("ranger") %>%
  set_mode("classification")
Workflow
# Combine the recipe and model specification into a workflow.
# NOTE(review): the object shares its name with the workflow() constructor;
# the name is kept because later steps refer to `workflow`, but a distinct
# name (e.g. `wf`) would be clearer.
workflow <- workflow() %>%
  add_recipe(rec) %>%
  add_model(model_spec)
Model Training
# Fit the model to the training data.
# Fitting the workflow applies the recipe and trains the model in one step.
trained_model <- workflow %>%
  fit(data = train_data)
Model Evaluation
# Use the model to make predictions on the test data.
# The test set columns are bound alongside the predictions so that the
# predicted class can be compared with the observed species.
predictions <- trained_model %>%
  predict(new_data = test_data) %>%
  bind_cols(test_data)
Calculate the accuracy of the model on the test data
---
title: "Tidymodels"
author: "Sergio Uribe"
date: 2023-09-26
date-modified: last-modified
format:
  html:
    toc: true
    toc-expand: 3
    code-fold: false
    code-tools: true
editor: visual
execute:
  echo: true
  warning: false
  message: false
---

## Packages

```{r}
# Load (installing first if necessary) every package used in this document
pacman::p_load(tidymodels, kableExtra, skimr, tidyverse)
```

# Load the penguins dataset

```{r}
data("penguins")
```

# Exploratory Data Analysis

```{r}
# View the first few rows of the dataset
head(penguins)
```

# Data Preprocessing

```{r}
# Remove rows with missing values for simplicity
penguins <- na.omit(penguins)
```

# Splitting the Data

```{r}
# Split the data into a training set (75%) and a testing set (25%).
# The seed is fixed first so the random split is reproducible.
set.seed(123)
data_split <- initial_split(penguins, prop = 0.75)
train_data <- training(data_split)
test_data <- testing(data_split)
```

# Recipe

```{r}
# Specify the preprocessing steps needed for the model.
# In this case, we are using island, bill_length_mm, and bill_depth_mm as
# predictors
rec <- recipe(
  species ~ island + bill_length_mm + bill_depth_mm,
  data = train_data
)
```

# Model Specification

```{r}
# Define the model to be used (Random Forest in this case).
# mtry is a main argument of rand_forest(), so it is set there rather than
# through set_engine(); with `warning: false` above, any warning about a
# misplaced argument would not be shown in the rendered document.
model_spec <- rand_forest(mtry = 3) %>%
  set_engine("ranger") %>%
  set_mode("classification")
```

# Workflow

```{r}
# Combine the recipe and model specification into a workflow
workflow <- workflow() %>%
  add_recipe(rec) %>%
  add_model(model_spec)
```

# Model Training

```{r}
# Fit the model to the training data
trained_model <- workflow %>%
  fit(data = train_data)
```

# Model Evaluation

```{r}
# Use the model to make predictions on the test data, keeping the test set
# columns next to the predictions for comparison
predictions <- trained_model %>%
  predict(new_data = test_data) %>%
  bind_cols(test_data)
```

# Calculate the accuracy of the model on the test data

```{r}
# Compute the classification metrics and keep only the accuracy row
accuracy <- predictions %>%
  metrics(truth = species, estimate = .pred_class) %>%
  filter(.metric == "accuracy")
```

# View the accuracy of the model

```{r}
print(accuracy)
```

# Confusion Matrix

```{r}
conf_mat <- predictions %>%
  conf_mat(truth = species, estimate = .pred_class)

# Plot the confusion matrix.
# Each `+` must end its line: a line that starts with `+` is parsed as a
# new expression, so the title and theme would not be added to the plot.
autoplot(conf_mat, type = "heatmap") +
  ggtitle("Confusion Matrix") +
  theme_minimal()
```

# Collecting predictions

```{r}
# Jittered scatter of predicted vs observed class; points off the diagonal
# are misclassifications
predictions %>%
  ggplot(aes(x = .pred_class, y = species, color = species)) +
  geom_jitter(width = 0.2, height = 0.2, alpha = 0.5) +
  labs(
    x = "Predicted Species",
    y = "Observed Species",
    title = "Predicted vs Observed Species"
  ) +
  theme_minimal()
```