Goal

The goal is to explore predictors of chocolate ratings using the provided dataset.

Import Data

chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-01-18/chocolate.csv')
## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Examine Data

skimr::skim(chocolate)
Data summary
Name chocolate
Number of rows 2530
Number of columns 10
_______________________
Column type frequency:
character 7
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
company_manufacturer 0 1.00 2 39 0 580 0
company_location 0 1.00 4 21 0 67 0
country_of_bean_origin 0 1.00 4 21 0 62 0
specific_bean_origin_or_bar_name 0 1.00 3 51 0 1605 0
cocoa_percent 0 1.00 3 6 0 46 0
ingredients 87 0.97 4 14 0 21 0
most_memorable_characteristics 0 1.00 3 37 0 2487 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ref 0 1 1429.80 757.65 5 802 1454.00 2079.0 2712 ▆▇▇▇▇
review_date 0 1 2014.37 3.97 2006 2012 2015.00 2018.0 2021 ▃▅▇▆▅
rating 0 1 3.20 0.45 1 3 3.25 3.5 4 ▁▁▅▇▇

Data Cleaning & Transformation

data <- chocolate %>%
    select(-ref, -review_date, -company_location, -specific_bean_origin_or_bar_name) %>%
    na.omit() %>%
    mutate(rating = as.numeric(rating), # Ensure numeric conversion
           cocoa_percent = as.numeric(str_remove(cocoa_percent, "%")), # Convert cocoa_percent to numeric
           rating = log(rating)) %>% # Log-transform rating for normalization
    drop_na()

Define Recipe

xgboost_recipe <- recipe(rating ~ ., data = data) %>%
    update_role(company_manufacturer, new_role = "id") %>%  # Set ID column
    step_impute_median(all_numeric_predictors()) %>%  # Handle missing numeric values
    step_novel(all_nominal_predictors()) %>%  # Handle unseen categorical values
    step_dummy(all_nominal_predictors()) %>%  # Convert categorical variables to dummies
    step_zv(all_predictors())  # Remove zero-variance predictors

Prepare Recipe

prep_xgboost_recipe <- prep(xgboost_recipe, training = data)
baked_data <- bake(prep_xgboost_recipe, new_data = NULL)

Split Data into Training and Testing Sets

set.seed(123)
data_split <- initial_split(data, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)

Define XGBoost Model Specification

xgboost_spec <- 
    boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>% 
    set_mode("regression") %>% 
    set_engine("xgboost") 

Create Workflow

xgboost_workflow <- 
    workflow() %>% 
    add_recipe(xgboost_recipe) %>% 
    add_model(xgboost_spec) 

Create Cross-Validation Folds

set.seed(123)
data_cv <- vfold_cv(train_data, v = 5)

Tune Hyperparameters

set.seed(344)
xgboost_tune <- tune_grid(
    xgboost_workflow,
    resamples = data_cv,
    grid = 5,
    metrics = metric_set(rmse, mae, rsq)
)
## i Creating pre-processing data to finalize unknown parameter: mtry

Select Best Model

best_params <- select_best(xgboost_tune, metric = "rmse")

Finalize Model with Best Parameters

final_xgboost <- finalize_workflow(xgboost_workflow, best_params)

Fit Final Model on Training Data

final_model <- fit(final_xgboost, data = train_data)

Evaluate on Test Data

final_results <- predict(final_model, new_data = test_data) %>%
    bind_cols(test_data) %>%
    metrics(truth = rating, estimate = .pred)