Goal

Explore predictors of chocolate ratings using the provided dataset.

Import Data

chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-01-18/chocolate.csv')
## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data Cleaning & Transformation

data <- chocolate %>%
  select(company_manufacturer, country_of_bean_origin, specific_bean_origin_or_bar_name, 
         cocoa_percent, ingredients, most_memorable_characteristics, rating) %>%
  mutate(rating = as.numeric(rating), 
         cocoa_percent = as.numeric(str_remove(cocoa_percent, "%"))) %>%
  drop_na()

Data Split

set.seed(123)
data_split <- initial_split(data, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)

Define Recipe

xgboost_recipe <- recipe(rating ~ ., data = train_data) %>%
  update_role(company_manufacturer, new_role = "id") %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())

Define Model Specification (faster tuning settings)

xgboost_spec <- boost_tree(
  trees = 100,  # fewer trees for tuning
  min_n = tune(),
  mtry = tune(),
  learn_rate = tune()
) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

Create Workflow

xgboost_workflow <- workflow() %>%
  add_recipe(xgboost_recipe) %>%
  add_model(xgboost_spec)

Cross-Validation Folds (fewer for speed)

set.seed(123)
data_cv <- vfold_cv(train_data, v = 3)

Select Best Model

best_params <- select_best(xgboost_tune, metric = "rmse")

Finalize Model (increase trees here if needed)

final_xgboost <- finalize_workflow(xgboost_workflow, best_params)

Evaluate Final Model using last_fit()

final_fit <- last_fit(final_xgboost, split = data_split)
## New names:
## New names:
## • `specific_bean_origin_or_bar_name_Maya.Mtn..Moho.R...Toledo.D...2015` ->
##   `specific_bean_origin_or_bar_name_Maya.Mtn..Moho.R...Toledo.D`
## • `specific_bean_origin_or_bar_name_O.ahu.Island..Maunawili..Agri.Research.C...2014`
##   ->
##   `specific_bean_origin_or_bar_name_O.ahu.Island..Maunawili..Agri.Research.C...908`
## • `specific_bean_origin_or_bar_name_O.ahu.Island..Maunawili..Agri.Research.C...2015`
##   ->
##   `specific_bean_origin_or_bar_name_O.ahu.Island..Maunawili..Agri.Research.C...909`
## • `specific_bean_origin_or_bar_name_Paquibao.E...2016` ->
##   `specific_bean_origin_or_bar_name_Paquibao.E`
## • `specific_bean_origin_or_bar_name_Pinchincha..Mindo..Coop.Nueva.Esper...2015`
##   -> `specific_bean_origin_or_bar_name_Pinchincha..Mindo..Coop.Nueva.Esper`
## • `specific_bean_origin_or_bar_name_Saloy.E...2016` ->
##   `specific_bean_origin_or_bar_name_Saloy.E`
## • `specific_bean_origin_or_bar_name_Taiwan...1` ->
##   `specific_bean_origin_or_bar_name_Taiwan`
collect_metrics(final_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      0.425  Preprocessor1_Model1
## 2 rsq     standard      0.0344 Preprocessor1_Model1

Stop parallel backend

stopCluster(cl)

Summary Report

To improve the model performance, we applied the following steps:

Results:

In comparison to an untuned or default model, these results show only modest improvements. While the RMSE is reasonable, the R-squared value remains low, indicating the model explains only a small portion of variance in chocolate ratings. Further improvement may require richer features or external data.