Explore predictors of chocolate ratings using the provided dataset.
chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-01-18/chocolate.csv')
## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- chocolate %>%
select(company_manufacturer, country_of_bean_origin, specific_bean_origin_or_bar_name,
cocoa_percent, ingredients, most_memorable_characteristics, rating) %>%
mutate(rating = as.numeric(rating),
cocoa_percent = as.numeric(str_remove(cocoa_percent, "%"))) %>%
drop_na()
set.seed(123)
data_split <- initial_split(data, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)
xgboost_recipe <- recipe(rating ~ ., data = train_data) %>%
update_role(company_manufacturer, new_role = "id") %>%
step_impute_median(all_numeric_predictors()) %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors())
xgboost_spec <- boost_tree(
trees = 100, # fewer trees for tuning
min_n = tune(),
mtry = tune(),
learn_rate = tune()
) %>%
set_mode("regression") %>%
set_engine("xgboost")
xgboost_workflow <- workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
set.seed(123)
data_cv <- vfold_cv(train_data, v = 3)
set.seed(344)
xgboost_tune <- tune_grid(
xgboost_workflow,
resamples = data_cv,
grid = 10, # random combinations
metrics = metric_set(rmse, mae, rsq),
control = control_grid(save_pred = TRUE, verbose = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## New names:
## • `specific_bean_origin_or_bar_name_Maya.Mtn..Moho.R...Toledo.D...2015` ->
## `specific_bean_origin_or_bar_name_Maya.Mtn..Moho.R...Toledo.D`
## • `specific_bean_origin_or_bar_name_O.ahu.Island..Maunawili..Agri.Research.C...2014`
## ->
## `specific_bean_origin_or_bar_name_O.ahu.Island..Maunawili..Agri.Research.C...908`
## • `specific_bean_origin_or_bar_name_O.ahu.Island..Maunawili..Agri.Research.C...2015`
## ->
## `specific_bean_origin_or_bar_name_O.ahu.Island..Maunawili..Agri.Research.C...909`
## • `specific_bean_origin_or_bar_name_Paquibao.E...2016` ->
## `specific_bean_origin_or_bar_name_Paquibao.E`
## • `specific_bean_origin_or_bar_name_Pinchincha..Mindo..Coop.Nueva.Esper...2015`
## -> `specific_bean_origin_or_bar_name_Pinchincha..Mindo..Coop.Nueva.Esper`
## • `specific_bean_origin_or_bar_name_Saloy.E...2016` ->
## `specific_bean_origin_or_bar_name_Saloy.E`
## • `specific_bean_origin_or_bar_name_Taiwan...1` ->
## `specific_bean_origin_or_bar_name_Taiwan`
best_params <- select_best(xgboost_tune, metric = "rmse")
final_xgboost <- finalize_workflow(xgboost_workflow, best_params)
final_fit <- last_fit(final_xgboost, split = data_split)
## New names:
## New names:
## • `specific_bean_origin_or_bar_name_Maya.Mtn..Moho.R...Toledo.D...2015` ->
## `specific_bean_origin_or_bar_name_Maya.Mtn..Moho.R...Toledo.D`
## • `specific_bean_origin_or_bar_name_O.ahu.Island..Maunawili..Agri.Research.C...2014`
## ->
## `specific_bean_origin_or_bar_name_O.ahu.Island..Maunawili..Agri.Research.C...908`
## • `specific_bean_origin_or_bar_name_O.ahu.Island..Maunawili..Agri.Research.C...2015`
## ->
## `specific_bean_origin_or_bar_name_O.ahu.Island..Maunawili..Agri.Research.C...909`
## • `specific_bean_origin_or_bar_name_Paquibao.E...2016` ->
## `specific_bean_origin_or_bar_name_Paquibao.E`
## • `specific_bean_origin_or_bar_name_Pinchincha..Mindo..Coop.Nueva.Esper...2015`
## -> `specific_bean_origin_or_bar_name_Pinchincha..Mindo..Coop.Nueva.Esper`
## • `specific_bean_origin_or_bar_name_Saloy.E...2016` ->
## `specific_bean_origin_or_bar_name_Saloy.E`
## • `specific_bean_origin_or_bar_name_Taiwan...1` ->
## `specific_bean_origin_or_bar_name_Taiwan`
collect_metrics(final_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.425 Preprocessor1_Model1
## 2 rsq standard 0.0344 Preprocessor1_Model1
stopCluster(cl)
To improve the model performance, we applied the following steps:
min_n
, mtry
, and learn_rate
.Results:
In comparison to an untuned or default model, these results show only modest improvements. While the RMSE is reasonable, the R-squared value remains low, indicating the model explains only a small portion of variance in chocolate ratings. Further improvement may require richer features or external data.