Goal

The goal is to explore predictors of chocolate ratings using the provided dataset.

Import Data

chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-01-18/chocolate.csv')

## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Examine Data

skimr::skim(chocolate)

Data summary
Name	chocolate
Number of rows	2530
Number of columns	10
_______________________
Column type frequency:
character	7
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
company_manufacturer	0	1.00	2	39	580
company_location	0	1.00	4	21	67
country_of_bean_origin	0	1.00	4	21	62
specific_bean_origin_or_bar_name	0	1.00	3	51	1605
cocoa_percent	0	1.00	3	6	46
ingredients	87	0.97	4	14	21
most_memorable_characteristics	0	1.00	3	37	2487

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
ref	1	1429.80	757.65	5	802	1454.00	2079.0	2712	▆▇▇▇▇
review_date	1	2014.37	3.97	2006	2012	2015.00	2018.0	2021	▃▅▇▆▅
rating	1	3.20	0.45	1	3	3.25	3.5	4	▁▁▅▇▇

Data Cleaning & Transformation

data <- chocolate %>%
    select(-ref, -review_date, -company_location, -specific_bean_origin_or_bar_name) %>%
    na.omit() %>%
    mutate(rating = as.numeric(rating), # Ensure numeric conversion
           cocoa_percent = as.numeric(str_remove(cocoa_percent, "%")), # Convert cocoa_percent to numeric
           rating = log(rating)) %>% # Log-transform rating for normalization
    drop_na()

Define Recipe

xgboost_recipe <- recipe(rating ~ ., data = data) %>%
    update_role(company_manufacturer, new_role = "id") %>%  # Set ID column
    step_impute_median(all_numeric_predictors()) %>%  # Handle missing numeric values
    step_novel(all_nominal_predictors()) %>%  # Handle unseen categorical values
    step_dummy(all_nominal_predictors()) %>%  # Convert categorical variables to dummies
    step_zv(all_predictors())  # Remove zero-variance predictors

Prepare Recipe

prep_xgboost_recipe <- prep(xgboost_recipe, training = data)
baked_data <- bake(prep_xgboost_recipe, new_data = NULL)

Split Data into Training and Testing Sets

set.seed(123)
data_split <- initial_split(data, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)

Define XGBoost Model Specification

xgboost_spec <- 
    boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>% 
    set_mode("regression") %>% 
    set_engine("xgboost")

Create Workflow

xgboost_workflow <- 
    workflow() %>% 
    add_recipe(xgboost_recipe) %>% 
    add_model(xgboost_spec)

Create Cross-Validation Folds

set.seed(123)
data_cv <- vfold_cv(train_data, v = 5)

Tune Hyperparameters

set.seed(344)
xgboost_tune <- tune_grid(
    xgboost_workflow,
    resamples = data_cv,
    grid = 5,
    metrics = metric_set(rmse, mae, rsq)
)

## i Creating pre-processing data to finalize unknown parameter: mtry

Select Best Model

best_params <- select_best(xgboost_tune, metric = "rmse")

Finalize Model with Best Parameters

final_xgboost <- finalize_workflow(xgboost_workflow, best_params)

Fit Final Model on Training Data

final_model <- fit(final_xgboost, data = train_data)

Evaluate on Test Data

final_results <- predict(final_model, new_data = test_data) %>%
    bind_cols(test_data) %>%
    metrics(truth = rating, estimate = .pred)

Print Final Model Performance

print(final_results)

## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.140 
## 2 rsq     standard      0.0295
## 3 mae     standard      0.112

Chocolate Ratings

Savannah Ricker

2025-02-13

Goal

Import Data

Examine Data

Data Cleaning & Transformation

Define Recipe

Prepare Recipe

Split Data into Training and Testing Sets

Define XGBoost Model Specification

Create Workflow

Create Cross-Validation Folds

Tune Hyperparameters

Select Best Model

Finalize Model with Best Parameters

Fit Final Model on Training Data

Evaluate on Test Data

Print Final Model Performance