Goal

The goal is to explore predictors of chocolate ratings using the provided dataset.

Import Data

chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-01-18/chocolate.csv')

## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Examine Data

skimr::skim(chocolate)

Data summary
Name	chocolate
Number of rows	2530
Number of columns	10
_______________________
Column type frequency:
character	7
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
company_manufacturer	0	1.00	2	39	580
company_location	0	1.00	4	21	67
country_of_bean_origin	0	1.00	4	21	62
specific_bean_origin_or_bar_name	0	1.00	3	51	1605
cocoa_percent	0	1.00	3	6	46
ingredients	87	0.97	4	14	21
most_memorable_characteristics	0	1.00	3	37	2487

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
ref	1	1429.80	757.65	5	802	1454.00	2079.0	2712	▆▇▇▇▇
review_date	1	2014.37	3.97	2006	2012	2015.00	2018.0	2021	▃▅▇▆▅
rating	1	3.20	0.45	1	3	3.25	3.5	4	▁▁▅▇▇

Data Cleaning & Transformation

data <- chocolate %>%
    select(-ref, -review_date, -company_location, -specific_bean_origin_or_bar_name) %>%
    na.omit() %>%
    mutate(rating = as.numeric(rating), # Ensure numeric conversion
           cocoa_percent = as.numeric(str_remove(cocoa_percent, "%")), # Convert cocoa_percent to numeric
           rating = log(rating)) %>% # Log-transform rating for normalization
    drop_na()

Define Recipe

xgboost_recipe <- recipe(rating ~ ., data = data) %>%
    update_role(company_manufacturer, new_role = "id") %>%  # Set ID column
    step_impute_median(all_numeric_predictors()) %>%  # Handle missing numeric values
    step_novel(all_nominal_predictors()) %>%  # Handle unseen categorical values
    step_dummy(all_nominal_predictors()) %>%  # Convert categorical variables to dummies
    step_zv(all_predictors())  # Remove zero-variance predictors

Prepare Recipe

prep_xgboost_recipe <- prep(xgboost_recipe, training = data)
baked_data <- bake(prep_xgboost_recipe, new_data = NULL)

Split Data into Training and Testing Sets

set.seed(123)
data_split <- initial_split(data, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)

Define XGBoost Model Specification

xgboost_spec <- 
    boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>% 
    set_mode("regression") %>% 
    set_engine("xgboost")

Create Workflow

xgboost_workflow <- 
    workflow() %>% 
    add_recipe(xgboost_recipe) %>% 
    add_model(xgboost_spec)

Create Cross-Validation Folds

set.seed(123)
data_cv <- vfold_cv(train_data, v = 5)

Tune Hyperparameters

set.seed(344)
xgboost_tune <- tune_grid(
    xgboost_workflow,
    resamples = data_cv,
    grid = 5,
    metrics = metric_set(rmse, mae, rsq)
)

## i Creating pre-processing data to finalize unknown parameter: mtry

Select Best Model

best_params <- select_best(xgboost_tune, metric = "rmse")

Finalize Model with Best Parameters

final_xgboost <- finalize_workflow(xgboost_workflow, best_params)

Fit Final Model on Training Data

final_model <- fit(final_xgboost, data = train_data)

Evaluate on Test Data

final_results <- predict(final_model, new_data = test_data) %>%
    bind_cols(test_data) %>%
    metrics(truth = rating, estimate = .pred)

Print Final Model Performance

print(final_results)

## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.140 
## 2 rsq     standard      0.0295
## 3 mae     standard      0.112

Final Results

final_results %>%
    select(.metric, .estimate) %>%
    print()

## # A tibble: 3 × 2
##   .metric .estimate
##   <chr>       <dbl>
## 1 rmse       0.140 
## 2 rsq        0.0295
## 3 mae        0.112

Scatter Plot of Predicted Values

library(ggplot2)

predictions <- predict(final_model, new_data = test_data) %>%
    bind_cols(test_data)

ggplot(predictions, aes(x = rating, y = .pred)) +
    geom_point(alpha = 0.5) +
    geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
    labs(title = "Actual vs. Predicted Chocolate Ratings",
         x = "Actual Rating",
         y = "Predicted Rating") +
    theme_minimal()

Chocolate Ratings

Savannah Ricker

2025-02-13

Goal

Import Data

Examine Data

Data Cleaning & Transformation

Define Recipe

Prepare Recipe

Split Data into Training and Testing Sets

Define XGBoost Model Specification

Create Workflow

Create Cross-Validation Folds

Tune Hyperparameters

Select Best Model

Finalize Model with Best Parameters

Fit Final Model on Training Data

Evaluate on Test Data

Print Final Model Performance

Final Results

Scatter Plot of Predicted Values