The goal is to explore predictors of chocolate ratings using the provided dataset.
chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-01-18/chocolate.csv')
## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(chocolate)
Name | chocolate |
Number of rows | 2530 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
character | 7 |
numeric | 3 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
company_manufacturer | 0 | 1.00 | 2 | 39 | 0 | 580 | 0 |
company_location | 0 | 1.00 | 4 | 21 | 0 | 67 | 0 |
country_of_bean_origin | 0 | 1.00 | 4 | 21 | 0 | 62 | 0 |
specific_bean_origin_or_bar_name | 0 | 1.00 | 3 | 51 | 0 | 1605 | 0 |
cocoa_percent | 0 | 1.00 | 3 | 6 | 0 | 46 | 0 |
ingredients | 87 | 0.97 | 4 | 14 | 0 | 21 | 0 |
most_memorable_characteristics | 0 | 1.00 | 3 | 37 | 0 | 2487 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
ref | 0 | 1 | 1429.80 | 757.65 | 5 | 802 | 1454.00 | 2079.0 | 2712 | ▆▇▇▇▇ |
review_date | 0 | 1 | 2014.37 | 3.97 | 2006 | 2012 | 2015.00 | 2018.0 | 2021 | ▃▅▇▆▅ |
rating | 0 | 1 | 3.20 | 0.45 | 1 | 3 | 3.25 | 3.5 | 4 | ▁▁▅▇▇ |
data <- chocolate %>%
select(-ref, -review_date, -company_location, -specific_bean_origin_or_bar_name) %>%
na.omit() %>%
mutate(rating = as.numeric(rating), # Ensure numeric conversion
cocoa_percent = as.numeric(str_remove(cocoa_percent, "%")), # Convert cocoa_percent to numeric
rating = log(rating)) %>% # Log-transform rating for normalization
drop_na()
xgboost_recipe <- recipe(rating ~ ., data = data) %>%
update_role(company_manufacturer, new_role = "id") %>% # Set ID column
step_impute_median(all_numeric_predictors()) %>% # Handle missing numeric values
step_novel(all_nominal_predictors()) %>% # Handle unseen categorical values
step_dummy(all_nominal_predictors()) %>% # Convert categorical variables to dummies
step_zv(all_predictors()) # Remove zero-variance predictors
prep_xgboost_recipe <- prep(xgboost_recipe, training = data)
baked_data <- bake(prep_xgboost_recipe, new_data = NULL)
set.seed(123)
data_split <- initial_split(data, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
set.seed(123)
data_cv <- vfold_cv(train_data, v = 5)
set.seed(344)
xgboost_tune <- tune_grid(
xgboost_workflow,
resamples = data_cv,
grid = 5,
metrics = metric_set(rmse, mae, rsq)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
best_params <- select_best(xgboost_tune, metric = "rmse")
final_xgboost <- finalize_workflow(xgboost_workflow, best_params)
final_model <- fit(final_xgboost, data = train_data)
final_results <- predict(final_model, new_data = test_data) %>%
bind_cols(test_data) %>%
metrics(truth = rating, estimate = .pred)
print(final_results)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.140
## 2 rsq standard 0.0295
## 3 mae standard 0.112
final_results %>%
select(.metric, .estimate) %>%
print()
## # A tibble: 3 × 2
## .metric .estimate
## <chr> <dbl>
## 1 rmse 0.140
## 2 rsq 0.0295
## 3 mae 0.112
library(ggplot2)
predictions <- predict(final_model, new_data = test_data) %>%
bind_cols(test_data)
ggplot(predictions, aes(x = rating, y = .pred)) +
geom_point(alpha = 0.5) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(title = "Actual vs. Predicted Chocolate Ratings",
x = "Actual Rating",
y = "Predicted Rating") +
theme_minimal()