Goal: to predict ratings for chocolate Click here for data
library(tidyverse)
url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv"
chocolate <- read_csv(url)
## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
chocolate %>%
ggplot(aes(rating)) +
geom_histogram(bins = 15)
chocolate %>%
ggplot(aes(rating)) +
geom_histogram(bins = 15)
library(tidytext)
tidy_chocolate <-
chocolate %>%
unnest_tokens(word, most_memorable_characteristics)
tidy_chocolate %>%
count(word, sort = TRUE)
## # A tibble: 547 × 2
## word n
## <chr> <int>
## 1 cocoa 419
## 2 sweet 318
## 3 nutty 278
## 4 fruit 273
## 5 roasty 228
## 6 mild 226
## 7 sour 208
## 8 earthy 199
## 9 creamy 189
## 10 intense 178
## # ℹ 537 more rows
tidy_chocolate %>%
group_by(word) %>%
summarise(n = n(),
rating = mean(rating)) %>%
ggplot(aes(n, rating)) +
geom_hline(yintercept = mean(chocolate$rating),
lty = 2, color = "gray50", size = 1.2) +
geom_point(color = "midnightblue", alpha = 0.7) +
geom_text(aes(label = word), family = "IBMPlexSans",
check_overlap = TRUE, vjust = "top", hjust = "left")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
scale_x_log10()
## <ScaleContinuousPosition>
## Range:
## Limits: 0 -- 1
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.1 ✔ tune 1.1.2
## ✔ infer 1.0.6 ✔ workflows 1.1.4
## ✔ modeldata 1.3.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.2.0 ✔ yardstick 1.3.0
## ✔ recipes 1.0.9
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
set.seed(123)
choco_split <- initial_split(chocolate, strata = rating)
choco_train <- training(choco_split)
choco_test <- testing(choco_split)
set.seed(234)
choco_folds <- vfold_cv(choco_train, strata = rating)
choco_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [1705/191]> Fold01
## 2 <split [1705/191]> Fold02
## 3 <split [1705/191]> Fold03
## 4 <split [1706/190]> Fold04
## 5 <split [1706/190]> Fold05
## 6 <split [1706/190]> Fold06
## 7 <split [1707/189]> Fold07
## 8 <split [1707/189]> Fold08
## 9 <split [1708/188]> Fold09
## 10 <split [1709/187]> Fold10
Preprocessing:
library(textrecipes)
choco_rec <-
recipe(rating ~ most_memorable_characteristics, data = choco_train) %>%
step_tokenize(most_memorable_characteristics) %>%
step_tokenfilter(most_memorable_characteristics, max_tokens = 100) %>%
step_tf(most_memorable_characteristics)
Let’s create a model specification
ranger_spec <-
rand_forest(trees = 500) %>%
set_mode("regression")
ranger_spec
## Random Forest Model Specification (regression)
##
## Main Arguments:
## trees = 500
##
## Computational engine: ranger
svm_spec <-
svm_linear() %>%
set_mode("regression")
svm_spec
## Linear Support Vector Machine Model Specification (regression)
##
## Computational engine: LiblineaR
ranger_wf <- workflow(choco_rec, ranger_spec)
svm_wf <- workflow(choco_rec, svm_spec)
doParallel::registerDoParallel
## function (cl, cores = NULL, ...)
## {
## opts <- list(...)
## optnames <- names(opts)
## if (is.null(optnames))
## optnames <- rep("", length(opts))
## unnamed <- !nzchar(optnames)
## if (any(unnamed)) {
## warning("ignoring doParallel package option(s) specified with unnamed argument")
## opts <- opts[!unnamed]
## optnames <- optnames[!unnamed]
## }
## recog <- optnames %in% c("nocompile")
## if (any(!recog)) {
## warning(sprintf("ignoring unrecognized doParallel package option(s): %s",
## paste(optnames[!recog], collapse = ", ")), call. = FALSE)
## opts <- opts[recog]
## optnames <- optnames[recog]
## }
## old.optnames <- ls(.options, all.names = TRUE)
## rm(list = old.optnames, pos = .options)
## for (i in seq_along(opts)) {
## assign(optnames[i], opts[[i]], pos = .options)
## }
## if (missing(cl) || is.numeric(cl)) {
## if (.Platform$OS.type == "windows") {
## if (!missing(cl) && is.numeric(cl)) {
## cl <- makeCluster(cl)
## }
## else {
## if (!missing(cores) && is.numeric(cores)) {
## cl <- makeCluster(cores)
## }
## else {
## cl <- makeCluster(3)
## }
## }
## assign(".revoDoParCluster", cl, pos = .options)
## reg.finalizer(.options, function(e) {
## stopImplicitCluster()
## }, onexit = TRUE)
## setDoPar(doParallelSNOW, cl, snowinfo)
## }
## else {
## if (!missing(cl) && is.numeric(cl)) {
## cores <- cl
## }
## setDoPar(doParallelMC, cores, mcinfo)
## }
## }
## else {
## setDoPar(doParallelSNOW, cl, snowinfo)
## }
## }
## <bytecode: 0x0000027566014468>
## <environment: namespace:doParallel>
contrl_preds <- control_resamples(save_pred = TRUE)
svm_rs <- fit_resamples(
svm_wf,
resamples = choco_folds,
control = contrl_preds
)
ranger_rs <- fit_resamples(
ranger_wf,
resamples = choco_folds,
control = contrl_preds
)
collect_metrics(svm_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 0.348 10 0.00704 Preprocessor1_Model1
## 2 rsq standard 0.365 10 0.0146 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 0.344 10 0.00726 Preprocessor1_Model1
## 2 rsq standard 0.379 10 0.0152 Preprocessor1_Model1
bind_rows(
collect_predictions(svm_rs) %>%
mutate(mod = "SVM"),
collect_predictions(ranger_rs) %>%
mutate(mod = "ranger")
) %>%
ggplot(aes(rating, .pred, color = id)) +
geom_abline(lty = 2, color = "gray50", size = 1.2) +
geom_jitter(width = 0.5, alpha = 0.5) +
facet_wrap(vars(mod)) +
coord_fixed()
final_fitted <- last_fit(svm_wf, choco_split)
collect_metrics(final_fitted) ## metrics evaluated on the *testing* data
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.385 Preprocessor1_Model1
## 2 rsq standard 0.340 Preprocessor1_Model1
final_wf <- extract_workflow(final_fitted)
predict(final_wf, choco_test[55, ])
## # A tibble: 1 × 1
## .pred
## <dbl>
## 1 3.70
extract_workflow(final_fitted) %>%
tidy() %>%
filter(term != "Bias") %>%
group_by(estimate > 0) %>%
slice_max(abs(estimate), n = 10) %>%
ungroup() %>%
mutate(term = str_remove(term, "tf_most_memorable_characteristics_")) %>%
ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
geom_col(alpha = 0.8)