url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv"
chocolate <- read_csv(url)
## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

EDA Exploratory Data

chocolate %>%
    ggplot(aes(rating)) +
    geom_histogram(bins = 15)

library(tidytext)
tidy_chocolate <- chocolate %>%
    unnest_tokens(word, most_memorable_characteristics)

tidy_chocolate %>%
    count(word, sort = TRUE)
## # A tibble: 547 × 2
##    word        n
##    <chr>   <int>
##  1 cocoa     419
##  2 sweet     318
##  3 nutty     278
##  4 fruit     273
##  5 roasty    228
##  6 mild      226
##  7 sour      208
##  8 earthy    199
##  9 creamy    189
## 10 intense   178
## # ℹ 537 more rows
tidy_chocolate %>%
    group_by(word) %>%
    summarise(n = n(),
              rating = mean(rating)) %>%
    ggplot(aes(n, rating)) +
    geom_hline(yintercept = mean(chocolate$rating),
               lty = 2, color = "gray50", size = 1.2) +
    geom_point(color = "midnightblue", alpha = 0.7) +
    geom_text(aes(label = word),
              check_overlap = TRUE, vjust = "top", hjust = "left") +
    scale_x_log10()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Build Model

set.seed(123)
Choco_split <- initial_split(chocolate, strata = rating)
choco_train <- training(Choco_split)
choco_test <- testing(Choco_split)

set.seed(1122)
choco_folds <- vfold_cv(choco_train, strata = rating)
choco_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [1705/191]> Fold01
##  2 <split [1705/191]> Fold02
##  3 <split [1705/191]> Fold03
##  4 <split [1706/190]> Fold04
##  5 <split [1706/190]> Fold05
##  6 <split [1706/190]> Fold06
##  7 <split [1707/189]> Fold07
##  8 <split [1707/189]> Fold08
##  9 <split [1708/188]> Fold09
## 10 <split [1709/187]> Fold10

Preprocessing

library(textrecipes)
library(usemodels)

choco_rec <- recipe(rating ~ most_memorable_characteristics, data = choco_train) %>%
    step_tokenize(most_memorable_characteristics) %>%
    step_tokenfilter(most_memorable_characteristics, max_tokens = 100) %>%
    step_tf(most_memorable_characteristics)

Model Spoecification

ranger_spec <- rand_forest(trees = 500) %>%
    set_mode("regression")

ranger_spec
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   trees = 500
## 
## Computational engine: ranger
svm_spec <-
    svm_linear() %>%
    set_mode("regression")

svm_spec
## Linear Support Vector Machine Model Specification (regression)
## 
## Computational engine: LiblineaR
ranger_wf <- workflow(choco_rec, ranger_spec)
svm_wf <- workflow(choco_rec, svm_spec)
contrl_preds <- control_resamples(save_pred = TRUE)

svm_rs <- fit_resamples(
    svm_wf,
    resamples = choco_folds,
    control = contrl_preds
)
## Warning: package 'LiblineaR' was built under R version 4.3.3
ranger_rs <- fit_resamples(
    ranger_wf,
    resamples = choco_folds,
    control = contrl_preds
)
collect_metrics(svm_rs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   0.349    10 0.00515 Preprocessor1_Model1
## 2 rsq     standard   0.363    10 0.0144  Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   0.345    10 0.00599 Preprocessor1_Model1
## 2 rsq     standard   0.378    10 0.0175  Preprocessor1_Model1
# visualize

bind_rows(
    collect_predictions(svm_rs) %>%
        mutate(mod = "SVM"),
    collect_predictions(ranger_rs) %>%
        mutate(mod = "ranger")
) %>%
    ggplot(aes(rating, .pred, color = id)) +
    geom_abline(lty = 2, color = "gray50", size = 1.2) +
    geom_jitter(alpha = 0.5) +
    facet_wrap(vars(mod)) +
    coord_fixed()

final_fitted <- last_fit(svm_wf, Choco_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.385 Preprocessor1_Model1
## 2 rsq     standard       0.340 Preprocessor1_Model1
final_wf <- extract_workflow(final_fitted)
predict(final_wf, choco_test[55,])
## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1  3.70
predict(final_wf, choco_test[60,])
## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1  3.31
extract_workflow(final_fitted) %>%
    tidy() %>%
    filter() %>%
    group_by(estimate > 0) %>%
    slice_max(abs(estimate), n = 10) %>%
    ungroup() %>%
    mutate(term = str_remove(term, "tf_most_memorable_characteristics_")) %>%
    ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
    geom_col(alpha = 0.8)