CA2: Anton Jellvik

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org

library(tidyverse)

url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv"
chocolate <- read_csv(url)

## Rows: 2530 Columns: 10

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

chocolate %>%
  ggplot(aes(rating)) +
  geom_histogram(bins = 15)

Data Exploration

chocolate %>%
  ggplot(aes(rating)) +
  geom_histogram(bins = 15)

library(tidytext)

tidy_chocolate <-
  chocolate %>%
  unnest_tokens(word, most_memorable_characteristics)

tidy_chocolate %>%
  count(word, sort = TRUE)

## # A tibble: 547 × 2
##    word        n
##    <chr>   <int>
##  1 cocoa     419
##  2 sweet     318
##  3 nutty     278
##  4 fruit     273
##  5 roasty    228
##  6 mild      226
##  7 sour      208
##  8 earthy    199
##  9 creamy    189
## 10 intense   178
## # ℹ 537 more rows

tidy_chocolate %>%
  group_by(word) %>%
  summarise(n = n(),
            rating = mean(rating)) %>%
  ggplot(aes(n, rating)) +
  geom_hline(yintercept = mean(chocolate$rating),
             lty = 2, color = "gray50", size = 1.5) +
  geom_point(color = "midnightblue", alpha = 0.7) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = "top", hjust = "left") +
  scale_x_log10()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Splitting up Data

library(tidymodels)
set.seed(123)
choco_split <- initial_split(chocolate, strata = rating)
choco_train <- training(choco_split)
choco_test <- testing(choco_split)

set.seed(234)
choco_folds <- vfold_cv(choco_train, strata = rating)
choco_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [1705/191]> Fold01
##  2 <split [1705/191]> Fold02
##  3 <split [1705/191]> Fold03
##  4 <split [1706/190]> Fold04
##  5 <split [1706/190]> Fold05
##  6 <split [1706/190]> Fold06
##  7 <split [1707/189]> Fold07
##  8 <split [1707/189]> Fold08
##  9 <split [1708/188]> Fold09
## 10 <split [1709/187]> Fold10

Preprocessing

library(textrecipes)

choco_rec <-
  recipe(rating ~ most_memorable_characteristics, data = choco_train) %>%
  step_tokenize(most_memorable_characteristics) %>%
  step_tokenfilter(most_memorable_characteristics, max_tokens = 100) %>%
  step_tf(most_memorable_characteristics)

Model Specification

rf_spec <-
  rand_forest(trees = 500) %>%
  set_mode("regression")

rf_spec

## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   trees = 500
## 
## Computational engine: ranger

svm_spec <-
  svm_linear() %>%
  set_mode("regression")

svm_spec

## Linear Support Vector Machine Model Specification (regression)
## 
## Computational engine: LiblineaR

Model Workflow

svm_wf <- workflow(choco_rec, svm_spec)
rf_wf <- workflow(choco_rec, rf_spec)

Evaluate models

doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)

svm_rs <- fit_resamples(
  svm_wf,
  resamples = choco_folds,
  control = contrl_preds
)

ranger_rs <- fit_resamples(
  rf_wf,
  resamples = choco_folds,
  control = contrl_preds
)

Model comparison

collect_metrics(svm_rs)

## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   0.348    10 0.00704 Preprocessor1_Model1
## 2 rsq     standard   0.365    10 0.0146  Preprocessor1_Model1

collect_metrics(ranger_rs)

## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   0.345    10 0.00726 Preprocessor1_Model1
## 2 rsq     standard   0.378    10 0.0151  Preprocessor1_Model1

Vizualize the results

bind_rows(
  collect_predictions(svm_rs) %>%
    mutate(mod = "SVM"),
  collect_predictions(ranger_rs) %>%
    mutate(mod = "ranger")
) %>%
  ggplot(aes(rating, .pred, color = id)) +
  geom_abline(lty = 2, color = "gray50", size = 1.2) +
  geom_jitter(width = 0.5, alpha = 0.5) +
  facet_wrap(vars(mod)) +
  coord_fixed()

final_fitted <- last_fit(svm_wf, choco_split)
collect_metrics(final_fitted) ## metrics evaluated on the *testing* data

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.385 Preprocessor1_Model1
## 2 rsq     standard       0.340 Preprocessor1_Model1

This object contains a fitted workflow that we can use for prediction.

final_wf <- extract_workflow(final_fitted)
predict(final_wf, choco_test[55, ])

## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1  3.70

extract_workflow(final_fitted) %>%
  tidy() %>%
  filter(term != "Bias") %>%
  group_by(estimate > 0) %>%
  slice_max(abs(estimate), n = 10) %>%
  ungroup() %>%
  mutate(term = str_remove(term, "tf_most_memorable_characteristics_")) %>%
  ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
  geom_col(alpha = 0.8) +
  scale_fill_discrete(labels = c("low ratings", "high ratings")) +
  labs(y = NULL, fill = "More from...")

Questions

The question we are trying to answer using the chocolate dataset is if we can predict the rating for different chocolates based on the specific words used and we also visualized in order which words were most likely to recive the best ratings based on our dataset.

The dataset is relativley small and is made up of 10 columns and 2530 entries, the columns are things such as company loaction, bean origin country, rating, most memorable characteristics among others.

The variables we used in the analysis were both numerical and characters, we used the ratings for the chocolate in combination with the specific words used to describe the chocolate and based on that we tried to predict ruture chocolate ratings.

The data we used from the “most_memorable_characteristics” column was text and therefore we tokenized that data in order for it to be useful for the ML algorithm.We also sorted the data by term frequency after we separated all the words from the same review so each word was its own separate string.
We performed tokenization which is where we split the “most_memorable_characteristics” column into separate words like I mentioned above. We also filted the tokens based on the number of times a word was used and set the minimum to 100 in order to be able to get useful data from them.

The models used in the analysis were support vector machine(SVM) and a Random Forest model.

To evaluate the models we used the rmse and rsq toevaluate their performance. Based on the results we got this algorightm did not perform very well in predicting the ratings of chocolate.
The analysis delves into the relationship between chocolate ratings and their defining characteristics. Using visuals and tokenization, common descriptors from reviews are identified, and their correlation with ratings is highlighted. Two models, SVM and Random Forest, predict these ratings, with performance metrics indicating their accuracy. Notably, specific descriptors that significantly influence ratings, either positively or negatively, are discerned, offering a deeper understanding of what determines a chocolate’s rating.