Import Data

chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv')

## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

skimr::skim(chocolate)

Data summary
Name	chocolate
Number of rows	2530
Number of columns	10
_______________________
Column type frequency:
character	7
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
company_manufacturer	0	1.00	2	39	580
company_location	0	1.00	4	21	67
country_of_bean_origin	0	1.00	4	21	62
specific_bean_origin_or_bar_name	0	1.00	3	51	1605
cocoa_percent	0	1.00	3	6	46
ingredients	87	0.97	4	14	21
most_memorable_characteristics	0	1.00	3	37	2487

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
ref	1	1429.80	757.65	5	802	1454.00	2079.0	2712	▆▇▇▇▇
review_date	1	2014.37	3.97	2006	2012	2015.00	2018.0	2021	▃▅▇▆▅
rating	1	3.20	0.45	1	3	3.25	3.5	4	▁▁▅▇▇

data <- chocolate %>%
  
  separate_rows(most_memorable_characteristics) %>% 
  
  separate_rows(specific_bean_origin_or_bar_name) %>%
  
  # Treat missing values 
  select(-specific_bean_origin_or_bar_name) %>%
  na.omit() %>%
  
  # Log Transform Variables with pos-skewed Distribution
  mutate(rating = log(rating))

# Step 1: Prepare data 
data_binarized_tbl <- data %>%
  select(-review_date) %>%
  binarize()

data_binarized_tbl %>% glimpse()

## Rows: 23,520
## Columns: 99
## $ `ref__-Inf_947`                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ref__947_1590                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ref__1590_2274                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ref__2274_Inf                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_manufacturer__Bonnat               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Castronovo           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Dandelion            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Fresco               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Soma                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_manufacturer__-OTHER`             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Australia                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Austria                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Belgium                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Brazil                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Canada                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Denmark                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Ecuador                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__France                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Germany                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Italy                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__New_Zealand              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Switzerland              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.K.                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.S.A.                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Venezuela                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_location__-OTHER`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Belize             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Blend              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Bolivia            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Brazil             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Colombia           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Costa_Rica         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Dominican_Republic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ecuador            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ghana              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Guatemala          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__India              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Jamaica            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Madagascar         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Mexico             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Nicaragua          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Papua_New_Guinea   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Peru               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Philippines        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Tanzania           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ country_of_bean_origin__Trinidad           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__U.S.A.             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Venezuela          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Vietnam            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country_of_bean_origin__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__60%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__64%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__65%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__67%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__68%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__70%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__71%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__72%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__73%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__74%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__75%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__76%`                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `cocoa_percent__77%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__80%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__85%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__-OTHER`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S*`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__3-_B,S,C`                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `ingredients__4-_B,S,C,L`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__4-_B,S,C,V`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__5-_B,S,C,V,L`                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__bitter     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__cocoa      <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, …
## $ most_memorable_characteristics__creamy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__dried      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__earthy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fatty      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ most_memorable_characteristics__floral     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fruit      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__intense    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__mild       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__molasses   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__nutty      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__off        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__rich       <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__roasty     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sandy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sour       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__spicy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sweet      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__tart       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__woody      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `most_memorable_characteristics__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `rating__-Inf_1.09861228866811`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.09861228866811_1.17865499634165  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rating__1.17865499634165_1.25276296849537  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.25276296849537_Inf               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …

# Step 2: Correlate 
data_corr_tbl <- data_binarized_tbl %>%
  correlate(rating__1.25276296849537_Inf)

data_corr_tbl

## # A tibble: 99 × 3
##    feature                        bin                               correlation
##    <fct>                          <chr>                                   <dbl>
##  1 rating                         1.25276296849537_Inf                   1     
##  2 rating                         -Inf_1.09861228866811                 -0.384 
##  3 rating                         1.17865499634165_1.25276296849537     -0.270 
##  4 rating                         1.09861228866811_1.17865499634165     -0.237 
##  5 company_manufacturer           -OTHER                                -0.153 
##  6 company_manufacturer           Soma                                   0.149 
##  7 most_memorable_characteristics creamy                                 0.104 
##  8 company_manufacturer           Bonnat                                 0.0903
##  9 cocoa_percent                  67%                                    0.0897
## 10 company_location               Canada                                 0.0808
## # ℹ 89 more rows

# Step 3: Plot 
data_corr_tbl %>%
  plot_correlation_funnel()

## Warning: ggrepel: 85 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Model

Split Data

# data <- sample_n(data, 100)


# Split into train and test data set 
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further split training data set for cross-validation 
set.seed(12345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [15876/1764]> Fold01
##  2 <split [15876/1764]> Fold02
##  3 <split [15876/1764]> Fold03
##  4 <split [15876/1764]> Fold04
##  5 <split [15876/1764]> Fold05
##  6 <split [15876/1764]> Fold06
##  7 <split [15876/1764]> Fold07
##  8 <split [15876/1764]> Fold08
##  9 <split [15876/1764]> Fold09
## 10 <split [15876/1764]> Fold10

library(usemodels)
usemodels::use_xgboost(rating ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = rating ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(6405)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

xgboost_recipe <- 
  recipe(formula = rating ~ ., data = data_train) %>% 
  recipes::update_role(ref, new_role = "id") %>%
  step_other(company_manufacturer, company_location, country_of_bean_origin, most_memorable_characteristics, threshold = 0.02) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE)
  

xgboost_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 17,640
## Columns: 111
## $ ref                                       <dbl> 1275, 2318, 439, 2044, 2126,…
## $ review_date                               <dbl> 2014, 2019, 2009, 2018, 2018…
## $ rating                                    <dbl> 1.3217558, 1.1786550, 1.0986…
## $ company_manufacturer_Fresco               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_manufacturer_Soma                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_manufacturer_other                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ company_location_Australia                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Belgium                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Canada                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_France                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Italy                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Switzerland              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_U.K.                     <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ company_location_U.S.A.                   <dbl> 1, 0, 0, 0, 0, 1, 1, 0, 1, 1…
## $ company_location_other                    <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Belize             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ country_of_bean_origin_Blend              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Bolivia            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Brazil             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Colombia           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Dominican.Republic <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ country_of_bean_origin_Ecuador            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Guatemala          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Madagascar         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Mexico             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Nicaragua          <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Papua.New.Guinea   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Peru               <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ country_of_bean_origin_Tanzania           <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_U.S.A.             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Venezuela          <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Vietnam            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_other              <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1…
## $ cocoa_percent_X100.                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X42.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X46.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X50.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X53.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X55.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X56.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X57.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X58.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X60.5.                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X60.                        <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
## $ cocoa_percent_X61.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X62.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X63.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X64.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X65.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X66.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X67.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X68.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X69.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X70.                        <dbl> 0, 1, 1, 0, 1, 0, 1, 1, 0, 1…
## $ cocoa_percent_X71.50.                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X71.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X72.5.                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X72.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X73.5.                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X73.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X74.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X75.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ cocoa_percent_X76.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X77.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X78.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X79.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X80.                        <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X81.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X82.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X83.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X84.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X85.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X86.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X87.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X88.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X89.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X90.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X91.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X99.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X1..B                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X2..B.C                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X2..B.S                       <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 1, 1…
## $ ingredients_X2..B.S.                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S.C                     <dbl> 1, 0, 0, 0, 1, 1, 1, 0, 0, 0…
## $ ingredients_X3..B.S.L                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S.V                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S..C                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S..Sa                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.C.L                   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.C.Sa                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.C.V                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.V.L                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..C.L                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..C.Sa                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..C.V                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..V.L                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X5..B.S.C.L.Sa                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X5..B.S.C.V.L                 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X5.B.S.C.V.Sa                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X6.B.S.C.V.L.Sa               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_cocoa      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_creamy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_earthy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_floral     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_fruit      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_mild       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_nutty      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_roasty     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_sour       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_sweet      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_other      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…

xgboost_spec <- 
  boost_tree(trees = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

set.seed(50194)
xgboost_tune <-
  tune_grid(xgboost_workflow, resamples = data_cv, grid = 5)

Evaluate Models

tune::show_best(xgboost_tune, metric = "rmse")

## # A tibble: 5 × 7
##   trees .metric .estimator   mean     n  std_err .config             
##   <int> <chr>   <chr>       <dbl> <int>    <dbl> <chr>               
## 1  1818 rmse    standard   0.0501    10 0.000478 Preprocessor1_Model5
## 2  1294 rmse    standard   0.0504    10 0.000476 Preprocessor1_Model4
## 3  1175 rmse    standard   0.0507    10 0.000440 Preprocessor1_Model3
## 4   594 rmse    standard   0.0544    10 0.000458 Preprocessor1_Model2
## 5    65 rmse    standard   0.0884    10 0.000579 Preprocessor1_Model1

# Update the model by selecting the best hyper-parameters
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
                        tune::select_best(xgboost_tune, metric = "rmse"))

# Fit the model on the entire traing data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)

tune::collect_metrics(data_fit)

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      0.0487 Preprocessor1_Model1
## 2 rsq     standard      0.871  Preprocessor1_Model1

tune::collect_predictions(data_fit) %>%
  ggplot(aes(rating, .pred)) +
  geom_point(alpha = 0.3, fill = "midnightblue") +
  geom_abline(lty = 2, color = "gray50") +
  coord_fixed()

Apply 3

Spencer Murrin

2024-02-22

Import Data

Build Model

Evaluate Models