Import Data

# Load the TidyTuesday (2022-01-18) chocolate ratings data from GitHub.
chocolate <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv"
)
## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarise every column of the raw data (missingness, number of distinct
# values, and distribution of the numeric columns).
skimr::skim(chocolate)
Data summary
Name chocolate
Number of rows 2530
Number of columns 10
_______________________
Column type frequency:
character 7
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
company_manufacturer 0 1.00 2 39 0 580 0
company_location 0 1.00 4 21 0 67 0
country_of_bean_origin 0 1.00 4 21 0 62 0
specific_bean_origin_or_bar_name 0 1.00 3 51 0 1605 0
cocoa_percent 0 1.00 3 6 0 46 0
ingredients 87 0.97 4 14 0 21 0
most_memorable_characteristics 0 1.00 3 37 0 2487 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ref 0 1 1429.80 757.65 5 802 1454.00 2079.0 2712 ▆▇▇▇▇
review_date 0 1 2014.37 3.97 2006 2012 2015.00 2018.0 2021 ▃▅▇▆▅
rating 0 1 3.20 0.45 1 3 3.25 3.5 4 ▁▁▅▇▇
# Build the modelling data set from the raw ratings:
#   * drop the free-text bar name,
#   * expand `most_memorable_characteristics` to one row per descriptor,
#   * remove rows that still contain a missing value,
#   * put `rating` on the natural-log scale.
# separate_rows() is called with its default separator, which splits on any
# run of characters other than letters, digits and ".", so multi-word
# descriptors are split into single words as well.
# NOTE(review): the skim histogram of `rating` has its mass at the high end
# (left skew), so a log transform may not be the right choice -- confirm.
data <- chocolate %>%
  select(-specific_bean_origin_or_bar_name) %>%
  separate_rows(most_memorable_characteristics) %>%
  na.omit() %>%
  mutate(rating = log(rating))
# Step 1: Prepare data
# Drop `review_date`, then binarize() the remaining columns into 0/1
# indicator columns: numeric columns are cut into bins, categorical columns
# get one column per retained level plus a pooled `-OTHER` column.
data_binarized_tbl <- binarize(select(data, -review_date))

glimpse(data_binarized_tbl)
## Rows: 8,403
## Columns: 103
## $ `ref__-Inf_833`                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ref__833_1482                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ref__1482_2122                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ref__2122_Inf                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_manufacturer__A._Morin             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Arete                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Bonnat               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Fresco               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Pralus               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Soma                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_manufacturer__-OTHER`             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Australia                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Austria                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Belgium                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Brazil                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Canada                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Colombia                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Denmark                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Ecuador                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__France                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Germany                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Italy                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__New_Zealand              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Spain                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Switzerland              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.K.                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.S.A.                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Venezuela                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_location__-OTHER`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Belize             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Blend              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Bolivia            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Brazil             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Colombia           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Costa_Rica         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Dominican_Republic <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ country_of_bean_origin__Ecuador            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ghana              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Guatemala          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Haiti              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__India              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Madagascar         <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ country_of_bean_origin__Mexico             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Nicaragua          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Papua_New_Guinea   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Peru               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Tanzania           <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Trinidad           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__U.S.A.             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Venezuela          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Vietnam            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country_of_bean_origin__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__60%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__64%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__65%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__66%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__67%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__68%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__70%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__71%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__72%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__73%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__74%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__75%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__76%`                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `cocoa_percent__77%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__80%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__85%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__-OTHER`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S*`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__3-_B,S,C`                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `ingredients__4-_B,S,C,L`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__4-_B,S,C,V`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__5-_B,S,C,V,L`                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__bitter     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__cocoa      <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, …
## $ most_memorable_characteristics__coffee     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__creamy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__earthy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fatty      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__floral     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fruit      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__intense    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__mild       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__molasses   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__nutty      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__off        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__rich       <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__roasty     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sandy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sour       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__spice      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__spicy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sweet      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__vanilla    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__woody      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `most_memorable_characteristics__-OTHER`   <dbl> 0, 0, 0, 1, 0, 1, 1, 0, 1, …
## $ `rating__-Inf_1.09861228866811`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.09861228866811_1.17865499634165  <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ rating__1.17865499634165_1.25276296849537  <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ rating__1.25276296849537_Inf               <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, …
# Step 2: Correlate
# Correlate every indicator column with the top `rating` bin.
# NOTE(review): the target column name embeds a bin boundary computed from
# the data, so it must be updated whenever the binning of `rating` changes.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  target = rating__1.25276296849537_Inf
)

data_corr_tbl
## # A tibble: 103 × 3
##    feature                        bin                               correlation
##    <fct>                          <chr>                                   <dbl>
##  1 rating                         1.25276296849537_Inf                   1     
##  2 rating                         -Inf_1.09861228866811                 -0.386 
##  3 rating                         1.17865499634165_1.25276296849537     -0.239 
##  4 rating                         1.09861228866811_1.17865499634165     -0.213 
##  5 company_manufacturer           -OTHER                                -0.144 
##  6 company_manufacturer           Soma                                   0.122 
##  7 most_memorable_characteristics creamy                                 0.110 
##  8 company_manufacturer           Bonnat                                 0.0941
##  9 cocoa_percent                  67%                                    0.0798
## 10 ingredients                    -OTHER                                -0.0640
## # ℹ 93 more rows
# Step 3: Plot
# Draw the correlation funnel for the top `rating` bin.
plot_correlation_funnel(data_corr_tbl)
## Warning: ggrepel: 92 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Model

Split Data

# To prototype quickly on a small subsample, uncomment:
# data <- sample_n(data, 100)

# Hold out a test set (rsample's default 3/4 train, 1/4 test).
# NOTE(review): `data` holds several rows per chocolate bar (one per
# descriptor) and rows are assigned to train/test individually, so rows of
# the same bar can land on both sides of the split -- consider a split
# grouped on `ref`.
set.seed(1234)
data_split <- rsample::initial_split(data, prop = 3 / 4)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)

# 10-fold cross-validation on the training portion, used for tuning.
set.seed(12345)
data_cv <- rsample::vfold_cv(data_train, v = 10)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5671/631]> Fold01
##  2 <split [5671/631]> Fold02
##  3 <split [5672/630]> Fold03
##  4 <split [5672/630]> Fold04
##  5 <split [5672/630]> Fold05
##  6 <split [5672/630]> Fold06
##  7 <split [5672/630]> Fold07
##  8 <split [5672/630]> Fold08
##  9 <split [5672/630]> Fold09
## 10 <split [5672/630]> Fold10
# Print usemodels' boilerplate xgboost code for this formula; the printed
# template is only a starting point for the hand-written steps that follow.
library(usemodels)
use_xgboost(formula = rating ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = rating ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(98210)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Preprocessing recipe for the xgboost model.
#   * `ref` is kept as an identifier and is not used as a predictor.
#   * Levels seen in fewer than 2% of training rows are pooled into "other"
#     for the four high-cardinality columns.
#   * `cocoa_percent` and `ingredients` are not pooled, so a level that shows
#     up only in an assessment/test set would be unknown to step_dummy()
#     ("There are new levels in a factor" warnings during tuning);
#     step_novel() routes such levels to a dedicated "new" level instead.
#   * All nominal predictors are one-hot encoded.
xgboost_recipe <-
  recipe(formula = rating ~ ., data = data_train) %>%
  recipes::update_role(ref, new_role = "id") %>%
  step_other(
    company_manufacturer,
    company_location,
    country_of_bean_origin,
    most_memorable_characteristics,
    threshold = 0.02
  ) %>%
  step_novel(cocoa_percent, ingredients) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE)

# Inspect the processed training data; bake(new_data = NULL) supersedes
# juice().
xgboost_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 6,302
## Columns: 108
## $ ref                                       <dbl> 1796, 2190, 967, 2182, 1828,…
## $ review_date                               <dbl> 2016, 2018, 2012, 2018, 2016…
## $ rating                                    <dbl> 1.0116009, 1.1786550, 1.2527…
## $ company_manufacturer_Soma                 <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0…
## $ company_manufacturer_other                <dbl> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1…
## $ company_location_Australia                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Belgium                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Canada                   <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0…
## $ company_location_Ecuador                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_France                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Italy                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_U.K.                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_U.S.A.                   <dbl> 0, 1, 0, 1, 0, 0, 1, 0, 1, 0…
## $ company_location_other                    <dbl> 1, 0, 0, 0, 0, 1, 0, 1, 0, 1…
## $ country_of_bean_origin_Belize             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Blend              <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Bolivia            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Brazil             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Colombia           <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ country_of_bean_origin_Dominican.Republic <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Ecuador            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ country_of_bean_origin_Guatemala          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Madagascar         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Mexico             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Nicaragua          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Papua.New.Guinea   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Peru               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Tanzania           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Venezuela          <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 1…
## $ country_of_bean_origin_Vietnam            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_other              <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
## $ cocoa_percent_X100.                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X42.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X46.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X50.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X53.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X55.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X56.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X57.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X58.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X60.5.                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X60.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X61.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X62.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X63.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X64.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X65.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X66.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X67.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X68.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ cocoa_percent_X69.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X70.                        <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 0…
## $ cocoa_percent_X71.50.                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X71.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X72.5.                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X72.                        <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ cocoa_percent_X73.5.                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X73.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X74.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X75.                        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ cocoa_percent_X76.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ cocoa_percent_X77.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X78.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X79.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X80.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X81.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X82.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X83.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X84.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X85.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X86.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X87.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X88.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X89.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X90.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X91.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X99.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X1..B                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X2..B.C                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X2..B.S                       <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 1, 1…
## $ ingredients_X2..B.S.                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S.C                     <dbl> 1, 0, 1, 0, 1, 1, 1, 0, 0, 0…
## $ ingredients_X3..B.S.L                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S.V                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S..C                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S..Sa                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.C.L                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.C.Sa                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.C.V                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.V.L                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..C.L                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..C.Sa                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..C.V                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..V.L                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X5..B.S.C.L.Sa                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X5..B.S.C.V.L                 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ ingredients_X5.B.S.C.V.Sa                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X6.B.S.C.V.L.Sa               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_cocoa      <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_creamy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_earthy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_fruit      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_mild       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_nutty      <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0…
## $ most_memorable_characteristics_roasty     <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_sour       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_sweet      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_other      <dbl> 1, 0, 1, 0, 0, 1, 1, 0, 1, 1…
# Boosted-tree regression model on the xgboost engine; only the number of
# trees is tuned.
xgboost_spec <-
  boost_tree(mode = "regression", trees = tune()) %>%
  set_engine("xgboost")

# Bundle the preprocessing recipe and the model specification.
xgboost_workflow <-
  workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_recipe)

# Evaluate 5 candidate values of `trees` on the cross-validation folds.
set.seed(50194)
xgboost_tune <- tune_grid(
  xgboost_workflow,
  resamples = data_cv,
  grid = 5
)
## → A | warning: ! There are new levels in a factor: 53%
## 
There were issues with some computations   A: x1

                                                 
→ B | warning: ! There are new levels in a factor: 60.5%
## There were issues with some computations   A: x1

There were issues with some computations   A: x1   B: x1

                                                         
→ C | warning: ! There are new levels in a factor: 99%
## There were issues with some computations   A: x1   B: x1

There were issues with some computations   A: x1   B: x1   C: x1

                                                                 
→ D | warning: ! There are new levels in a factor: 87%
## There were issues with some computations   A: x1   B: x1   C: x1

There were issues with some computations   A: x1   B: x1   C: x1   D: x1

There were issues with some computations   A: x1   B: x1   C: x1   D: x1

Evaluate Models

# Rank the tuning candidates by cross-validated RMSE (lowest first).
xgboost_tune %>%
  tune::show_best(metric = "rmse")
## # A tibble: 5 × 7
##   trees .metric .estimator   mean     n std_err .config             
##   <int> <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1  1294 rmse    standard   0.0803    10 0.00172 Preprocessor1_Model4
## 2  1818 rmse    standard   0.0803    10 0.00172 Preprocessor1_Model5
## 3  1175 rmse    standard   0.0804    10 0.00172 Preprocessor1_Model3
## 4   594 rmse    standard   0.0817    10 0.00164 Preprocessor1_Model2
## 5    65 rmse    standard   0.108     10 0.00101 Preprocessor1_Model1
# Plug the hyper-parameters with the lowest cross-validated RMSE into the
# workflow.
xgboost_fw <- xgboost_workflow %>%
  tune::finalize_workflow(tune::select_best(xgboost_tune, metric = "rmse"))

# Fit the finalized workflow on the entire training data and evaluate it on
# the held-out test data.
data_fit <- xgboost_fw %>%
  tune::last_fit(data_split)

data_fit %>%
  tune::collect_metrics()
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      0.0715 Preprocessor1_Model1
## 2 rsq     standard      0.756  Preprocessor1_Model1
# Predicted vs. observed (log) rating on the test set. Points on the dashed
# identity line are perfect predictions.
# `color` is used for the points: the default point shape has no fill, so
# `fill = "midnightblue"` had no visible effect.
tune::collect_predictions(data_fit) %>%
  ggplot(aes(rating, .pred)) +
  geom_point(alpha = 0.3, color = "midnightblue") +
  geom_abline(lty = 2, color = "gray50") +
  coord_fixed()

Conclusion

After looking over the data set and trying a few possibilities, I ended up removing the code that separated rows for specific bean origin. I didn’t see a reason to separate these rows, and removing that step reduced the number of rows in my data set by over ten thousand. Unfortunately, this change made the model fit worse: RMSE increased from .0487 to .0715, and RSQ decreased from .871 to .756. After comparing both versions, the chart from my first model also looked better.