Import Data

# Download the TidyTuesday (2022-01-18) chocolate ratings CSV into a tibble.
chocolate <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv"
)

## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Print a per-column overview of the imported data: column types, missing
# counts, and summary statistics for the character and numeric columns.
skimr::skim(chocolate)

Data summary
Name	chocolate
Number of rows	2530
Number of columns	10
_______________________
Column type frequency:
character	7
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
company_manufacturer	0	1.00	2	39	580
company_location	0	1.00	4	21	67
country_of_bean_origin	0	1.00	4	21	62
specific_bean_origin_or_bar_name	0	1.00	3	51	1605
cocoa_percent	0	1.00	3	6	46
ingredients	87	0.97	4	14	21
most_memorable_characteristics	0	1.00	3	37	2487

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
ref	1	1429.80	757.65	5	802	1454.00	2079.0	2712	▆▇▇▇▇
review_date	1	2014.37	3.97	2006	2012	2015.00	2018.0	2021	▃▅▇▆▅
rating	1	3.20	0.45	1	3	3.25	3.5	4	▁▁▅▇▇

# Build the modelling table from the raw chocolate reviews.
# The intermediate steps live inside local() so only `data` is created.
data <- local({
  # One row per flavour token instead of one row per review.
  # NOTE(review): the default `sep` is used here - confirm that splitting
  # multi-word notes into single words is intended.
  exploded <- separate_rows(chocolate, most_memorable_characteristics)

  # Treat missing values: drop the bar-name column, then discard every row
  # that still contains an NA.
  without_bar_name <- select(exploded, -specific_bean_origin_or_bar_name)
  complete_rows <- na.omit(without_bar_name)

  # Put the outcome on the natural-log scale.
  # NOTE(review): a log transform is usually applied to right-skewed
  # variables; the skim histogram shows ratings concentrated at the high
  # end of the range - confirm this transform is wanted.
  mutate(complete_rows, rating = log(rating))
})

# Step 1: Prepare data
# Drop the review year, then convert every remaining column into 0/1
# indicator columns (binned numerics, one column per categorical level).
data_binarized_tbl <- binarize(select(data, -review_date))

glimpse(data_binarized_tbl)

## Rows: 8,403
## Columns: 103
## $ `ref__-Inf_833`                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ref__833_1482                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ref__1482_2122                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ref__2122_Inf                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_manufacturer__A._Morin             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Arete                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Bonnat               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Fresco               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Pralus               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Soma                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_manufacturer__-OTHER`             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Australia                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Austria                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Belgium                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Brazil                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Canada                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Colombia                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Denmark                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Ecuador                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__France                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Germany                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Italy                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__New_Zealand              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Spain                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Switzerland              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.K.                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.S.A.                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Venezuela                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_location__-OTHER`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Belize             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Blend              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Bolivia            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Brazil             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Colombia           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Costa_Rica         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Dominican_Republic <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ country_of_bean_origin__Ecuador            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ghana              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Guatemala          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Haiti              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__India              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Madagascar         <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ country_of_bean_origin__Mexico             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Nicaragua          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Papua_New_Guinea   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Peru               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Tanzania           <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Trinidad           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__U.S.A.             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Venezuela          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Vietnam            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country_of_bean_origin__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__60%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__64%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__65%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__66%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__67%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__68%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__70%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__71%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__72%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__73%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__74%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__75%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__76%`                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `cocoa_percent__77%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__80%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__85%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__-OTHER`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S*`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__3-_B,S,C`                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `ingredients__4-_B,S,C,L`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__4-_B,S,C,V`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__5-_B,S,C,V,L`                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__bitter     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__cocoa      <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, …
## $ most_memorable_characteristics__coffee     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__creamy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__earthy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fatty      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__floral     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fruit      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__intense    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__mild       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__molasses   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__nutty      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__off        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__rich       <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__roasty     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sandy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sour       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__spice      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__spicy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sweet      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__vanilla    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__woody      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `most_memorable_characteristics__-OTHER`   <dbl> 0, 0, 0, 1, 0, 1, 1, 0, 1, …
## $ `rating__-Inf_1.09861228866811`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.09861228866811_1.17865499634165  <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ rating__1.17865499634165_1.25276296849537  <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ rating__1.25276296849537_Inf               <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, …

# Step 2: Correlate
# Correlate every indicator column with the highest (log) rating bin.
# NOTE(review): the target column name embeds a computed bin edge, so it has
# to be updated by hand whenever the upstream data or binning changes.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  rating__1.25276296849537_Inf
)

data_corr_tbl

## # A tibble: 103 × 3
##    feature                        bin                               correlation
##    <fct>                          <chr>                                   <dbl>
##  1 rating                         1.25276296849537_Inf                   1     
##  2 rating                         -Inf_1.09861228866811                 -0.386 
##  3 rating                         1.17865499634165_1.25276296849537     -0.239 
##  4 rating                         1.09861228866811_1.17865499634165     -0.213 
##  5 company_manufacturer           -OTHER                                -0.144 
##  6 company_manufacturer           Soma                                   0.122 
##  7 most_memorable_characteristics creamy                                 0.110 
##  8 company_manufacturer           Bonnat                                 0.0941
##  9 cocoa_percent                  67%                                    0.0798
## 10 ingredients                    -OTHER                                -0.0640
## # ℹ 93 more rows

# Step 3: Plot
# Draw the correlation funnel from the feature/bin correlations in
# data_corr_tbl.
plot_correlation_funnel(data_corr_tbl)

## Warning: ggrepel: 92 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Model

Split Data

#data <- sample_n(data, 100)


# Hold out a test set; rsample's default 3/4 training proportion is spelled
# out here for readability.
# NOTE(review): `data` has one row per flavour token, so rows derived from the
# same original review can land on both sides of this split - consider a
# grouped split (e.g. on `ref`) if that leakage matters.
set.seed(1234)
data_split <- rsample::initial_split(data, prop = 3 / 4)
data_test <- testing(data_split)
data_train <- training(data_split)

# Resample the training set with 10-fold cross-validation for tuning
set.seed(12345)
data_cv <- rsample::vfold_cv(data_train, v = 10)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5671/631]> Fold01
##  2 <split [5671/631]> Fold02
##  3 <split [5672/630]> Fold03
##  4 <split [5672/630]> Fold04
##  5 <split [5672/630]> Fold05
##  6 <split [5672/630]> Fold06
##  7 <split [5672/630]> Fold07
##  8 <split [5672/630]> Fold08
##  9 <split [5672/630]> Fold09
## 10 <split [5672/630]> Fold10

# Print a starter xgboost recipe / model spec / workflow template for this
# formula and training set. The template is only printed, not executed; the
# code further down adapts it by hand.
library(usemodels)
usemodels::use_xgboost(rating ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = rating ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(98210)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Preprocessing recipe for the xgboost model:
#   * `ref` stays in the data but gets the "id" role instead of predictor
#   * levels below a 2% share in four categorical columns are pooled
#   * every nominal predictor is one-hot encoded
# NOTE(review): `cocoa_percent` and `ingredients` are not pooled, which is the
# likely source of the "new levels in a factor" warnings during resampling -
# consider step_novel() or parsing cocoa_percent as a number.
xgboost_recipe <-
  recipe(formula = rating ~ ., data = data_train) %>%
  recipes::update_role(ref, new_role = "id") %>%
  step_other(
    company_manufacturer,
    company_location,
    country_of_bean_origin,
    most_memorable_characteristics,
    threshold = 0.02
  ) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE)

# Preview the processed training data
glimpse(juice(prep(xgboost_recipe)))

## Rows: 6,302
## Columns: 108
## $ ref                                       <dbl> 1796, 2190, 967, 2182, 1828,…
## $ review_date                               <dbl> 2016, 2018, 2012, 2018, 2016…
## $ rating                                    <dbl> 1.0116009, 1.1786550, 1.2527…
## $ company_manufacturer_Soma                 <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0…
## $ company_manufacturer_other                <dbl> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1…
## $ company_location_Australia                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Belgium                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Canada                   <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0…
## $ company_location_Ecuador                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_France                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Italy                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_U.K.                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_U.S.A.                   <dbl> 0, 1, 0, 1, 0, 0, 1, 0, 1, 0…
## $ company_location_other                    <dbl> 1, 0, 0, 0, 0, 1, 0, 1, 0, 1…
## $ country_of_bean_origin_Belize             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Blend              <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Bolivia            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Brazil             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Colombia           <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ country_of_bean_origin_Dominican.Republic <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Ecuador            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ country_of_bean_origin_Guatemala          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Madagascar         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Mexico             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Nicaragua          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Papua.New.Guinea   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Peru               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Tanzania           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Venezuela          <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 1…
## $ country_of_bean_origin_Vietnam            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_other              <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
## $ cocoa_percent_X100.                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X42.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X46.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X50.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X53.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X55.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X56.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X57.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X58.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X60.5.                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X60.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X61.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X62.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X63.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X64.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X65.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X66.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X67.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X68.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ cocoa_percent_X69.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X70.                        <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 0…
## $ cocoa_percent_X71.50.                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X71.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X72.5.                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X72.                        <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ cocoa_percent_X73.5.                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X73.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X74.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X75.                        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ cocoa_percent_X76.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ cocoa_percent_X77.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X78.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X79.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X80.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X81.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X82.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X83.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X84.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X85.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X86.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X87.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X88.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X89.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X90.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X91.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X99.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X1..B                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X2..B.C                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X2..B.S                       <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 1, 1…
## $ ingredients_X2..B.S.                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S.C                     <dbl> 1, 0, 1, 0, 1, 1, 1, 0, 0, 0…
## $ ingredients_X3..B.S.L                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S.V                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S..C                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S..Sa                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.C.L                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.C.Sa                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.C.V                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.V.L                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..C.L                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..C.Sa                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..C.V                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..V.L                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X5..B.S.C.L.Sa                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X5..B.S.C.V.L                 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ ingredients_X5.B.S.C.V.Sa                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X6.B.S.C.V.L.Sa               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_cocoa      <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_creamy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_earthy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_fruit      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_mild       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_nutty      <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0…
## $ most_memorable_characteristics_roasty     <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_sour       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_sweet      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_other      <dbl> 1, 0, 1, 0, 0, 1, 1, 0, 1, 1…

# Boosted-tree regression on the xgboost engine; the number of trees is the
# only hyperparameter left for tuning.
xgboost_spec <-
  boost_tree(trees = tune()) %>%
  set_mode(mode = "regression") %>%
  set_engine(engine = "xgboost")

# Bundle the model spec with the preprocessing recipe
xgboost_workflow <-
  workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_recipe)

# Evaluate 5 candidate values of `trees` on the cross-validation folds
set.seed(50194)
xgboost_tune <- tune_grid(
  object = xgboost_workflow,
  resamples = data_cv,
  grid = 5
)

## → A | warning: ! There are new levels in a factor: 53%

## 
There were issues with some computations   A: x1

                                                 
→ B | warning: ! There are new levels in a factor: 60.5%
## There were issues with some computations   A: x1

There were issues with some computations   A: x1   B: x1

                                                         
→ C | warning: ! There are new levels in a factor: 99%
## There were issues with some computations   A: x1   B: x1

There were issues with some computations   A: x1   B: x1   C: x1

                                                                 
→ D | warning: ! There are new levels in a factor: 87%
## There were issues with some computations   A: x1   B: x1   C: x1

There were issues with some computations   A: x1   B: x1   C: x1   D: x1

There were issues with some computations   A: x1   B: x1   C: x1   D: x1

Evaluate Models

# Rank the tuned candidates by cross-validated RMSE
xgboost_tune %>%
  tune::show_best(metric = "rmse")

## # A tibble: 5 × 7
##   trees .metric .estimator   mean     n std_err .config             
##   <int> <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1  1294 rmse    standard   0.0803    10 0.00172 Preprocessor1_Model4
## 2  1818 rmse    standard   0.0803    10 0.00172 Preprocessor1_Model5
## 3  1175 rmse    standard   0.0804    10 0.00172 Preprocessor1_Model3
## 4   594 rmse    standard   0.0817    10 0.00164 Preprocessor1_Model2
## 5    65 rmse    standard   0.108     10 0.00101 Preprocessor1_Model1

# Plug the hyperparameters with the lowest cross-validated RMSE into the
# workflow
xgboost_fw <- tune::finalize_workflow(
  x = xgboost_workflow,
  parameters = tune::select_best(xgboost_tune, metric = "rmse")
)

# Fit the finalized workflow on the entire training data and evaluate it on
# the held-out test data
data_fit <- xgboost_fw %>%
  tune::last_fit(split = data_split)

data_fit %>%
  tune::collect_metrics()

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      0.0715 Preprocessor1_Model1
## 2 rsq     standard      0.756  Preprocessor1_Model1

# Plot test-set predictions against the observed (log) ratings. The dashed
# identity line marks where a perfect prediction would fall.
tune::collect_predictions(data_fit) %>%
  ggplot(aes(rating, .pred)) +
  # Use `color`, not `fill`: the default point shape has no fill, so a `fill`
  # setting is silently ignored and the points are drawn in black.
  geom_point(alpha = 0.3, color = "midnightblue") +
  geom_abline(lty = 2, color = "gray50") +
  coord_fixed()

Conclusion

After looking over the data set and trying a few possibilities, I ended up removing the code that separated rows for specific bean origin. I didn’t see a reason to separate these rows, and removing it brought down the number of rows in my data set by over ten thousand. Unfortunately, this change made the model slightly worse: RMSE increased from .0487 to .0715 and RSQ decreased from .871 to .756. After comparing the results from both data sets, my first chart looked better.

Apply 4

Spencer Murrin

2024-02-29

Import Data

Build Model

Evaluate Models

Conclusion