Import Data

# Read the TidyTuesday chocolate ratings CSV directly from GitHub.
chocolate <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv"
)

## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Print a per-column summary of the raw data (missingness, unique counts,
# quantiles) to decide which columns need cleaning.
skimr::skim(chocolate)

Data summary
Name	chocolate
Number of rows	2530
Number of columns	10
_______________________
Column type frequency:
character	7
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
company_manufacturer	0	1.00	2	39	580
company_location	0	1.00	4	21	67
country_of_bean_origin	0	1.00	4	21	62
specific_bean_origin_or_bar_name	0	1.00	3	51	1605
cocoa_percent	0	1.00	3	6	46
ingredients	87	0.97	4	14	21
most_memorable_characteristics	0	1.00	3	37	2487

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
ref	1	1429.80	757.65	5	802	1454.00	2079.0	2712	▆▇▇▇▇
review_date	1	2014.37	3.97	2006	2012	2015.00	2018.0	2021	▃▅▇▆▅
rating	1	3.20	0.45	1	3	3.25	3.5	4	▁▁▅▇▇

# Clean the raw reviews: one row per (review, tasting-note word), unused
# columns dropped, incomplete rows removed, rating on the log scale.
data <- chocolate %>%

  # One row per word of the tasting notes. `separate_rows()` runs with its
  # default separator, which splits on every run of non-alphanumeric
  # characters, so a multi-word note ends up as one row per word.
  separate_rows(most_memorable_characteristics) %>%

  # Drop the columns that are not used as features. The bar name is removed
  # without being split first: splitting a column that is discarded right
  # afterwards only duplicated each review once per word of its name.
  select(-specific_bean_origin_or_bar_name, -ref) %>%

  # Treat missing values: keep complete rows only
  na.omit() %>%

  # Put the rating on the log scale.
  # NOTE(review): the skim histogram of `rating` has most of its mass at the
  # upper end, so it does not look positively skewed; confirm the log
  # transform is wanted. It is kept because the bin name used in the
  # correlation step is on the log scale.
  mutate(rating = log(rating))

# Step 1: Prepare data
# Drop the review year, then let binarize() turn every remaining column into
# 0/1 indicator columns (one per category or numeric bin).
data_binarized_tbl <- binarize(select(data, -review_date))

glimpse(data_binarized_tbl)

## Rows: 23,520
## Columns: 95
## $ company_manufacturer__Bonnat               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Castronovo           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Dandelion            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Fresco               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Soma                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_manufacturer__-OTHER`             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Australia                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Austria                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Belgium                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Brazil                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Canada                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Denmark                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Ecuador                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__France                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Germany                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Italy                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__New_Zealand              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Switzerland              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.K.                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.S.A.                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Venezuela                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_location__-OTHER`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Belize             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Blend              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Bolivia            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Brazil             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Colombia           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Costa_Rica         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Dominican_Republic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ecuador            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ghana              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Guatemala          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__India              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Jamaica            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Madagascar         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Mexico             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Nicaragua          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Papua_New_Guinea   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Peru               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Philippines        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Tanzania           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ country_of_bean_origin__Trinidad           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__U.S.A.             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Venezuela          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Vietnam            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country_of_bean_origin__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__60%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__64%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__65%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__67%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__68%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__70%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__71%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__72%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__73%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__74%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__75%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__76%`                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `cocoa_percent__77%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__80%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__85%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__-OTHER`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S*`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__3-_B,S,C`                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `ingredients__4-_B,S,C,L`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__4-_B,S,C,V`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__5-_B,S,C,V,L`                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__bitter     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__cocoa      <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, …
## $ most_memorable_characteristics__creamy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__dried      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__earthy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fatty      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ most_memorable_characteristics__floral     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fruit      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__intense    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__mild       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__molasses   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__nutty      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__off        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__rich       <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__roasty     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sandy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sour       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__spicy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sweet      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__tart       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__woody      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `most_memorable_characteristics__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `rating__-Inf_1.09861228866811`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.09861228866811_1.17865499634165  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rating__1.17865499634165_1.25276296849537  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.25276296849537_Inf               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …

# Step 2: Correlate
# Correlate every indicator column with the top rating bin (the target).
data_corr_tbl <- correlate(
  data_binarized_tbl,
  rating__1.25276296849537_Inf
)

data_corr_tbl

## # A tibble: 95 × 3
##    feature                        bin                               correlation
##    <fct>                          <chr>                                   <dbl>
##  1 rating                         1.25276296849537_Inf                   1     
##  2 rating                         -Inf_1.09861228866811                 -0.384 
##  3 rating                         1.17865499634165_1.25276296849537     -0.270 
##  4 rating                         1.09861228866811_1.17865499634165     -0.237 
##  5 company_manufacturer           -OTHER                                -0.153 
##  6 company_manufacturer           Soma                                   0.149 
##  7 most_memorable_characteristics creamy                                 0.104 
##  8 company_manufacturer           Bonnat                                 0.0903
##  9 cocoa_percent                  67%                                    0.0897
## 10 company_location               Canada                                 0.0808
## # ℹ 85 more rows

# Step 3: Plot
# Draw the correlation funnel for the correlation table.
plot_correlation_funnel(data_corr_tbl)

## Warning: ggrepel: 85 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Model

Split Data

# Work on a random subset of 100 rows (presumably to keep tuning fast).
# The RNG is seeded first so the same rows are drawn on every run; without
# it the subset, and every result below, changes between runs.
set.seed(1234)
data <- sample_n(data, 100)


# Split into train and test data set
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)

# Further split training data set for cross-validation
set.seed(12345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10

# Print a starter xgboost template (recipe, model spec, workflow, tuning call)
# for predicting `rating` from all other columns of the training set.
# The printed code is only a scaffold; the chunks below adapt it
# (e.g. they set the mode to "regression").
library(usemodels)
usemodels::use_xgboost(rating ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = rating ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(39943)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Preprocessing recipe for the xgboost model: `rating` is the outcome and
# every other column of the training set starts out as a predictor.
xgboost_recipe <-
  recipe(formula = rating ~ ., data = data_train) %>%
  # Keep the bean origin in the data but take it out of the predictors
  recipes::update_role(country_of_bean_origin, new_role = "id") %>%
  # Turn the tasting notes into tf-idf features (at most 100 tokens)
  step_tokenize(most_memorable_characteristics) %>%
  step_tokenfilter(most_memorable_characteristics, max_tokens = 100) %>%
  step_tfidf(most_memorable_characteristics) %>%
  # Pool infrequent manufacturers / locations into an "other" level
  step_other(company_manufacturer, company_location) %>%
  # Levels that only occur in new data (e.g. a cocoa percentage missing from
  # the analysis fold) get a dedicated "new" level, so step_dummy() no longer
  # warns about new factor levels during resampling
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_YeoJohnson(review_date) %>%
  # The "new" indicator columns are all zero in the training data; drop them
  # (and any other constant predictor) last so earlier steps still find
  # their columns
  step_zv(all_predictors())

# Preview the preprocessed training set
xgboost_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()

## Warning: max_tokens was set to '100', but only 53 was available and selected.

## Rows: 75
## Columns: 86
## $ review_date                                       <dbl> 2012, 2018, 2009, 20…
## $ country_of_bean_origin                            <fct> Dominican Republic, …
## $ rating                                            <dbl> 1.178655, 1.252763, …
## $ tfidf_most_memorable_characteristics_anise        <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_astringent   <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_balanced     <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_banana       <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_basic        <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_berry        <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_butterscotch <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_cherry       <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_chewy        <dbl> 4.330733, 0.000000, …
## $ tfidf_most_memorable_characteristics_classic      <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_coarse       <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_cocoa        <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_creamy       <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_dark         <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_delicate     <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_dried        <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_dry          <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_earthy       <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_fatty        <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_few          <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_floral       <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_fruit        <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_full         <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_gateway      <dbl> 0.000000, 4.330733, …
## $ tfidf_most_memorable_characteristics_grape        <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_grapes       <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_grassy       <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_gritty       <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_intense      <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_lasting      <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_lemon        <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_licoric      <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_melon        <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_mild         <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_milk         <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_nut          <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_nutty        <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_off          <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_olive        <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_orange       <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_pure         <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_red          <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_roasted      <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_rum          <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_sandy        <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_smokey       <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_sour         <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_spice        <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_sticky       <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_strong       <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_sweet        <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_very         <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_woody        <dbl> 0.000000, 0.000000, …
## $ company_manufacturer_Beau.Cacao                   <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ company_manufacturer_other                        <dbl> 1, 1, 1, 1, 1, 1, 1,…
## $ company_location_Belgium                          <dbl> 0, 0, 1, 0, 0, 0, 0,…
## $ company_location_France                           <dbl> 0, 0, 0, 0, 0, 0, 1,…
## $ company_location_U.K.                             <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ company_location_U.S.A.                           <dbl> 0, 0, 0, 1, 0, 0, 0,…
## $ company_location_other                            <dbl> 1, 1, 0, 0, 1, 1, 0,…
## $ cocoa_percent_X100.                               <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X50.                                <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X55.                                <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X60.                                <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X64.                                <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X65.                                <dbl> 0, 0, 0, 0, 0, 1, 0,…
## $ cocoa_percent_X67.                                <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X68.                                <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X70.                                <dbl> 1, 1, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X72.                                <dbl> 0, 0, 1, 0, 0, 0, 0,…
## $ cocoa_percent_X73.                                <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X74.                                <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X75.                                <dbl> 0, 0, 0, 1, 0, 0, 1,…
## $ cocoa_percent_X80.                                <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X81.                                <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X85.                                <dbl> 0, 0, 0, 0, 1, 0, 0,…
## $ ingredients_X1..B                                 <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ ingredients_X2..B.S                               <dbl> 1, 0, 0, 1, 0, 0, 0,…
## $ ingredients_X3..B.S.C                             <dbl> 0, 0, 0, 0, 0, 1, 0,…
## $ ingredients_X3..B.S..C                            <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ ingredients_X4..B.S.C.L                           <dbl> 0, 0, 0, 0, 0, 0, 1,…
## $ ingredients_X4..B.S.C.V                           <dbl> 0, 0, 0, 0, 1, 0, 0,…
## $ ingredients_X5..B.S.C.V.L                         <dbl> 0, 1, 1, 0, 0, 0, 0,…

# Boosted-tree model spec: every listed hyperparameter is marked with tune()
# so its value is chosen during tuning.
xgboost_spec <- boost_tree(
  trees = tune(),
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune(),
  sample_size = tune()
)
xgboost_spec <- set_mode(xgboost_spec, "regression")
xgboost_spec <- set_engine(xgboost_spec, "xgboost")

# Bundle the preprocessing recipe and the model spec into one workflow
xgboost_workflow <- workflow()
xgboost_workflow <- add_recipe(xgboost_workflow, xgboost_recipe)
xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)

# Tune over 5 candidate parameter sets on the cross-validation folds
set.seed(50194)
xgboost_tune <- tune_grid(
  xgboost_workflow,
  resamples = data_cv,
  grid = 5
)

## → A | warning: max_tokens was set to '100', but only 49 was available and selected.

## 
There were issues with some computations   A: x1

                                                 
→ B | warning: ! There are new levels in a factor: 80%, ! There are new levels in a factor: 3- B,S*,C
## There were issues with some computations   A: x1

There were issues with some computations   A: x1   B: x1

There were issues with some computations   A: x1   B: x2

There were issues with some computations   A: x1   B: x3

There were issues with some computations   A: x1   B: x4

                                                         
→ C | warning: A correlation computation is required, but `estimate` is constant and has 0
##                standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## There were issues with some computations   A: x1   B: x4

                                                         
→ D | warning: max_tokens was set to '100', but only 50 was available and selected.
## There were issues with some computations   A: x1   B: x4

There were issues with some computations   A: x1   B: x5   C: x1   D: x1

                                                                         
→ E | warning: ! There are new levels in a factor: 81% and 100%, ! There are new levels in a factor: 1- B
## There were issues with some computations   A: x1   B: x5   C: x1   D: x1

There were issues with some computations   A: x1   B: x5   C: x1   D: x1   E: x1

There were issues with some computations   A: x1   B: x5   C: x1   D: x1   E: x2

There were issues with some computations   A: x1   B: x5   C: x1   D: x1   E: x3

There were issues with some computations   A: x1   B: x5   C: x1   D: x1   E: x4

There were issues with some computations   A: x1   B: x5   C: x1   D: x1   E: x5

There were issues with some computations   A: x2   B: x5   C: x2   D: x1   E: x5

                                                                                 
→ F | warning: ! There are new levels in a factor: 64%
## There were issues with some computations   A: x2   B: x5   C: x2   D: x1   E: x5

There were issues with some computations   A: x2   B: x5   C: x2   D: x1   E: x…

There were issues with some computations   A: x2   B: x5   C: x2   D: x1   E: x…

There were issues with some computations   A: x2   B: x5   C: x2   D: x1   E: x…

There were issues with some computations   A: x2   B: x5   C: x2   D: x1   E: x…

There were issues with some computations   A: x2   B: x5   C: x3   D: x1   E: x…

                                                                                 
→ G | warning: max_tokens was set to '100', but only 47 was available and selected.
## There were issues with some computations   A: x2   B: x5   C: x3   D: x1   E: x…

There were issues with some computations   A: x2   B: x5   C: x4   D: x1   E: x…

There were issues with some computations   A: x3   B: x5   C: x4   D: x1   E: x…

                                                                                 
→ H | warning: ! There are new levels in a factor: 60% and 55%
## There were issues with some computations   A: x3   B: x5   C: x4   D: x1   E: x…

There were issues with some computations   A: x3   B: x5   C: x4   D: x1   E: x…

There were issues with some computations   A: x3   B: x5   C: x4   D: x1   E: x…

There were issues with some computations   A: x3   B: x5   C: x4   D: x1   E: x…

There were issues with some computations   A: x3   B: x5   C: x4   D: x1   E: x…

There were issues with some computations   A: x3   B: x5   C: x4   D: x1   E: x…

There were issues with some computations   A: x4   B: x5   C: x6   D: x1   E: x…

                                                                                 
→ I | warning: max_tokens was set to '100', but only 48 was available and selected.
## There were issues with some computations   A: x4   B: x5   C: x6   D: x1   E: x…

There were issues with some computations   A: x4   B: x5   C: x6   D: x1   E: x…

There were issues with some computations   A: x4   B: x5   C: x7   D: x1   E: x…

There were issues with some computations   A: x4   B: x5   C: x7   D: x1   E: x…

There were issues with some computations   A: x4   B: x5   C: x8   D: x1   E: x…

There were issues with some computations   A: x4   B: x5   C: x8   D: x1   E: x…

                                                                                 
→ J | warning: ! There are new levels in a factor: 50%
## There were issues with some computations   A: x4   B: x5   C: x8   D: x1   E: x…

There were issues with some computations   A: x4   B: x5   C: x8   D: x1   E: x…

There were issues with some computations   A: x4   B: x5   C: x8   D: x1   E: x…

There were issues with some computations   A: x4   B: x5   C: x8   D: x1   E: x…

There were issues with some computations   A: x4   B: x5   C: x8   D: x1   E: x…

There were issues with some computations   A: x4   B: x5   C: x8   D: x1   E: x…

                                                                                 
→ K | warning: max_tokens was set to '100', but only 52 was available and selected.
## There were issues with some computations   A: x4   B: x5   C: x8   D: x1   E: x…

There were issues with some computations   A: x4   B: x5   C: x10   D: x1   E: …

There were issues with some computations   A: x4   B: x5   C: x10   D: x1   E: …

Apply 2

Spencer Murrin

2024-02-15

Import Data

Build Model