chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv')
## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(chocolate)
| Name | chocolate |
| Number of rows | 2530 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| company_manufacturer | 0 | 1.00 | 2 | 39 | 0 | 580 | 0 |
| company_location | 0 | 1.00 | 4 | 21 | 0 | 67 | 0 |
| country_of_bean_origin | 0 | 1.00 | 4 | 21 | 0 | 62 | 0 |
| specific_bean_origin_or_bar_name | 0 | 1.00 | 3 | 51 | 0 | 1605 | 0 |
| cocoa_percent | 0 | 1.00 | 3 | 6 | 0 | 46 | 0 |
| ingredients | 87 | 0.97 | 4 | 14 | 0 | 21 | 0 |
| most_memorable_characteristics | 0 | 1.00 | 3 | 37 | 0 | 2487 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| ref | 0 | 1 | 1429.80 | 757.65 | 5 | 802 | 1454.00 | 2079.0 | 2712 | ▆▇▇▇▇ |
| review_date | 0 | 1 | 2014.37 | 3.97 | 2006 | 2012 | 2015.00 | 2018.0 | 2021 | ▃▅▇▆▅ |
| rating | 0 | 1 | 3.20 | 0.45 | 1 | 3 | 3.25 | 3.5 | 4 | ▁▁▅▇▇ |
data <- chocolate %>%
separate_rows(most_memorable_characteristics) %>%
separate_rows(specific_bean_origin_or_bar_name) %>%
# Treat missing values
select(-specific_bean_origin_or_bar_name, -ref) %>%
na.omit() %>%
# Log Transform Variables with pos-skewed Distribution
mutate(rating = log(rating))
# Step 1: Prepare data
data_binarized_tbl <- data %>%
select(-review_date) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 23,520
## Columns: 95
## $ company_manufacturer__Bonnat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Castronovo <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Dandelion <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Fresco <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Soma <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_manufacturer__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Australia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Austria <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Belgium <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Brazil <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Canada <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Denmark <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Ecuador <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__France <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Germany <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Italy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__New_Zealand <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Switzerland <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.K. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.S.A. <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Venezuela <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_location__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Belize <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Blend <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Bolivia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Brazil <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Colombia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Costa_Rica <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Dominican_Republic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ecuador <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ghana <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Guatemala <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__India <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Jamaica <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Madagascar <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Mexico <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Nicaragua <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Papua_New_Guinea <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Peru <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Philippines <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Tanzania <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ country_of_bean_origin__Trinidad <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__U.S.A. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Venezuela <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Vietnam <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country_of_bean_origin__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__60%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__64%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__65%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__67%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__68%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__70%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__71%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__72%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__73%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__74%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__75%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__76%` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `cocoa_percent__77%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__80%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__85%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S*` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__3-_B,S,C` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `ingredients__4-_B,S,C,L` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__4-_B,S,C,V` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__5-_B,S,C,V,L` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__bitter <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__cocoa <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, …
## $ most_memorable_characteristics__creamy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__dried <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__earthy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fatty <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ most_memorable_characteristics__floral <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fruit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__intense <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__mild <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__molasses <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__nutty <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__off <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__rich <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__roasty <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sandy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sour <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__spicy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sweet <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__tart <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__woody <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `most_memorable_characteristics__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `rating__-Inf_1.09861228866811` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.09861228866811_1.17865499634165 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rating__1.17865499634165_1.25276296849537 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.25276296849537_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate(rating__1.25276296849537_Inf)
data_corr_tbl
## # A tibble: 95 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 rating 1.25276296849537_Inf 1
## 2 rating -Inf_1.09861228866811 -0.384
## 3 rating 1.17865499634165_1.25276296849537 -0.270
## 4 rating 1.09861228866811_1.17865499634165 -0.237
## 5 company_manufacturer -OTHER -0.153
## 6 company_manufacturer Soma 0.149
## 7 most_memorable_characteristics creamy 0.104
## 8 company_manufacturer Bonnat 0.0903
## 9 cocoa_percent 67% 0.0897
## 10 company_location Canada 0.0808
## # ℹ 85 more rows
# Step 3: Plot
data_corr_tbl %>%
plot_correlation_funnel()
## Warning: ggrepel: 85 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Split Data
data <- sample_n(data, 100)
# Split into train and test data set
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
# Further split training data set for cross-validation
set.seed(12345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [67/8]> Fold01
## 2 <split [67/8]> Fold02
## 3 <split [67/8]> Fold03
## 4 <split [67/8]> Fold04
## 5 <split [67/8]> Fold05
## 6 <split [68/7]> Fold06
## 7 <split [68/7]> Fold07
## 8 <split [68/7]> Fold08
## 9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
library(usemodels)
usemodels::use_xgboost(rating ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = rating ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(39943)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_recipe <-
recipe(formula = rating ~ ., data = data_train) %>%
recipes::update_role(country_of_bean_origin, new_role = "id") %>%
step_tokenize(most_memorable_characteristics) %>%
step_tokenfilter(most_memorable_characteristics, max_tokens = 100) %>%
step_tfidf(most_memorable_characteristics) %>%
step_other(company_manufacturer, company_location) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_YeoJohnson(review_date)
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Warning: max_tokens was set to '100', but only 53 was available and selected.
## Rows: 75
## Columns: 86
## $ review_date <dbl> 2012, 2018, 2009, 20…
## $ country_of_bean_origin <fct> Dominican Republic, …
## $ rating <dbl> 1.178655, 1.252763, …
## $ tfidf_most_memorable_characteristics_anise <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_astringent <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_balanced <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_banana <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_basic <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_berry <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_butterscotch <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_cherry <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_chewy <dbl> 4.330733, 0.000000, …
## $ tfidf_most_memorable_characteristics_classic <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_coarse <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_cocoa <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_creamy <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_dark <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_delicate <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_dried <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_dry <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_earthy <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_fatty <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_few <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_floral <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_fruit <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_full <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_gateway <dbl> 0.000000, 4.330733, …
## $ tfidf_most_memorable_characteristics_grape <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_grapes <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_grassy <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_gritty <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_intense <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_lasting <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_lemon <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_licoric <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_melon <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_mild <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_milk <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_nut <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_nutty <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_off <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_olive <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_orange <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_pure <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_red <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_roasted <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_rum <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_sandy <dbl> 0.000000, 0.000000, …
## $ tfidf_most_memorable_characteristics_smokey <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_sour <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_spice <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_sticky <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_strong <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_sweet <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_very <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_most_memorable_characteristics_woody <dbl> 0.000000, 0.000000, …
## $ company_manufacturer_Beau.Cacao <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ company_manufacturer_other <dbl> 1, 1, 1, 1, 1, 1, 1,…
## $ company_location_Belgium <dbl> 0, 0, 1, 0, 0, 0, 0,…
## $ company_location_France <dbl> 0, 0, 0, 0, 0, 0, 1,…
## $ company_location_U.K. <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ company_location_U.S.A. <dbl> 0, 0, 0, 1, 0, 0, 0,…
## $ company_location_other <dbl> 1, 1, 0, 0, 1, 1, 0,…
## $ cocoa_percent_X100. <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X50. <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X55. <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X60. <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X64. <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X65. <dbl> 0, 0, 0, 0, 0, 1, 0,…
## $ cocoa_percent_X67. <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X68. <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X70. <dbl> 1, 1, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X72. <dbl> 0, 0, 1, 0, 0, 0, 0,…
## $ cocoa_percent_X73. <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X74. <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X75. <dbl> 0, 0, 0, 1, 0, 0, 1,…
## $ cocoa_percent_X80. <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X81. <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ cocoa_percent_X85. <dbl> 0, 0, 0, 0, 1, 0, 0,…
## $ ingredients_X1..B <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ ingredients_X2..B.S <dbl> 1, 0, 0, 1, 0, 0, 0,…
## $ ingredients_X3..B.S.C <dbl> 0, 0, 0, 0, 0, 1, 0,…
## $ ingredients_X3..B.S..C <dbl> 0, 0, 0, 0, 0, 0, 0,…
## $ ingredients_X4..B.S.C.L <dbl> 0, 0, 0, 0, 0, 0, 1,…
## $ ingredients_X4..B.S.C.V <dbl> 0, 0, 0, 0, 1, 0, 0,…
## $ ingredients_X5..B.S.C.V.L <dbl> 0, 1, 1, 0, 0, 0, 0,…
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
loss_reduction = tune(), sample_size = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
set.seed(50194)
xgboost_tune <-
tune_grid(xgboost_workflow, resamples = data_cv, grid = 5)
## → A | warning: max_tokens was set to '100', but only 49 was available and selected.
##
There were issues with some computations A: x1
→ B | warning: ! There are new levels in a factor: 80%, ! There are new levels in a factor: 3- B,S*,C
## There were issues with some computations A: x1
There were issues with some computations A: x1 B: x1
There were issues with some computations A: x1 B: x2
There were issues with some computations A: x1 B: x3
There were issues with some computations A: x1 B: x4
→ C | warning: A correlation computation is required, but `estimate` is constant and has 0
## standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## There were issues with some computations A: x1 B: x4
→ D | warning: max_tokens was set to '100', but only 50 was available and selected.
## There were issues with some computations A: x1 B: x4
There were issues with some computations A: x1 B: x5 C: x1 D: x1
→ E | warning: ! There are new levels in a factor: 81% and 100%, ! There are new levels in a factor: 1- B
## There were issues with some computations A: x1 B: x5 C: x1 D: x1
There were issues with some computations A: x1 B: x5 C: x1 D: x1 E: x1
There were issues with some computations A: x1 B: x5 C: x1 D: x1 E: x2
There were issues with some computations A: x1 B: x5 C: x1 D: x1 E: x3
There were issues with some computations A: x1 B: x5 C: x1 D: x1 E: x4
There were issues with some computations A: x1 B: x5 C: x1 D: x1 E: x5
There were issues with some computations A: x2 B: x5 C: x2 D: x1 E: x5
→ F | warning: ! There are new levels in a factor: 64%
## There were issues with some computations A: x2 B: x5 C: x2 D: x1 E: x5
There were issues with some computations A: x2 B: x5 C: x2 D: x1 E: x…
There were issues with some computations A: x2 B: x5 C: x2 D: x1 E: x…
There were issues with some computations A: x2 B: x5 C: x2 D: x1 E: x…
There were issues with some computations A: x2 B: x5 C: x2 D: x1 E: x…
There were issues with some computations A: x2 B: x5 C: x3 D: x1 E: x…
→ G | warning: max_tokens was set to '100', but only 47 was available and selected.
## There were issues with some computations A: x2 B: x5 C: x3 D: x1 E: x…
There were issues with some computations A: x2 B: x5 C: x4 D: x1 E: x…
There were issues with some computations A: x3 B: x5 C: x4 D: x1 E: x…
→ H | warning: ! There are new levels in a factor: 60% and 55%
## There were issues with some computations A: x3 B: x5 C: x4 D: x1 E: x…
There were issues with some computations A: x3 B: x5 C: x4 D: x1 E: x…
There were issues with some computations A: x3 B: x5 C: x4 D: x1 E: x…
There were issues with some computations A: x3 B: x5 C: x4 D: x1 E: x…
There were issues with some computations A: x3 B: x5 C: x4 D: x1 E: x…
There were issues with some computations A: x3 B: x5 C: x4 D: x1 E: x…
There were issues with some computations A: x4 B: x5 C: x6 D: x1 E: x…
→ I | warning: max_tokens was set to '100', but only 48 was available and selected.
## There were issues with some computations A: x4 B: x5 C: x6 D: x1 E: x…
There were issues with some computations A: x4 B: x5 C: x6 D: x1 E: x…
There were issues with some computations A: x4 B: x5 C: x7 D: x1 E: x…
There were issues with some computations A: x4 B: x5 C: x7 D: x1 E: x…
There were issues with some computations A: x4 B: x5 C: x8 D: x1 E: x…
There were issues with some computations A: x4 B: x5 C: x8 D: x1 E: x…
→ J | warning: ! There are new levels in a factor: 50%
## There were issues with some computations A: x4 B: x5 C: x8 D: x1 E: x…
There were issues with some computations A: x4 B: x5 C: x8 D: x1 E: x…
There were issues with some computations A: x4 B: x5 C: x8 D: x1 E: x…
There were issues with some computations A: x4 B: x5 C: x8 D: x1 E: x…
There were issues with some computations A: x4 B: x5 C: x8 D: x1 E: x…
There were issues with some computations A: x4 B: x5 C: x8 D: x1 E: x…
→ K | warning: max_tokens was set to '100', but only 52 was available and selected.
## There were issues with some computations A: x4 B: x5 C: x8 D: x1 E: x…
There were issues with some computations A: x4 B: x5 C: x10 D: x1 E: …
There were issues with some computations A: x4 B: x5 C: x10 D: x1 E: …