chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv')
## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(chocolate)
| Name | chocolate |
| Number of rows | 2530 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| company_manufacturer | 0 | 1.00 | 2 | 39 | 0 | 580 | 0 |
| company_location | 0 | 1.00 | 4 | 21 | 0 | 67 | 0 |
| country_of_bean_origin | 0 | 1.00 | 4 | 21 | 0 | 62 | 0 |
| specific_bean_origin_or_bar_name | 0 | 1.00 | 3 | 51 | 0 | 1605 | 0 |
| cocoa_percent | 0 | 1.00 | 3 | 6 | 0 | 46 | 0 |
| ingredients | 87 | 0.97 | 4 | 14 | 0 | 21 | 0 |
| most_memorable_characteristics | 0 | 1.00 | 3 | 37 | 0 | 2487 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| ref | 0 | 1 | 1429.80 | 757.65 | 5 | 802 | 1454.00 | 2079.0 | 2712 | ▆▇▇▇▇ |
| review_date | 0 | 1 | 2014.37 | 3.97 | 2006 | 2012 | 2015.00 | 2018.0 | 2021 | ▃▅▇▆▅ |
| rating | 0 | 1 | 3.20 | 0.45 | 1 | 3 | 3.25 | 3.5 | 4 | ▁▁▅▇▇ |
data <- chocolate %>%
separate_rows(most_memorable_characteristics) %>%
# Treat missing values
select(-specific_bean_origin_or_bar_name) %>%
na.omit() %>%
# Log Transform Variables with pos-skewed Distribution
mutate(rating = log(rating))
# Step 1: Prepare data
data_binarized_tbl <- data %>%
select(-review_date) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 8,403
## Columns: 103
## $ `ref__-Inf_833` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ref__833_1482 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ref__1482_2122 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ref__2122_Inf <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_manufacturer__A._Morin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Arete <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Bonnat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Fresco <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Pralus <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Soma <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_manufacturer__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Australia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Austria <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Belgium <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Brazil <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Canada <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Colombia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Denmark <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Ecuador <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__France <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Germany <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Italy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__New_Zealand <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Spain <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Switzerland <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.K. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.S.A. <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Venezuela <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_location__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Belize <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Blend <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Bolivia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Brazil <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Colombia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Costa_Rica <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Dominican_Republic <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ country_of_bean_origin__Ecuador <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ghana <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Guatemala <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Haiti <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__India <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Madagascar <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ country_of_bean_origin__Mexico <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Nicaragua <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Papua_New_Guinea <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Peru <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Tanzania <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Trinidad <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__U.S.A. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Venezuela <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Vietnam <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country_of_bean_origin__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__60%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__64%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__65%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__66%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__67%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__68%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__70%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__71%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__72%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__73%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__74%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__75%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__76%` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `cocoa_percent__77%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__80%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__85%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S*` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__3-_B,S,C` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `ingredients__4-_B,S,C,L` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__4-_B,S,C,V` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__5-_B,S,C,V,L` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__bitter <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__cocoa <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, …
## $ most_memorable_characteristics__coffee <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__creamy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__earthy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fatty <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__floral <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fruit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__intense <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__mild <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__molasses <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__nutty <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__off <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__rich <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__roasty <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sandy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sour <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__spice <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__spicy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sweet <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__vanilla <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__woody <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `most_memorable_characteristics__-OTHER` <dbl> 0, 0, 0, 1, 0, 1, 1, 0, 1, …
## $ `rating__-Inf_1.09861228866811` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.09861228866811_1.17865499634165 <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ rating__1.17865499634165_1.25276296849537 <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ rating__1.25276296849537_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, …
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate(rating__1.25276296849537_Inf)
data_corr_tbl
## # A tibble: 103 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 rating 1.25276296849537_Inf 1
## 2 rating -Inf_1.09861228866811 -0.386
## 3 rating 1.17865499634165_1.25276296849537 -0.239
## 4 rating 1.09861228866811_1.17865499634165 -0.213
## 5 company_manufacturer -OTHER -0.144
## 6 company_manufacturer Soma 0.122
## 7 most_memorable_characteristics creamy 0.110
## 8 company_manufacturer Bonnat 0.0941
## 9 cocoa_percent 67% 0.0798
## 10 ingredients -OTHER -0.0640
## # ℹ 93 more rows
# Step 3: Plot
data_corr_tbl %>%
plot_correlation_funnel()
## Warning: ggrepel: 92 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Split Data
#data <- sample_n(data, 100)
# Split into train and test data set
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
# Further split training data set for cross-validation
set.seed(12345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [5671/631]> Fold01
## 2 <split [5671/631]> Fold02
## 3 <split [5672/630]> Fold03
## 4 <split [5672/630]> Fold04
## 5 <split [5672/630]> Fold05
## 6 <split [5672/630]> Fold06
## 7 <split [5672/630]> Fold07
## 8 <split [5672/630]> Fold08
## 9 <split [5672/630]> Fold09
## 10 <split [5672/630]> Fold10
library(usemodels)
usemodels::use_xgboost(rating ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = rating ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(98210)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_recipe <-
recipe(formula = rating ~ ., data = data_train) %>%
recipes::update_role(ref, new_role = "id") %>%
step_other(company_manufacturer, company_location, country_of_bean_origin, most_memorable_characteristics, threshold = 0.02) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE)
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 6,302
## Columns: 108
## $ ref <dbl> 1796, 2190, 967, 2182, 1828,…
## $ review_date <dbl> 2016, 2018, 2012, 2018, 2016…
## $ rating <dbl> 1.0116009, 1.1786550, 1.2527…
## $ company_manufacturer_Soma <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0…
## $ company_manufacturer_other <dbl> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1…
## $ company_location_Australia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Belgium <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Canada <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0…
## $ company_location_Ecuador <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_France <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_Italy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_U.K. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ company_location_U.S.A. <dbl> 0, 1, 0, 1, 0, 0, 1, 0, 1, 0…
## $ company_location_other <dbl> 1, 0, 0, 0, 0, 1, 0, 1, 0, 1…
## $ country_of_bean_origin_Belize <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Blend <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Bolivia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Brazil <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Colombia <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ country_of_bean_origin_Dominican.Republic <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Ecuador <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ country_of_bean_origin_Guatemala <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Madagascar <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Mexico <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Nicaragua <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Papua.New.Guinea <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Peru <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Tanzania <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_Venezuela <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 1…
## $ country_of_bean_origin_Vietnam <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ country_of_bean_origin_other <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
## $ cocoa_percent_X100. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X42. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X46. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X50. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X53. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X55. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X56. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X57. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X58. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X60.5. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X60. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X61. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X62. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X63. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X64. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X65. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X66. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X67. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X68. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ cocoa_percent_X69. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X70. <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 0…
## $ cocoa_percent_X71.50. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X71. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X72.5. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X72. <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ cocoa_percent_X73.5. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X73. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X74. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X75. <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ cocoa_percent_X76. <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ cocoa_percent_X77. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X78. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X79. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X80. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X81. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X82. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X83. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X84. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X85. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X86. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X87. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X88. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X89. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X90. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X91. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cocoa_percent_X99. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X1..B <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X2..B.C <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X2..B.S <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 1, 1…
## $ ingredients_X2..B.S. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S.C <dbl> 1, 0, 1, 0, 1, 1, 1, 0, 0, 0…
## $ ingredients_X3..B.S.L <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S.V <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S..C <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X3..B.S..Sa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.C.L <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.C.Sa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.C.V <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S.V.L <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..C.L <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..C.Sa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..C.V <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X4..B.S..V.L <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X5..B.S.C.L.Sa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X5..B.S.C.V.L <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ ingredients_X5.B.S.C.V.Sa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ingredients_X6.B.S.C.V.L.Sa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_cocoa <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_creamy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_earthy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_fruit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_mild <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_nutty <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0…
## $ most_memorable_characteristics_roasty <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_sour <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_sweet <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ most_memorable_characteristics_other <dbl> 1, 0, 1, 0, 0, 1, 1, 0, 1, 1…
xgboost_spec <-
boost_tree(trees = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
set.seed(50194)
xgboost_tune <-
tune_grid(xgboost_workflow, resamples = data_cv, grid = 5)
## → A | warning: ! There are new levels in a factor: 53%
##
There were issues with some computations A: x1
→ B | warning: ! There are new levels in a factor: 60.5%
## There were issues with some computations A: x1
There were issues with some computations A: x1 B: x1
→ C | warning: ! There are new levels in a factor: 99%
## There were issues with some computations A: x1 B: x1
There were issues with some computations A: x1 B: x1 C: x1
→ D | warning: ! There are new levels in a factor: 87%
## There were issues with some computations A: x1 B: x1 C: x1
There were issues with some computations A: x1 B: x1 C: x1 D: x1
There were issues with some computations A: x1 B: x1 C: x1 D: x1
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 7
## trees .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1294 rmse standard 0.0803 10 0.00172 Preprocessor1_Model4
## 2 1818 rmse standard 0.0803 10 0.00172 Preprocessor1_Model5
## 3 1175 rmse standard 0.0804 10 0.00172 Preprocessor1_Model3
## 4 594 rmse standard 0.0817 10 0.00164 Preprocessor1_Model2
## 5 65 rmse standard 0.108 10 0.00101 Preprocessor1_Model1
# Update the model by selecting the best hyper-parameters
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
tune::select_best(xgboost_tune, metric = "rmse"))
# Fit the model on the entire traing data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.0715 Preprocessor1_Model1
## 2 rsq standard 0.756 Preprocessor1_Model1
tune::collect_predictions(data_fit) %>%
ggplot(aes(rating, .pred)) +
geom_point(alpha = 0.3, fill = "midnightblue") +
geom_abline(lty = 2, color = "gray50") +
coord_fixed()
After looking over the data set and trying a few possibilities, I ended up taking out the code separate rows for specific bean origin. I didn’t see a reason to separate these rows and it brought down the number of rows in my data set by over ten thousand. Unfortunately by making this change my RMSE increased slightly. My RMSE increased from .0487 to .0715 but RSQ decreased from .871 to .756. After looking at both data sets my first chart looked better.