Explore data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
ratings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-25/ratings.csv")
## Rows: 21831 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): name, url, thumbnail
## dbl (7): num, id, year, rank, average, bayes_average, users_rated
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
details <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-25/details.csv")
## Rows: 21631 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): primary, description, boardgamecategory, boardgamemechanic, boardg...
## dbl (13): num, id, yearpublished, minplayers, maxplayers, playingtime, minpl...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ratings_joined <-
ratings %>%
left_join(details, by = "id")
ratings_joined <- sample_n(ratings_joined, 100)
ggplot(ratings_joined, aes(average)) +
geom_histogram(alpha = 0.8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ratings_joined %>%
filter(!is.na(minage)) %>%
mutate(minage = cut_number(minage, 4)) %>%
ggplot(aes(minage, average, fill = minage)) +
geom_boxplot(alpha = .2, show.legend = FALSE)

Tune on xgboost model
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
set.seed(123)
game_split <-
ratings_joined %>%
select(name, average, matches("min|max"), boardgamecategory) %>%
na.omit() %>%
initial_split(strata = average)
game_train <- training(game_split)
game_test <- testing(game_split)
set.seed(234)
game_folds <- vfold_cv(game_train, strata = average)
game_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [64/9]> Fold01
## 2 <split [64/9]> Fold02
## 3 <split [64/9]> Fold03
## 4 <split [65/8]> Fold04
## 5 <split [66/7]> Fold05
## 6 <split [66/7]> Fold06
## 7 <split [67/6]> Fold07
## 8 <split [67/6]> Fold08
## 9 <split [67/6]> Fold09
## 10 <split [67/6]> Fold10
library(textrecipes)
library(stringi)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
split_category <- function(x) {
x %>%
str_split(", ") %>%
map(str_remove_all, "[:punct:]") %>%
map(str_squish) %>%
map(str_to_lower) %>%
map(str_replace_all, " ", "_")
}
game_rec <-
recipe(average ~ ., data = game_train) %>%
update_role(name, new_role = "id") %>%
step_tokenize(boardgamecategory, custom_token = split_category) %>%
step_tokenfilter(boardgamecategory, max_tokens = 30) %>%
step_tf(boardgamecategory)
game_prep <-
prep(game_rec)
bake(game_prep, new_data = NULL)
## # A tibble: 73 × 37
## name minplayers maxplayers minplaytime maxplaytime minage average
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Get Nifty 2 6 40 40 10 4.72
## 2 Bible Challenge 2 16 90 90 10 5.09
## 3 Kangaroo 3 6 10 10 5 5.7
## 4 Kingz 2 5 15 15 7 4.77
## 5 City 2 5 60 60 12 6.02
## 6 Yahtzee Flash 1 4 10 10 8 3.99
## 7 Covert Action 4 14 20 20 10 5.17
## 8 Monster Factory 2 6 30 30 5 5.99
## 9 Saqqara 3 5 60 60 12 5.15
## 10 Napoleon's Firs… 2 2 180 180 12 5.75
## # ℹ 63 more rows
## # ℹ 30 more variables: tf_boardgamecategory_abstract_strategy <int>,
## # tf_boardgamecategory_action_dexterity <int>,
## # tf_boardgamecategory_adventure <int>, tf_boardgamecategory_ancient <int>,
## # tf_boardgamecategory_animals <int>, tf_boardgamecategory_bluffing <int>,
## # tf_boardgamecategory_card_game <int>,
## # tf_boardgamecategory_childrens_game <int>, …
xgb_spec <-
boost_tree(
trees = tune(),
mtry = tune(),
min_n = tune(),
learn_rate = 0.01
) %>%
set_engine("xgboost") %>%
set_mode("regression")
xgb_wf <- workflow(game_rec, xgb_spec)
xgb_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_tokenize()
## • step_tokenfilter()
## • step_tf()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## mtry = tune()
## trees = tune()
## min_n = tune()
## learn_rate = 0.01
##
## Computational engine: xgboost
library(finetune)
doParallel::registerDoParallel()
set.seed(234)
xgb_game_rs <-
tune_grid(
xgb_wf,
game_folds,
grid = 5,
control = control_race(verbose_elim = TRUE, pkgs = "tidyverse")
)
## i Creating pre-processing data to finalize unknown parameter: mtry
xgb_game_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [64/9]> Fold01 <tibble [10 × 7]> <tibble [1 × 3]>
## 2 <split [64/9]> Fold02 <tibble [10 × 7]> <tibble [1 × 3]>
## 3 <split [64/9]> Fold03 <tibble [10 × 7]> <tibble [1 × 3]>
## 4 <split [65/8]> Fold04 <tibble [10 × 7]> <tibble [1 × 3]>
## 5 <split [66/7]> Fold05 <tibble [10 × 7]> <tibble [1 × 3]>
## 6 <split [66/7]> Fold06 <tibble [10 × 7]> <tibble [1 × 3]>
## 7 <split [67/6]> Fold07 <tibble [10 × 7]> <tibble [1 × 3]>
## 8 <split [67/6]> Fold08 <tibble [10 × 7]> <tibble [1 × 3]>
## 9 <split [67/6]> Fold09 <tibble [10 × 7]> <tibble [1 × 3]>
## 10 <split [67/6]> Fold10 <tibble [10 × 7]> <tibble [1 × 3]>
##
## There were issues with some computations:
##
## - Warning(s) x10: A correlation computation is required, but `estimate` is constant...
##
## Run `show_notes(.Last.tune.result)` for more information.
Evaluate Models
show_best(xgb_game_rs)
## # A tibble: 5 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 14 1057 8 rmse standard 0.866 10 0.0569 Preprocessor1_Model2
## 2 7 1954 31 rmse standard 0.872 10 0.0645 Preprocessor1_Model1
## 3 20 659 21 rmse standard 0.872 10 0.0651 Preprocessor1_Model3
## 4 32 1236 10 rmse standard 0.892 10 0.0451 Preprocessor1_Model5
## 5 24 382 36 rmse standard 0.907 10 0.0692 Preprocessor1_Model4
xgb_last <-
xgb_wf %>%
finalize_workflow(select_best(xgb_game_rs, "rmse")) %>%
last_fit(game_split)
xgb_last
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [73/27]> train/test split <tibble> <tibble> <tibble> <workflow>
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
xgb_fit <- extract_fit_parsnip(xgb_last)
vip(xgb_fit, geom = "point", num_features = 12)

library(SHAPforxgboost)
game_shap <-
shap.prep(
xgb_model = extract_fit_engine(xgb_fit),
X_train = bake(game_prep,
has_role("predictor"),
new_data = NULL,
composition = "matrix"
)
)
shap.plot.summary(game_shap)

shap.plot.dependence(
game_shap,
x = "minage",
color_feature = "minplayers",
size0 = 1.2,
smooth = FALSE, add_hist = TRUE
)
