Explore data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
ratings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-25/ratings.csv")
## Rows: 21831 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): name, url, thumbnail
## dbl (7): num, id, year, rank, average, bayes_average, users_rated
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
details <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-25/details.csv")
## Rows: 21631 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): primary, description, boardgamecategory, boardgamemechanic, boardg...
## dbl (13): num, id, yearpublished, minplayers, maxplayers, playingtime, minpl...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ratings_joined <-
  ratings %>%
  left_join(details, by = "id")
ratings_joined <- sample_n(ratings_joined, 100)

ggplot(ratings_joined, aes(average)) +
  geom_histogram(alpha = 0.8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ratings_joined %>%
    filter(!is.na(minage)) %>% 
    mutate(minage = cut_number(minage, 4)) %>%
    ggplot(aes(minage, average, fill = minage)) +
    geom_boxplot(alpha = .2, show.legend = FALSE) 

Tune on xgboost model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
set.seed(123)
game_split <-
    ratings_joined %>%
    select(name, average, matches("min|max"), boardgamecategory) %>%
    na.omit() %>%
    initial_split(strata = average)

game_train <- training(game_split)

game_test <- testing(game_split)

set.seed(234)
game_folds <- vfold_cv(game_train, strata = average)
game_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [64/9]> Fold01
##  2 <split [64/9]> Fold02
##  3 <split [64/9]> Fold03
##  4 <split [65/8]> Fold04
##  5 <split [66/7]> Fold05
##  6 <split [66/7]> Fold06
##  7 <split [67/6]> Fold07
##  8 <split [67/6]> Fold08
##  9 <split [67/6]> Fold09
## 10 <split [67/6]> Fold10
library(textrecipes)
library(stringi)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
split_category <- function(x) {
    x %>%
        str_split(", ") %>%
        map(str_remove_all, "[:punct:]") %>%
        map(str_squish) %>%
        map(str_to_lower) %>%
        map(str_replace_all, " ", "_")
}

game_rec <-
    recipe(average ~ ., data = game_train) %>%
    update_role(name, new_role = "id") %>%
    step_tokenize(boardgamecategory, custom_token = split_category) %>%
    step_tokenfilter(boardgamecategory, max_tokens = 30) %>%
    step_tf(boardgamecategory)

game_prep <-
    prep(game_rec)
bake(game_prep, new_data = NULL)
## # A tibble: 73 × 37
##    name             minplayers maxplayers minplaytime maxplaytime minage average
##    <fct>                 <dbl>      <dbl>       <dbl>       <dbl>  <dbl>   <dbl>
##  1 Get Nifty                 2          6          40          40     10    4.72
##  2 Bible Challenge           2         16          90          90     10    5.09
##  3 Kangaroo                  3          6          10          10      5    5.7 
##  4 Kingz                     2          5          15          15      7    4.77
##  5 City                      2          5          60          60     12    6.02
##  6 Yahtzee Flash             1          4          10          10      8    3.99
##  7 Covert Action             4         14          20          20     10    5.17
##  8 Monster Factory           2          6          30          30      5    5.99
##  9 Saqqara                   3          5          60          60     12    5.15
## 10 Napoleon's Firs…          2          2         180         180     12    5.75
## # ℹ 63 more rows
## # ℹ 30 more variables: tf_boardgamecategory_abstract_strategy <int>,
## #   tf_boardgamecategory_action_dexterity <int>,
## #   tf_boardgamecategory_adventure <int>, tf_boardgamecategory_ancient <int>,
## #   tf_boardgamecategory_animals <int>, tf_boardgamecategory_bluffing <int>,
## #   tf_boardgamecategory_card_game <int>,
## #   tf_boardgamecategory_childrens_game <int>, …
xgb_spec <-
  boost_tree(
    trees = tune(),
    mtry = tune(),
    min_n = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

xgb_wf <- workflow(game_rec, xgb_spec)
xgb_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_tokenize()
## • step_tokenfilter()
## • step_tf()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = tune()
##   min_n = tune()
##   learn_rate = 0.01
## 
## Computational engine: xgboost
library(finetune)

doParallel::registerDoParallel()

set.seed(234)
xgb_game_rs <-
  tune_grid(
    xgb_wf,
    game_folds,
    grid = 5,
    control = control_race(verbose_elim = TRUE, pkgs = "tidyverse")
    )
## i Creating pre-processing data to finalize unknown parameter: mtry
xgb_game_rs
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 4
##    splits         id     .metrics          .notes          
##    <list>         <chr>  <list>            <list>          
##  1 <split [64/9]> Fold01 <tibble [10 × 7]> <tibble [1 × 3]>
##  2 <split [64/9]> Fold02 <tibble [10 × 7]> <tibble [1 × 3]>
##  3 <split [64/9]> Fold03 <tibble [10 × 7]> <tibble [1 × 3]>
##  4 <split [65/8]> Fold04 <tibble [10 × 7]> <tibble [1 × 3]>
##  5 <split [66/7]> Fold05 <tibble [10 × 7]> <tibble [1 × 3]>
##  6 <split [66/7]> Fold06 <tibble [10 × 7]> <tibble [1 × 3]>
##  7 <split [67/6]> Fold07 <tibble [10 × 7]> <tibble [1 × 3]>
##  8 <split [67/6]> Fold08 <tibble [10 × 7]> <tibble [1 × 3]>
##  9 <split [67/6]> Fold09 <tibble [10 × 7]> <tibble [1 × 3]>
## 10 <split [67/6]> Fold10 <tibble [10 × 7]> <tibble [1 × 3]>
## 
## There were issues with some computations:
## 
##   - Warning(s) x10: A correlation computation is required, but `estimate` is constant...
## 
## Run `show_notes(.Last.tune.result)` for more information.

Evaluate Models

show_best(xgb_game_rs)
## # A tibble: 5 × 9
##    mtry trees min_n .metric .estimator  mean     n std_err .config             
##   <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1    14  1057     8 rmse    standard   0.866    10  0.0569 Preprocessor1_Model2
## 2     7  1954    31 rmse    standard   0.872    10  0.0645 Preprocessor1_Model1
## 3    20   659    21 rmse    standard   0.872    10  0.0651 Preprocessor1_Model3
## 4    32  1236    10 rmse    standard   0.892    10  0.0451 Preprocessor1_Model5
## 5    24   382    36 rmse    standard   0.907    10  0.0692 Preprocessor1_Model4
xgb_last <-
  xgb_wf %>%
  finalize_workflow(select_best(xgb_game_rs, "rmse")) %>%
  last_fit(game_split)

xgb_last
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits          id               .metrics .notes   .predictions .workflow 
##   <list>          <chr>            <list>   <list>   <list>       <list>    
## 1 <split [73/27]> train/test split <tibble> <tibble> <tibble>     <workflow>
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgb_fit <- extract_fit_parsnip(xgb_last)

vip(xgb_fit, geom = "point", num_features = 12)

library(SHAPforxgboost)

game_shap <-
  shap.prep(
    xgb_model = extract_fit_engine(xgb_fit),
    X_train = bake(game_prep,
      has_role("predictor"),
      new_data = NULL,
      composition = "matrix"
    )
  )

shap.plot.summary(game_shap)

shap.plot.dependence(
  game_shap,
  x = "minage",
  color_feature = "minplayers",
  size0 = 1.2,
  smooth = FALSE, add_hist = TRUE
)