library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

ratings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-25/ratings.csv")

## Rows: 21831 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): name, url, thumbnail
## dbl (7): num, id, year, rank, average, bayes_average, users_rated
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

details <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-25/details.csv")

## Rows: 21631 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): primary, description, boardgamecategory, boardgamemechanic, boardg...
## dbl (13): num, id, yearpublished, minplayers, maxplayers, playingtime, minpl...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ratings_joined <-
  ratings %>%
  left_join(details, by = "id")

ggplot(ratings_joined, aes(average)) +
  geom_histogram(alpha = 0.8)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ratings_joined %>%
  filter(!is.na(minage)) %>%
  mutate(minage = cut_number(minage, 4)) %>%
  ggplot(aes(minage, average, fill = minage)) +
  geom_boxplot(alpha = 0.2, show.legend = FALSE)

Tune an xgboost model

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

set.seed(123)
game_split <-
  ratings_joined %>%
  select(name, average, matches("min|max"), boardgamecategory) %>%
  na.omit() %>%
  initial_split(strata = average)
game_train <- training(game_split)
game_test <- testing(game_split)

set.seed(234)
game_folds <- vfold_cv(game_train, strata = average)
game_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [14407/1602]> Fold01
##  2 <split [14407/1602]> Fold02
##  3 <split [14407/1602]> Fold03
##  4 <split [14408/1601]> Fold04
##  5 <split [14408/1601]> Fold05
##  6 <split [14408/1601]> Fold06
##  7 <split [14408/1601]> Fold07
##  8 <split [14408/1601]> Fold08
##  9 <split [14410/1599]> Fold09
## 10 <split [14410/1599]> Fold10

library(textrecipes)

split_category <- function(x) {
  x %>%
    str_split(", ") %>%
    map(str_remove_all, "[:punct:]") %>%
    map(str_squish) %>%
    map(str_to_lower) %>%
    map(str_replace_all, " ", "_")
}

game_rec <-
  recipe(average ~ ., data = game_train) %>%
  update_role(name, new_role = "id") %>%
  step_tokenize(boardgamecategory, custom_token = split_category) %>%
  step_tokenfilter(boardgamecategory, max_tokens = 30) %>%
  step_tf(boardgamecategory)

game_prep <- prep(game_rec)
bake(game_prep, new_data = NULL) %>% str()

## tibble [16,009 × 37] (S3: tbl_df/tbl/data.frame)
##  $ name                                      : Factor w/ 15781 levels "¡Adiós Calavera!",..: 10857 8587 14642 858 15729 6819 13313 1490 3143 9933 ...
##  $ minplayers                                : num [1:16009] 2 2 2 4 2 1 2 2 4 2 ...
##  $ maxplayers                                : num [1:16009] 6 8 10 10 6 8 6 2 16 6 ...
##  $ minplaytime                               : num [1:16009] 120 60 30 30 60 20 60 30 60 45 ...
##  $ maxplaytime                               : num [1:16009] 120 180 30 30 90 20 60 30 60 45 ...
##  $ minage                                    : num [1:16009] 10 8 6 12 15 6 8 8 13 8 ...
##  $ average                                   : num [1:16009] 5.59 4.37 5.41 5.79 5.8 5.62 4.31 4.66 5.68 5.14 ...
##  $ tf_boardgamecategory_abstract_strategy    : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_action_dexterity     : int [1:16009] 0 0 0 0 0 1 0 0 0 0 ...
##  $ tf_boardgamecategory_adventure            : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_ancient              : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_animals              : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_bluffing             : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_card_game            : int [1:16009] 0 0 1 1 0 0 0 0 0 1 ...
##  $ tf_boardgamecategory_childrens_game       : int [1:16009] 0 0 0 0 0 0 1 1 0 0 ...
##  $ tf_boardgamecategory_deduction            : int [1:16009] 0 0 0 0 0 0 0 1 0 0 ...
##  $ tf_boardgamecategory_dice                 : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_economic             : int [1:16009] 0 1 0 0 0 0 1 0 0 0 ...
##  $ tf_boardgamecategory_exploration          : int [1:16009] 0 0 0 0 1 0 0 0 0 0 ...
##  $ tf_boardgamecategory_fantasy              : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_fighting             : int [1:16009] 0 0 0 0 1 0 0 0 0 0 ...
##  $ tf_boardgamecategory_horror               : int [1:16009] 0 0 0 0 1 0 0 0 0 0 ...
##  $ tf_boardgamecategory_humor                : int [1:16009] 0 0 0 1 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_medieval             : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_miniatures           : int [1:16009] 0 0 0 0 1 0 0 0 0 0 ...
##  $ tf_boardgamecategory_movies_tv_radio_theme: int [1:16009] 0 0 1 0 1 0 0 0 0 0 ...
##  $ tf_boardgamecategory_nautical             : int [1:16009] 0 0 0 0 0 0 0 1 0 0 ...
##  $ tf_boardgamecategory_negotiation          : int [1:16009] 0 1 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_party_game           : int [1:16009] 0 0 0 1 0 1 0 0 1 0 ...
##  $ tf_boardgamecategory_print_play           : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_puzzle               : int [1:16009] 0 0 0 0 0 0 0 0 1 0 ...
##  $ tf_boardgamecategory_racing               : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_realtime             : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_science_fiction      : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_boardgamecategory_trivia               : int [1:16009] 0 0 0 0 0 0 0 0 1 0 ...
##  $ tf_boardgamecategory_wargame              : int [1:16009] 1 0 0 0 0 0 0 1 0 0 ...
##  $ tf_boardgamecategory_world_war_ii         : int [1:16009] 0 0 0 0 0 0 0 0 0 0 ...

xgb_spec <-
  boost_tree(
    trees = tune(),
    mtry = tune(),
    min_n = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

xgb_wf <- workflow(game_rec, xgb_spec)
xgb_wf

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_tokenize()
## • step_tokenfilter()
## • step_tf()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = tune()
##   min_n = tune()
##   learn_rate = 0.01
## 
## Computational engine: xgboost

library(finetune)
doParallel::registerDoParallel()

set.seed(234)
xgb_game_rs <-
  tune_race_anova(
    xgb_wf,
    game_folds,
    grid = 20,
    control = control_race(verbose_elim = TRUE)
  )

## i Creating pre-processing data to finalize unknown parameter: mtry

## ℹ Racing will minimize the rmse metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 6 eliminated; 14 candidates remain.
## 
## ℹ Fold06: 8 eliminated; 6 candidates remain.
## 
## ℹ Fold08: 2 eliminated; 4 candidates remain.
## 
## ℹ Fold01: 1 eliminated; 3 candidates remain.
## 
## ℹ Fold04: All but one parameter combination were eliminated.

xgb_game_rs

## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits               id     .order .metrics          .notes          
##    <list>               <chr>   <int> <list>            <list>          
##  1 <split [14407/1602]> Fold03      1 <tibble [40 × 7]> <tibble [0 × 3]>
##  2 <split [14408/1601]> Fold05      2 <tibble [40 × 7]> <tibble [0 × 3]>
##  3 <split [14410/1599]> Fold10      3 <tibble [40 × 7]> <tibble [0 × 3]>
##  4 <split [14408/1601]> Fold06      4 <tibble [28 × 7]> <tibble [0 × 3]>
##  5 <split [14408/1601]> Fold08      5 <tibble [12 × 7]> <tibble [0 × 3]>
##  6 <split [14407/1602]> Fold01      6 <tibble [8 × 7]>  <tibble [0 × 3]>
##  7 <split [14408/1601]> Fold04      7 <tibble [6 × 7]>  <tibble [0 × 3]>
##  8 <split [14407/1602]> Fold02      8 <tibble [2 × 7]>  <tibble [0 × 3]>
##  9 <split [14408/1601]> Fold07     10 <tibble [2 × 7]>  <tibble [0 × 3]>
## 10 <split [14410/1599]> Fold09      9 <tibble [2 × 7]>  <tibble [0 × 3]>

plot_race(xgb_game_rs)

show_best(xgb_game_rs)

## Warning: No value of `metric` was given; metric 'rmse' will be used.

## # A tibble: 1 × 9
##    mtry trees min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1    14  1709    17 rmse    standard   0.735    10 0.00550 Preprocessor1_Model08

xgb_last <-
  xgb_wf %>%
  finalize_workflow(select_best(xgb_game_rs, "rmse")) %>%
  last_fit(game_split)

xgb_last

## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits               id              .metrics .notes   .predictions .workflow 
##   <list>               <chr>           <list>   <list>   <list>       <list>    
## 1 <split [16009/5339]> train/test spl… <tibble> <tibble> <tibble>     <workflow>

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

xgb_fit <- extract_fit_parsnip(xgb_last)
vip(xgb_fit, geom = "point", num_features = 12)

library(SHAPforxgboost)

game_shap <-
  shap.prep(
    xgb_model = extract_fit_engine(xgb_fit),
    X_train = bake(game_prep,
      has_role("predictor"),
      new_data = NULL,
      composition = "matrix"
    )
  )

shap.plot.summary(game_shap)

shap.plot.dependence(
  game_shap,
  x = "minage",
  color_feature = "minplayers",
  size0 = 1.2,
  smooth = FALSE, add_hist = TRUE
)

1. What is the research question?

The question we are trying to answer is if we can predict the rating for a board game based on characteristics such as min or max players and category. # Describe the data briefly
The data is board game focused and has to do with ratings, number of players and characteristics of the game. # What are the characteristics of the key variables used in the analysis?
The primary variables we are looking at are rating of the game, the characteristics of the game such as what actions occur during the game and number of players that can play the game at a given time.

#2. Data Exploration and Transformation: - The newly transformed data includes a combination of the details data and the ratings data. Also further along in the prep of the data, we elected a few key variables instead of all the possible observations. These changes allowed us to look at all data as a whole as well as cut off any unnecessary data when we trimmed it down.

3. Data Preparation and Modeling:

Some of the data prep steps used in this code along include step_tokenize(creates a specification of a recipe step that will convert a character to a token variable), step_tokenfilter(creates a specification of a recipe step that will convert a token variable to be filtered by frequency), and step_tf(converts token variable into multiple variables). Among these we also used a custom_token for the first time. We also used xgboost(this is a scalable distributed GBDT. Provides parallel tree boosting).

#4. Model Evaluation: - We used plot_race to see how all of the parameter combinations were doing during the analysis. This showed us the effectiveness of not evaluating all of the combinations, only the ones that were showing promise.

5. Conclusion:

The key findings from the analysis of the dataset are that maximum playing time and minimum age have the most influence on the predicted rating a game is to receive.

Board Games

Colin Tracy

2023-09-26

Tune an xgboost model

1. What is the research question?

3. Data Preparation and Modeling:

5. Conclusion: