Our modeling goal is to predict ratings for board games based on characteristics such as the number of players and the game category. How are the ratings distributed?

1 Explore Data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
ratings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-25/ratings.csv")
## Rows: 21831 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): name, url, thumbnail
## dbl (7): num, id, year, rank, average, bayes_average, users_rated
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
details <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-25/details.csv")
## Rows: 21631 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): primary, description, boardgamecategory, boardgamemechanic, boardg...
## dbl (13): num, id, yearpublished, minplayers, maxplayers, playingtime, minpl...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Join the data

ratings_joined <- ratings %>%
    left_join(details, by = "id")
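
To answer the opening question about how the ratings are distributed, a quick histogram of average works (a minimal sketch; the bin count is an arbitrary choice):

ratings_joined %>%
    ggplot(aes(average)) +
    geom_histogram(bins = 30)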

Create a binary target variable

# Is it in the top 25% by average rating? The cutoff 7.04 is the 75th
# percentile of average (see p75 in the skim output below).

ratings_cat <- ratings_joined %>%
    mutate(is_top_25 = if_else(average > 7.04, "yes", "no"))
skimr::skim(ratings_cat)
Data summary

Name                        ratings_cat
Number of rows              21831
Number of columns           33
Column type frequency:
  character                 14
  numeric                   19
Group variables             None

Variable type: character

skim_variable            n_missing  complete_rate  min    max  empty  n_unique  whitespace
name                             0           1.00    1    107      0     21432           0
url                              0           1.00   16     68      0     21831           0
thumbnail                        6           1.00  135    139      0     21816           0
primary                        200           0.99    1    107      0     21236           0
description                    201           0.99   49  16144      0     21615           0
boardgamecategory              483           0.98    8    216      0      6730           0
boardgamemechanic             1790           0.92    8    478      0      8291           0
boardgamefamily               3961           0.82   13   2768      0     11285           0
boardgameexpansion           16325           0.25    7  18150      0      5264           0
boardgameimplementation      16969           0.22    6    890      0      4247           0
boardgamedesigner              796           0.96    7    332      0      9136           0
boardgameartist               6107           0.72    6   8408      0      9080           0
boardgamepublisher             201           0.99    6   3744      0     11265           0
is_top_25                        0           1.00    2      3      0         2           0

Variable type: numeric

skim_variable    n_missing  complete_rate       mean         sd        p0       p25        p50        p75       p100  hist
num.x                    0           1.00   10915.00    6302.21      0.00   5457.50   10915.00   16372.50   21830.00  ▇▇▇▇▇
id                       0           1.00  118144.78  105369.55      1.00  12308.50  104994.00  207219.00  350992.00  ▇▂▃▃▂
year                     0           1.00    1987.44     193.51      0.00   2001.00    2011.00    2017.00    3500.00  ▁▁▇▁▁
rank                     0           1.00   10916.00    6302.21      1.00   5458.50   10916.00   16373.50   21831.00  ▇▇▇▇▇
average                  0           1.00       6.42       0.93      1.04      5.83       6.45       7.04       9.57  ▁▁▅▇▁
bayes_average            0           1.00       5.68       0.36      0.00      5.51       5.54       5.67       8.51  ▁▁▁▇▁
users_rated              0           1.00     866.96    3679.82     30.00     56.00     122.00     392.00  108975.00  ▇▁▁▁▁
num.y                  200           0.99   10815.00    6244.48      0.00   5407.50   10815.00   16222.50   21630.00  ▇▇▇▇▇
yearpublished          200           0.99    1986.09     210.04  -3500.00   2001.00    2011.00    2017.00    2023.00  ▁▁▁▁▇
minplayers             200           0.99       2.01       0.69      0.00      2.00       2.00       2.00      10.00  ▇▁▁▁▁
maxplayers             200           0.99       5.71      15.10      0.00      4.00       4.00       6.00     999.00  ▇▁▁▁▁
playingtime            200           0.99      90.51     534.83      0.00     25.00      45.00      90.00   60000.00  ▇▁▁▁▁
minplaytime            200           0.99      63.65     447.21      0.00     20.00      30.00      60.00   60000.00  ▇▁▁▁▁
maxplaytime            200           0.99      90.51     534.83      0.00     25.00      45.00      90.00   60000.00  ▇▁▁▁▁
minage                 200           0.99       9.61       3.64      0.00      8.00      10.00      12.00      25.00  ▂▇▆▁▁
owned                  200           0.99    1487.92    5395.08      0.00    150.00     322.00     903.50  168364.00  ▇▁▁▁▁
trading                200           0.99      43.59     102.41      0.00      5.00      13.00      38.00    2508.00  ▇▁▁▁▁
wanting                200           0.99      42.03     117.94      0.00      3.00       9.00      29.00    2011.00  ▇▁▁▁▁
wishing                200           0.99     233.66     800.66      0.00     14.00      39.00     131.00   19325.00  ▇▁▁▁▁
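
The cutoff of 7.04 chosen earlier matches p75 of average in this table. As a cross-check, the quantile can be computed directly rather than hard-coded (a one-liner not in the original code):

# 75th percentile of the average rating; approximately 7.04
quantile(ratings_joined$average, 0.75)
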
ratings_cat %>%
    ggplot(aes(is_top_25)) +
    geom_bar()

ratings_cat %>%
    filter(!is.na(minage)) %>%
    mutate(minage = cut_number(minage, 4)) %>%
    ggplot(aes(y = minage, fill = is_top_25)) +
    geom_bar(position = "fill")
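
A numeric companion to the stacked bar chart makes the pattern concrete (a small sketch reusing the same binning):

ratings_cat %>%
    filter(!is.na(minage)) %>%
    mutate(minage = cut_number(minage, 4)) %>%
    count(minage, is_top_25) %>%
    group_by(minage) %>%
    mutate(prop = n / sum(n)) %>%
    ungroup()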

2 Tune an xgboost model

xgboost is a good fit for a large dataset like this. The tuning strategy is to throw a large set of hyperparameter combinations at the model and see which one sticks. First, split the data.

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.1     ✔ rsample      1.1.1
## ✔ dials        1.1.0     ✔ tune         1.0.1
## ✔ infer        1.0.4     ✔ workflows    1.1.2
## ✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
## ✔ parsnip      1.0.3     ✔ yardstick    1.1.0
## ✔ recipes      1.0.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
game_split <- ratings_cat %>%
    select(name, is_top_25, matches("min|max"), boardgamecategory) %>%
    na.omit() %>%
    initial_split(strata = is_top_25)

game_train <- training(game_split)
game_test  <- testing(game_split)

set.seed(234)
game_folds <- rsample::vfold_cv(game_train, strata = is_top_25)
game_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [14409/1601]> Fold01
##  2 <split [14409/1601]> Fold02
##  3 <split [14409/1601]> Fold03
##  4 <split [14409/1601]> Fold04
##  5 <split [14409/1601]> Fold05
##  6 <split [14409/1601]> Fold06
##  7 <split [14409/1601]> Fold07
##  8 <split [14409/1601]> Fold08
##  9 <split [14409/1601]> Fold09
## 10 <split [14409/1601]> Fold10

Feature engineering

library(textrecipes)

split_category <- function(x) {
    # Turn a raw boardgamecategory string like "['Card Game', 'Fantasy']"
    # into clean snake_case tokens: c("card_game", "fantasy").
    x %>%
        str_split(", ") %>%
        # equivalently: map(~ str_remove_all(.x, "[:punct:]"))
        map(str_remove_all, "[:punct:]") %>%  # drop brackets and quotes
        map(str_squish) %>%                   # trim stray whitespace
        map(str_to_lower) %>%
        map(str_replace_all, " ", "_")
}
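
As a quick sanity check, the tokenizer can be run on a made-up string in the same format as boardgamecategory (hypothetical input):

split_category("['Card Game', 'Fantasy']")
## [[1]]
## [1] "card_game" "fantasy"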

game_rec <- recipe(is_top_25 ~ ., data = game_train) %>%
    # keep name for identification only, not as a predictor
    update_role(name, new_role = "id") %>%
    step_tokenize(boardgamecategory, custom_token = split_category) %>%
    # keep only the 30 most frequent category tokens
    step_tokenfilter(boardgamecategory, max_tokens = 30) %>%
    # one term-frequency (count) column per remaining category
    step_tf(boardgamecategory)

game_prep <- prep(game_rec)
bake(game_prep, new_data = game_train)
## # A tibble: 16,010 × 37
##    name   minpl…¹ maxpl…² minpl…³ maxpl…⁴ minage is_to…⁵ tf_bo…⁶ tf_bo…⁷ tf_bo…⁸
##    <fct>    <dbl>   <dbl>   <dbl>   <dbl>  <dbl> <fct>     <int>   <int>   <int>
##  1 Forbi…       2       4      30      30     10 no            0       0       1
##  2 Munch…       3       6      60     120     10 no            0       0       0
##  3 Bohna…       2       7      45      45     13 no            0       0       0
##  4 Coup         2       6      15      15     13 no            0       0       0
##  5 Sushi…       2       5      15      15      8 no            0       0       0
##  6 Risk         2       6     120     120     10 no            0       0       0
##  7 Monop…       2       8      60     180      8 no            0       0       0
##  8 Machi…       2       4      30      30     10 no            0       0       0
##  9 Scrab…       2       4      90      90     10 no            0       0       0
## 10 BANG!        4       7      20      40     10 no            0       0       0
## # … with 16,000 more rows, 27 more variables:
## #   tf_boardgamecategory_ancient <int>, tf_boardgamecategory_animals <int>,
## #   tf_boardgamecategory_bluffing <int>, tf_boardgamecategory_card_game <int>,
## #   tf_boardgamecategory_childrens_game <int>,
## #   tf_boardgamecategory_deduction <int>, tf_boardgamecategory_dice <int>,
## #   tf_boardgamecategory_economic <int>,
## #   tf_boardgamecategory_exploration <int>, …
library(usemodels)
# use_xgboost() prints scaffolding code (recipe, spec, workflow, tuning grid)
# to the console; the specification below adapts it.
use_xgboost(is_top_25 ~ ., data = game_train)

Model specification

xgb_spec <-
  boost_tree(
    trees = tune(),
    mtry = tune(),
    min_n = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

Workflow

xgb_wf <- workflow(game_rec, xgb_spec)
xgb_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_tokenize()
## • step_tokenfilter()
## • step_tf()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = tune()
##   min_n = tune()
##   learn_rate = 0.01
## 
## Computational engine: xgboost

Tune

library(finetune)
doParallel::registerDoParallel()

set.seed(99383)
# A faster alternative: racing drops clearly losing candidates early.
# xgboost_tune <-
#     finetune::tune_race_anova(
#         xgb_wf,
#         game_folds,
#         grid = 5,
#         control = control_race(verbose_elim = TRUE)
#     )

xgb_game_rs <-
    tune_grid(
        xgb_wf,
        game_folds,
        grid = 2,
        # the custom tokenizer needs stringr and purrr on the parallel workers
        control = control_grid(pkgs = c("stringr", "purrr"))
    )
## i Creating pre-processing data to finalize unknown parameter: mtry

3 Evaluate models

3.a View the best model

collect_metrics(xgb_game_rs)
## # A tibble: 4 × 9
##    mtry trees min_n .metric  .estimator  mean     n std_err .config             
##   <int> <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1    10  1303    16 accuracy binary     0.808    10 0.00380 Preprocessor1_Model1
## 2    10  1303    16 roc_auc  binary     0.829    10 0.00560 Preprocessor1_Model1
## 3    31   637    30 accuracy binary     0.805    10 0.00364 Preprocessor1_Model2
## 4    31   637    30 roc_auc  binary     0.824    10 0.00593 Preprocessor1_Model2
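
With larger grids, show_best() ranks the candidates by a single metric instead of scanning the whole table (standard tune function, shown as an optional check):

show_best(xgb_game_rs, metric = "roc_auc")
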
# collect_predictions() and conf_mat_resampled() need predictions saved
# during tuning, i.e. control_grid(save_pred = TRUE):

# collect_predictions(xgb_game_rs)

# conf_mat_resampled(xgb_game_rs, tidy = FALSE) %>%
#     autoplot()

3.b Fit the best model on the training data (finalize_workflow) and evaluate it on the test data (last_fit)

xgb_last <- 
    xgb_wf %>% 
    finalize_workflow(select_best(xgb_game_rs, metric = "accuracy")) %>%
    last_fit(game_split)

3.c Collect metrics

xgb_last %>% collect_metrics()
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.800 Preprocessor1_Model1
## 2 roc_auc  binary         0.820 Preprocessor1_Model1

3.d Predict (extract_workflow)

xgb_last %>%
    extract_workflow() %>%
    predict(game_test[1,])
## # A tibble: 1 × 1
##   .pred_class
##   <fct>      
## 1 no
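
The same workflow can also return class probabilities (standard parsnip type = "prob"; output omitted here):

xgb_last %>%
    extract_workflow() %>%
    predict(game_test[1, ], type = "prob")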

3.e Important variables

Using the vip package (extract_fit_parsnip)

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgb_fit <- extract_fit_parsnip(xgb_last)
vip::vip(xgb_fit, geom = "point", num_features = 15)
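
If a table is preferable to a plot, vi() from the same package returns the importances as a tibble (a one-line sketch):

vip::vi(xgb_fit)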

Using the SHAPforxgboost package

library(SHAPforxgboost)

game_shap <- 
    shap.prep(
        xgb_model = extract_fit_engine(xgb_fit),  
        X_train = bake(game_prep, 
                       has_role("predictor"), 
                       new_data = NULL, 
                       composition = "matrix")
        )

shap.plot.summary(game_shap)

shap.plot.dependence(
  game_shap,
  x = "minage",
  color_feature = "minplayers",
  size0 = 1.2,
  smooth = FALSE, add_hist = TRUE
)

CA Questions

  1. Question and Data

1.a What is the research question?

1.b Describe the data briefly.

1.c Describe the first observation (the first row) in the data.

  2. Data Exploration and Transformation

2.a What are the characteristics of the key variables used in the analysis? For example, which of the predictors seems to have correlations with the outcome variable? Does any variable have a skewed distribution, missing values, or incorrect information?

2.b Describe the differences between the original data and the data transformed for modeling. Why?

  3. Data Preparation and Modeling

3.a What are the names of data preparation steps (e.g., step_dummy) mentioned in the video?

3.b What is the name of the machine learning models (e.g., random forest) used in the analysis?

3.c Describe the steps taken for data preparation. Why?

3.d Describe the characteristics of the models used in the analysis, including their advantages and disadvantages.

  4. Model Evaluation

4.a What metrics are used in the model evaluation?

4.b Describe the characteristics of the metrics used in the analysis, including their advantages and disadvantages.

  5. Conclusion

5.a What are the major findings?