library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
ratings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-25/ratings.csv")
## Rows: 21831 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): name, url, thumbnail
## dbl (7): num, id, year, rank, average, bayes_average, users_rated
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
details <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-25/details.csv")
## Rows: 21631 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): primary, description, boardgamecategory, boardgamemechanic, boardg...
## dbl (13): num, id, yearpublished, minplayers, maxplayers, playingtime, minpl...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ratings_joined <-
ratings %>%
left_join(details, by = "id")
ggplot(ratings_joined, aes(average)) +
geom_histogram(alpha = 0.8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ratings_joined %>%
filter(!is.na(minage)) %>%
mutate(minage = cut(minage, 4)) %>%
ggplot(aes(minage, average, fill = minage)) +
geom_boxplot(alpha = 0.2, show.legend = FALSE)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
set.seed(123)
game_split <-
ratings_joined %>%
select(name, average, matches("min|max"), boardgamecategory) %>%
na.omit() %>%
initial_split(strata = average)
game_train <- training(game_split)
game_test <- testing(game_split)
set.seed(234)
game_folds <- vfold_cv(game_train, strata = average)
game_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [14407/1602]> Fold01
## 2 <split [14407/1602]> Fold02
## 3 <split [14407/1602]> Fold03
## 4 <split [14408/1601]> Fold04
## 5 <split [14408/1601]> Fold05
## 6 <split [14408/1601]> Fold06
## 7 <split [14408/1601]> Fold07
## 8 <split [14408/1601]> Fold08
## 9 <split [14410/1599]> Fold09
## 10 <split [14410/1599]> Fold10
library(textrecipes)
split_category <- function(x) {
x %>%
str_split(", ") %>%
map(str_remove_all, "[:punct:]") %>%
map(str_squish) %>%
map(str_to_lower) %>%
map(str_replace_all, " ", "_")
}
game_rec <-
recipe(average ~ ., data = game_train) %>%
update_role(name, new_role = "id") %>%
step_tokenize(boardgamecategory, custom_token = split_category) %>%
step_tokenfilter(boardgamecategory, max_tokens = 30) %>%
step_tf(boardgamecategory)
game_prep <- prep(game_rec)
bake(game_prep, new_data = NULL)
## # A tibble: 16,009 × 37
## name minplayers maxplayers minplaytime maxplaytime minage average
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Risk 2 6 120 120 10 5.59
## 2 Monopoly 2 8 60 180 8 4.37
## 3 UNO 2 10 30 30 6 5.41
## 4 Apples to Apples 4 10 30 30 12 5.79
## 5 Zombies!!! 2 6 60 90 15 5.8
## 6 Jenga 1 8 20 20 6 5.62
## 7 The Game of Life 2 6 60 60 8 4.31
## 8 Battleship 2 2 30 30 8 4.66
## 9 Cranium 4 16 60 60 13 5.68
## 10 Phase 10 2 6 45 45 8 5.14
## # ℹ 15,999 more rows
## # ℹ 30 more variables: tf_boardgamecategory_abstract_strategy <int>,
## # tf_boardgamecategory_action_dexterity <int>,
## # tf_boardgamecategory_adventure <int>, tf_boardgamecategory_ancient <int>,
## # tf_boardgamecategory_animals <int>, tf_boardgamecategory_bluffing <int>,
## # tf_boardgamecategory_card_game <int>,
## # tf_boardgamecategory_childrens_game <int>, …
xgb_spec <-
boost_tree(
trees = tune(),
mtry = tune(),
min_n = tune(),
learn_rate = 0.01
) %>%
set_engine("xgboost") %>%
set_mode("regression")
xgb_wf <- workflow(game_rec, xgb_spec)
xgb_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_tokenize()
## • step_tokenfilter()
## • step_tf()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## mtry = tune()
## trees = tune()
## min_n = tune()
## learn_rate = 0.01
##
## Computational engine: xgboost
library(finetune)
doParallel::registerDoParallel()
set.seed(234)
xgb_game_rs <-
tune_race_anova(
xgb_wf,
game_folds,
grid = 20,
control = control_race(verbose_elim = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will minimize the rmse metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 6 eliminated; 14 candidates remain.
##
## ℹ Fold06: 8 eliminated; 6 candidates remain.
##
## ℹ Fold08: 2 eliminated; 4 candidates remain.
##
## ℹ Fold01: 1 eliminated; 3 candidates remain.
##
## ℹ Fold04: All but one parameter combination were eliminated.
xgb_game_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 5
## splits id .order .metrics .notes
## <list> <chr> <int> <list> <list>
## 1 <split [14407/1602]> Fold03 1 <tibble [40 × 7]> <tibble [0 × 3]>
## 2 <split [14408/1601]> Fold05 2 <tibble [40 × 7]> <tibble [0 × 3]>
## 3 <split [14410/1599]> Fold10 3 <tibble [40 × 7]> <tibble [0 × 3]>
## 4 <split [14408/1601]> Fold06 4 <tibble [28 × 7]> <tibble [0 × 3]>
## 5 <split [14408/1601]> Fold08 5 <tibble [12 × 7]> <tibble [0 × 3]>
## 6 <split [14407/1602]> Fold01 6 <tibble [8 × 7]> <tibble [0 × 3]>
## 7 <split [14408/1601]> Fold04 7 <tibble [6 × 7]> <tibble [0 × 3]>
## 8 <split [14407/1602]> Fold02 8 <tibble [2 × 7]> <tibble [0 × 3]>
## 9 <split [14408/1601]> Fold07 10 <tibble [2 × 7]> <tibble [0 × 3]>
## 10 <split [14410/1599]> Fold09 9 <tibble [2 × 7]> <tibble [0 × 3]>
plot_race(xgb_game_rs)
show_best(xgb_game_rs)
## Warning: No value of `metric` was given; metric 'rmse' will be used.
## # A tibble: 1 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 14 1709 17 rmse standard 0.735 10 0.00550 Preprocessor1_Model08
xgb_last <-
xgb_wf %>%
finalize_workflow(select_best(xgb_game_rs, "rmse")) %>%
last_fit(game_split)
xgb_last
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [16009/5339]> train/test spl… <tibble> <tibble> <tibble> <workflow>
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
xgb_fit <- extract_workflow(xgb_last)
vip(xgb_fit, geom = "point", num_features = 12)
library(SHAPforxgboost)
game_shap <-
shap.prep(
xgb_model = extract_fit_engine(xgb_fit),
X_train = bake(game_prep,
has_role("predictor"),
new_data = NULL,
composition = "matrix"
)
)
shap.plot.summary(game_shap)
shap.plot.dependence(
game_shap,
x = "minage",
color_feature = "minplayers",
size0 = 1.2,
smooth = FALSE, add_hist = TRUE
)
What is the research question? Clearly state the research question you aim to address using the new dataset. Can we predict ratings for board games based on the main characteristics like number of players and game category? How are the ratings distributed?
Describe the data briefly: Provide an overview of the new data set, highlighting its key characteristics and dimensions. The data sets are all about board games and board game ratings. Various information about board games are included in the data sets. The dimensions of the ratings.csv data set are 21,831 rows and 10 columns. The dimensions of the details.csv data set is 21,631 rows and 23 columns. The key variables are “name” which describes the name of the board games. “average” is the average rating of the board game. “year” is the year the board game was released. “boardgamecategory” is a description of the type of category that the boardgame falls into. “minplayers” and “maxplayers” are also key variables that provide insight on how many minimum players and maximum players are needed to play each game.
What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. “average” is numerical data and was used as the target variable in the XGBoost model. “name” is character data and is important because the name of the board game is essential for referencing the games. “year” is numeric data and could be used to look at trends over time such as if old games rank better or worse than new games. “boardgamecategory” is character data and was used to determine if the board game category had any connection to the rating of the board game. “minplayers” and “maxplayers” are numerical data, and these variables tell us how inclusive or exclusive a game is which could influence ratings.
What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. A preparation step used on the code is na.omit() which we used to remove rows with missing values. We also used as.integer to convert the “year” column to integer data. This way the data will be treated as numeric not character. We also filtered the data to only include rows where the year was greater than or equal to 2000 using the subset function. This way we were using more recent records. We also used (x-min(x))/(max(x)-min(x)) to bring the values between 0 and 1. Value_squared() and value_log() were used based on the “value” column. The squared coulmn captures non-linear relationships and the log-transformed column can look at skewness in the data. Lastly, we used head() to select only the first 1,000 rows of the data set which made the data set more manageable.
What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning model used in the analysis is XGBoost. We used set_engine(xgboost) with set_mode(regression) to make a regression model that predicts board game ratings.