1. Introduction

This document will outline the instructions of how to use the the Extreme Gradient Boosting (XGBoost) model in predicting the margin of a sports match. XGBoost is a decision tree based machine learning algorithm that can be used to regression and classification problems.

In this example I will be applying the XGBoost model to calculate the margins for the 2023 AFL season.

2. Load packages

Load the following packages, xgboost, fiztzRoy and tidymodels, gtools

## load packages
library(xgboost)
library(fitzRoy)
library(tidymodels)
library(caret)
library(gtools)

3. Import, explore and manipulate Data

Using the FitzRoy package import the data. The FitzRoy package scrapes publicly available AFL data from sources like AFL Tables we will import the 2023 AFL season into a dataframe and call the afl_raw

The data is packaged up quite nicely ready to be used, however its good practice to explore the data.

You can look at the structure of the data, get summary stats to do some basic exploratory data analysis (EDA).

## import data using FitzRoy package
afl_raw <- fetch_player_stats_afltables(2023)

## look at dataframe structure
str(afl_raw)

## view dataframe summary
summary(afl_raw)

4. Manipulate and prepare data

You can manipulate and prepare the data based on what you want to use it for. For this example we will filter the data based on the final round of the 2023 regular season and select the required variables that will help us predict future match ups.

For this model we will keep it simple and use total number of kicks by each team this requires us to split the data between home team and away team total their kicks then combine them later.

We will also filter the 2023 AFL finals series to use later to make predictions on.

## get required variables and store in new dataframe
afl_23_home <- afl_raw %>%
  group_by(Round, Home.team, Playing.for) %>% # group by home team and playing for
  mutate(Home.Kicks = ifelse(Home.team == Playing.for, sum(Kicks), NA)) %>%
  filter(row_number() == 1) %>% 
  filter(!is.na(Home.Kicks)) %>% # filter NA results from if else to remove away teams
  ungroup() %>%
  select(Season, Round, Home.team, Home.score, Home.Kicks) #select variables
  
afl_23_away <- afl_raw %>%
  group_by(Round, Away.team, Playing.for) %>% # group by away team and playing for
  mutate(Away.Kicks = ifelse(Away.team == Playing.for, sum(Kicks), NA)) %>%
  filter(row_number() == 1) %>% 
  filter(!is.na(Away.Kicks)) %>% # filter NA results from if else to remove away teams
  ungroup() %>%
  select(Season, Round, Away.team, Away.score, Away.Kicks) #select variables

## combine home and away dfs
afl_23 <- afl_23_home %>% 
  cbind(afl_23_away) %>% 
  select(c(1, 2, 3, 4, 5, , 8, 9, 10)) %>% 
  as_tibble() %>% 
  mutate(Margin = Home.score - Away.score) 

## create AFL regular season dataset
  afl_23_regular <- afl_23 %>%
  filter(!Round %in% c("QF",
                        "EF",
                        "SF",
                        "PF",
                        "GF")) 


## create AFL finals dataset
  afl_23_finals <- afl_23 %>%
  filter(Round %in% c("QF",
                        "EF",
                        "SF",
                        "PF",
                        "GF"))

5. Build XGBoost model

Build an initial XGboost model, and evalute its performance, we will further tune this model.

## create a xgBoost spec
xg_spec <- boost_tree() %>% 
  set_engine("xgboost") %>% 
  set_mode("regression")

## fit xgBoost model to afl 23  data
xg_fit <- xg_spec %>% 
  fit(Margin ~ Home.Kicks + Away.Kicks , data = afl_23_regular)

Evaluate model performance

## create dataframe with initial model results
 predicted <- augment(xg_fit, new_data = afl_23_regular) %>%
   select(Round, Home.team, Home.score, Away.team, Away.score, Margin, .pred, .resid)
 
## RMSE and R2 for initial model
  RMSE_initial <- RMSE(predicted$.pred, predicted$Margin)
  R2_intial <- R2(predicted$.pred, predicted$Margin)

7. Hyperparameter tuning and update model

We will now perform hyperparameter tuning on the intial XGBoost model. This process is essential and allows us to build a better performing model.

To do this we will split the data between train and test then resample the train data using cross validation, we will also tune the models node before splitting, tree depth and learn rate.

## split data
afl_split <- initial_split(afl_23_regular)
afl_train <- training(afl_split) #train
afl_test <- testing(afl_split) #test

## Hypeparameter tune
## create a new random forest spec for tuning
xg_spec2 <- boost_tree(min_n = tune(),
                       learn_rate = tune(), 
                       tree_depth = tune()) %>%
            set_engine("xgboost") %>%
            set_mode("regression")


## create a new workflow
xg_workflow <- workflow()%>%
  add_model(xg_spec2) %>%
  add_formula(Margin ~ Home.Kicks + Away.Kicks)

## create grid to store parameters
grid <- grid_regular(tree_depth(),
             min_n(),
             learn_rate(),
             levels = 5)

## create a cross validation set on train
afl_folds <- vfold_cv(afl_train)

## fit resamples to tune grid
xgb_res <-
  tune_grid(
    xg_workflow,
    afl_folds,
    grid
  )

## show best tuning metrics
show_best(xgb_res, "rmse")
## # A tibble: 5 × 9
##   min_n tree_depth learn_rate .metric .estimator  mean     n std_err .config    
##   <int>      <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>      
## 1     5          2     0.221  rmse    standard    32.8    10    2.04 Preprocess…
## 2    12          7     0.158  rmse    standard    32.9    10    2.31 Preprocess…
## 3     8         11     0.0614 rmse    standard    33.0    10    2.39 Preprocess…
## 4    30         13     0.0516 rmse    standard    34.8    10    2.88 Preprocess…
## 5    20          5     0.0247 rmse    standard    35.9    10    3.08 Preprocess…
best_rmse <- select_best(xgb_res, "rmse")
best_rmse
## # A tibble: 1 × 4
##   min_n tree_depth learn_rate .config              
##   <int>      <int>      <dbl> <chr>                
## 1     5          2      0.221 Preprocessor1_Model03
## finalise workflow tune model using best rmse
final_xgb <- finalize_workflow(
  xg_workflow,
  best_rmse
)

final_xgb
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Formula
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## Margin ~ Home.Kicks + Away.Kicks
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   min_n = 5
##   tree_depth = 2
##   learn_rate = 0.221259373580151
## 
## Computational engine: xgboost

Fit newly tuned model to train and test.

## fit tuned model to train
xg_fit2 <- 
  final_xgb %>% 
  fit(data = afl_train)

## fit tuned model to test
xg_fit2 <- 
  final_xgb %>% 
  fit(data = afl_test)

8. Make predictions

We will now make margin predictions on the 2023 AFL finals series.

First we will create a dataframe that stores all possible match ups of the 2023 finals series.

## create a new df for all possible finals match ups
finals_matchups <- 
  permutations(n = 8, r = 2, c("Carlton",
                        "Melbourne",
                        "Sydney",
                        "Collingwood",
                        "St Kilda",
                        "Brisbane Lions",
                        "GWS",
                        "Port Adelaide"), 
                             repeats.allowed = FALSE) %>% 
  as.data.frame()

We will now add the teams margin predictions to the finals match ups.

## join matchups with afl
finals_matchups2 <- finals_matchups %>%
  rename(Home.team = V1,
         Away.team = V2) %>%
  right_join(afl_23_finals, by = c("Home.team","Away.team"))


## add win probabilities to matchups
 matchups_xg <- 
  augment(xg_fit2, new_data = finals_matchups2) %>% 
  select(Home.team, Away.team, .pred) %>%
  mutate_if(is.numeric, round, 0)

Finally we combine the predicted margins to the AFL finals to give us the margins predictions for the finals matches. A positive predicted margin means a win for the home team a Negative predicted margin suggests a win in favour of the away team.

afl_23_finals2 <- afl_23_finals %>%
 left_join(matchups_xg, by = c("Home.team", "Away.team")) %>%
  select(-c("Margin", "Home.score", "Away.score", "Home.Kicks", "Away.Kicks")) %>%
  rename(Predicted.Margin = .pred)