Data: Football-data.co.uk

Get the dataset by directly reading, via read.csv, the url that contains the csv files for the relevant dates. A function has been created to include more dates.

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

fbdata_base_url <- "https://www.football-data.co.uk/mmz4281/%s/E0.csv"

## DOWNLOAD EPL DATA FUNCTION 
download_epl_data <- function(years){
  combined_df <- data.frame()
  
  for (year in years){
    url <- sprintf(fbdata_base_url, year)
    year_data <- read.csv(url)
    combined_df <- rbind(year_data)
    
    Sys.sleep(3)
  }
  return(combined_df)
}

years <- c('2223', '2324') ## get two seasons of data
combined_23_24 <- download_epl_data(years)
head(colnames(combined_23_24))

## [1] "Div"      "Date"     "Time"     "HomeTeam" "AwayTeam" "FTHG"

match_data <- combined_23_24 %>% select(Date, HomeTeam, AwayTeam, FTHG, FTAG, FTR)
## check
head(match_data)

##         Date         HomeTeam       AwayTeam FTHG FTAG FTR
## 1 11/08/2023          Burnley       Man City    0    3   A
## 2 12/08/2023          Arsenal  Nott'm Forest    2    1   H
## 3 12/08/2023      Bournemouth       West Ham    1    1   D
## 4 12/08/2023         Brighton          Luton    4    1   H
## 5 12/08/2023          Everton         Fulham    0    1   A
## 6 12/08/2023 Sheffield United Crystal Palace    0    1   A

Basic analysis: Home and Away win rates

## Best team at home 
home_team_win_rate  <- match_data %>% group_by(HomeTeam) %>% summarise(home_win_rate = mean(FTR == 'H'))
away_team_win_rate <- match_data %>% group_by(AwayTeam) %>% summarise(away_win_rate = mean(FTR == 'A'))

home_away_win_rates <-home_team_win_rate %>% left_join(away_team_win_rate, 
                                join_by('HomeTeam' == 'AwayTeam')) %>% rename(Team = HomeTeam) %>% 
  mutate('home_adv' = home_win_rate - away_win_rate) %>% 
  arrange(desc(home_win_rate)) 

home_away_win_rates%>% 
  ggplot(aes(x = reorder(Team, -home_win_rate))) + 
  geom_point(aes(y = home_win_rate, colour = 'Home Win Rate')) +
  geom_point(aes(y = away_win_rate, colour = 'Away Win Rate')) +
geom_segment(aes(x = Team, xend = Team, y = home_win_rate, yend = away_win_rate), 
               color = "grey", linetype = "dotted", size = 1) +
  labs(title = 'Home and Away win rates EPL teams', 
       x = 'Teams', y = 'Win Rates', color = 'Home/away')+
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Man City is the most consistent team home and away in winning, and that could be the reason for them being the defending champions. Liverpool and Arsenal have more variance, and perform less well in away games. For them to contest Man City for the title, it could be imperative for them to win their away games more often.

## Data preparation: convert team names to numeric indices
match_data$HomeTeamId <- as.integer(factor(match_data$HomeTeam))
match_data$AwayTeamId <- as.integer(factor(match_data$AwayTeam))
## verify 
match_data %>% filter(HomeTeam == 'Liverpool') %>% pull(HomeTeamId) %>% unique()

## [1] 11

match_data %>% filter(AwayTeam == 'Liverpool') %>% pull(AwayTeamId) %>% unique()

## [1] 11

## Finding the main outcome variable -- goal difference 
match_data['GoalDiff'] <- match_data$FTHG - match_data$FTAG

Looking at goal difference

## Visualise 
ggplot(match_data) + geom_bar(aes(x = GoalDiff), 
                              fill = 'lightblue') + 
  scale_x_continuous(breaks = seq(min(match_data$GoalDiff),
                                  max(match_data$GoalDiff),
                                  by = 1)) +
  labs(title = 'Distribution of goal difference (Home - Away)')+
  theme_bw()

Home minus Away goals

Interestingly, goal difference is relatively normal looking, although the drop off is more steep when moving left of 0 than right.

1. Terry Bradley Model : win/loss, score not taken into account

A popular method for estimating team/player strengths from analysing pairwise datapoints (i.e. matches between two players). It assumes that each comparison is an independent datapoint, which is obviously contentious in most sporting scenarios.

Link to R Blogpost

unique_teams <- sort(unique(c(combined_23_24$Home, 
                              combined_23_24$Away)))
## According to the blog, we need to get a specific data matrix
# such that for each row of the game data, we have a vector of n_teams
# where the element of the home team is 1 and that of away team is -1
# else its 0

## get dataframe for bradley terry model
get_data_vec <- function(home, away, unique_teams){
  ## init the vec -- matrix with indicator variable for the teams if theyre playing each other 
  vec <- rep(0, length(unique_teams))
  
  ## home teams: 1, away teams: -1
  vec[unique_teams == home] <- 1
  vec[unique_teams == away] <- -1 
  return(vec)
} ## this function applies to a single row of data and creates a corresponding row -- hence we need to iteratively apply across all rows of the data

X <- apply(match_data, 1,## 1 for row wise applying  
           function(row) get_data_vec(row['HomeTeam'],
                                      row['AwayTeam'], 
                                      unique_teams))
## check
dim(X)

## [1]  20 380

X <- t(X) ## transpose such that every row now is a match
colnames(X) <- unique_teams
## Create Y = binary indicator for home team winning 
y <- match_data$GoalDiff > 1
y <- as.integer(y)
## Recall that we have a lot of instances for draws. We can try
# considering draws as a loss for the home team, since they are supposed to have 
# home advantage, hence all else equal it is a disappointing result for them

## prepare df 
bt_df <- as.data.frame(cbind(X,y))
bt_model <- glm(y ~ ., data = bt_df, family = 'binomial')

## See results!
bt_results_df <- data.frame('coeff' = bt_model$coefficients)
bt_results_df <- bt_results_df %>% arrange(desc(coeff))
bt_results_df['Team'] <- rownames(bt_results_df); rownames(bt_results_df) <- NULL
bt_results_df <- bt_results_df %>% mutate(Team = ifelse(Team == '(Intercept)', 'HomeAdv', Team)) %>% 
  mutate(Team = stringr::str_replace(Team, pattern = '`', replacement = '')) %>% 
  mutate(Team = stringr::str_replace(Team, pattern = '`', replacement = '')) %>% 
  mutate(coeff = ifelse(is.na(coeff), 0, coeff))

library(ggrepel)
bt_results_df %>%
  ggplot(aes(x = reorder(Team, -coeff))) +  # Reorder by coefficient descending
  geom_segment(aes(x = Team, xend = Team, y = 0, yend = coeff), 
               color = "grey", linetype = "dotted") +  # Vertical lines from 0 to coefficient
  geom_point(aes(y = coeff), color = "blue", size = 3) +  # Highlight points
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +  # Rotate team names
  labs(title = "Team Strength Coefficients from Bradley-Terry Model",
       x = "Teams",
       y = "Coefficient (Team Strength)")  # Add informative labels

Compare results of BT model with base win rates for teams

bt_results_df %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>% select(Team, coeff, win_rate) %>% 
  ggplot(aes(x = win_rate, y = coeff)) + 
  geom_point() +
  geom_text_repel(aes(label = Team)) +
  labs(x = 'Win Rate', y = 'BT Model Coefficients', 
       title = 'Relationship between win base rate and the Terry Bradley coefficients', subtitle = '2022-2024' ) +
  theme_minimal()

## Joining with `by = join_by(Team)`

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).

There is a general agreement between the win rates and the BT Model coefficients.

The difference is that win rates is agnostic, whereas the BT model takes into account the strength of your opponents.

What if we remove the intercept?

## prepare df 
bt_model_no_incpt <- glm(y ~ . -1, data = bt_df, family = 'binomial')

## See results!
bt_results_df_no_intcpt <- data.frame('coeff' = bt_model_no_incpt$coefficients)
bt_results_df_no_intcpt <- bt_results_df_no_intcpt %>% arrange(desc(coeff))
bt_results_df_no_intcpt['Team'] <- rownames(bt_results_df_no_intcpt); rownames(bt_results_df_no_intcpt) <- NULL
bt_results_df_no_intcpt <- bt_results_df_no_intcpt %>% mutate(Team = ifelse(Team == '(Intercept)', 'HomeAdv', Team)) %>% 
  mutate(Team = stringr::str_replace(Team, pattern = '`', replacement = '')) %>% 
  mutate(Team = stringr::str_replace(Team, pattern = '`', replacement = '')) %>% 
  mutate(coeff = ifelse(is.na(coeff), 0, coeff))

bt_results_df_no_intcpt %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>% select(Team, coeff, win_rate) %>% 
  ggplot(aes(x = win_rate, y = coeff)) + 
  geom_point() +
  geom_text_repel(aes(label = Team)) +
  labs(x = 'Win Rate', y = 'BT Model Coefficients', 
       title = 'Relationship between win base rate and the Terry Bradley coefficients', subtitle = '2022-2024' ) +
  theme_minimal()

## Joining with `by = join_by(Team)`

Manually solve for BT model - via Maximum Likelihood Estimation

## Create map and team indices 
teams_index_map <- setNames(seq_along(unique_teams), unique_teams)
match_data <-match_data %>% mutate(
  WinnerIdx = ifelse(FTR == 'H', teams_index_map[HomeTeam], 
                                         teams_index_map[AwayTeam]), 
  LoserIdx = ifelse(FTR == 'A', teams_index_map[HomeTeam], 
                                         teams_index_map[AwayTeam]))
## Create negative log likelihood function 
neg_log_likelihood_func <- function(theta_rest, data) {
  theta <- c(theta_rest, 0) ## set last team's theta to 0
  winnerIdx <- data$WinnerIdx
  loserIdx <- data$LoserIdx
  lin_pred <- theta[winnerIdx] - theta[loserIdx]
  prob <- (1+exp(-lin_pred))^-1 ##logistic formula 
  log_prob <- log(prob)
  total_log_lik =  sum(log_prob, na.rm = T)## iteratively add to the total log likelihood

  return(-total_log_lik)
}


## Init the thetas that are going to be updated 
init_thetas <- rep(0, length(teams_index_map) -1)

## OPTIMISE!
manual_BT_results <- optim(par = init_thetas, fn = neg_log_likelihood_func, data = match_data, method = 'L-BFGS')
manual_BT_coeffs <- manual_BT_results$par
manual_BT_df <- data.frame(Team = names(teams_index_map), 
                           bt_manual_coeffs = c(manual_BT_coeffs, 0)) %>% arrange(desc(bt_manual_coeffs))
manual_BT_df

##                Team bt_manual_coeffs
## 1          Man City       2.45031181
## 2           Arsenal       2.04181355
## 3         Liverpool       1.97081368
## 4       Aston Villa       1.12632319
## 5         Tottenham       0.77543920
## 6           Chelsea       0.69566489
## 7         Newcastle       0.64269216
## 8        Man United       0.52776517
## 9          West Ham       0.29222555
## 10         Brighton       0.29082672
## 11   Crystal Palace       0.20154869
## 12          Everton       0.09064414
## 13      Bournemouth       0.07688813
## 14           Fulham       0.07016092
## 15           Wolves       0.00000000
## 16    Nott'm Forest      -0.42440288
## 17        Brentford      -0.54727382
## 18            Luton      -1.27204220
## 19          Burnley      -1.50094394
## 20 Sheffield United      -2.25301390

manual_BT_df %>%
  ggplot(aes(x = reorder(Team, -bt_manual_coeffs))) +  # Reorder by coefficient descending
  geom_segment(aes(x = Team, xend = Team, y = 0, yend = bt_manual_coeffs), 
               color = "grey", linetype = "dotted") +  # Vertical lines from 0 to coefficient
  geom_point(aes(y = bt_manual_coeffs), color = "blue", size = 3) +  # Highlight points
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +  # Rotate team names
  labs(title = "Team Strength Coefficients from MLE (BT Model)",
       x = "Teams",
       y = "Coefficient (Team Strength)")  # Add informative labels

This is an improvement from the previous method that uses GLM, as this prevents teams from having the same results. This also naturally excludes the home advantage parameter/ does not factor in home advantage.

Compare results of BT model (MANUAL MLE) with base win rates for teams

manual_BT_df %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>% select(Team, bt_manual_coeffs, win_rate) %>% 
  ggplot(aes(x = win_rate, y = bt_manual_coeffs)) + 
  geom_point() +
  geom_text_repel(aes(label = Team)) +
  labs(x = 'Win Rate', y = 'BT Model Coefficients', 
       title = 'Relationship between win base rate and the Terry Bradley coefficients (No intercept)', subtitle = '2022-2024' ) +
  theme_minimal()

## Joining with `by = join_by(Team)`

## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

The agreement is now better. In particular, the model correctly spots that Arsenal has performed slightly better than Liverpool.

Bayesian Modelling

Model the goal difference:

\[ Y_i = \text{Home Score}_i - \text{Away Score}_i \] \[ Y_i \sim \mathcal{N} \left( (\alpha_{h_i} - \alpha_{a_i}) + \theta, \sigma^2 \right) \]

Team strengths (home and away) are drawn from the same distribution:

\[ \alpha_j \sim \mathcal{N}(\mu_{\alpha}, \sigma_{\alpha}^2) \]

Home advantage is modeled as a fixed effect:

\[ \theta \sim \mathcal{N}(0, \sigma_{\theta}^2) \]

Prior for the mean of team strengths:

\[ \mu_{\alpha} \sim \mathcal{N}(0, 1) \]

Prior for the variance of team strengths:

\[ \sigma_{\alpha}^2 \sim \text{Half-Cauchy}(0, 1) \]

Prior for the variance of the error term:

\[ \sigma^2 \sim \text{Inverse-Gamma}(2, 2) \]

Full likelihood of the observed goal difference:

\[ Y_i \sim \mathcal{N} \left( (\alpha_{h_i} - \alpha_{a_i}) + \theta, \sigma^2 \right) \]

For the Bayesian Modelling, we try the package BRMS.

library(brms) ## package for bayesian modelling

## Loading required package: Rcpp

## Loading 'brms' package (version 2.22.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').

## 
## Attaching package: 'brms'

## The following object is masked from 'package:stats':
## 
##     ar

## fitting the bayesian model 
bayesian_fit <- brm(
  formula = GoalDiff ~ 1 +  (1|HomeTeam) + (1| AwayTeam), 
  data = match_data, 
  family = gaussian(), 
  prior = c(
    prior(normal(0, 1), class = "Intercept"),    # Prior for the home advantage (intercept)
    prior(normal(0, 1), class = "sd"),           # Prior for mu_alpha (mean of team strengths)
    prior(cauchy(0, 1), class = "sd", group = "HomeTeam"),  # Prior for sigma_alpha (variance of team strengths)
    prior(cauchy(0, 1), class = "sd", group = "AwayTeam"),  # Prior for sigma_alpha for away teams
    prior(inv_gamma(2, 2), class = "sigma")      # Prior for observation error variance (sigma^2)
  ),
  iter = 2000, 
chains = 4)

## Compiling Stan program...

## Trying to compile a simple C file

## Running /usr/lib/R/bin/R CMD SHLIB foo.c
## using C compiler: ‘gcc (Ubuntu 13.2.0-23ubuntu4) 13.2.0’
## gcc -I"/usr/share/R/include" -DNDEBUG   -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/Rcpp/include/"  -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/RcppEigen/include/"  -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/RcppEigen/include/unsupported"  -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/BH/include" -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/StanHeaders/include/src/"  -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/StanHeaders/include/"  -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/RcppParallel/include/"  -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DUSE_STANC3 -DSTRICT_R_HEADERS  -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION  -D_HAS_AUTO_PTR_ETC=0  -include '/home/itan/R/x86_64-pc-linux-gnu-library/4.3/StanHeaders/include/stan/math/prim/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1       -fpic  -g -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -ffile-prefix-map=/build/r-base-FPSnzf/r-base-4.3.3=. -fstack-protector-strong -fstack-clash-protection -Wformat -Werror=format-security -fcf-protection -fdebug-prefix-map=/build/r-base-FPSnzf/r-base-4.3.3=/usr/src/r-base-4.3.3-2build2 -Wdate-time -D_FORTIFY_SOURCE=3  -c foo.c -o foo.o
## In file included from /home/itan/R/x86_64-pc-linux-gnu-library/4.3/RcppEigen/include/Eigen/Core:19,
##                  from /home/itan/R/x86_64-pc-linux-gnu-library/4.3/RcppEigen/include/Eigen/Dense:1,
##                  from /home/itan/R/x86_64-pc-linux-gnu-library/4.3/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22,
##                  from <command-line>:
## /home/itan/R/x86_64-pc-linux-gnu-library/4.3/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: cmath: No such file or directory
##   679 | #include <cmath>
##       |          ^~~~~~~
## compilation terminated.
## make: *** [/usr/lib/R/etc/Makeconf:191: foo.o] Error 1

## Start sampling

## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 4.6e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.46 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 0.328 seconds (Warm-up)
## Chain 1:                0.246 seconds (Sampling)
## Chain 1:                0.574 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 3.5e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.35 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 0.315 seconds (Warm-up)
## Chain 2:                0.247 seconds (Sampling)
## Chain 2:                0.562 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 2.1e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.21 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 0.322 seconds (Warm-up)
## Chain 3:                0.246 seconds (Sampling)
## Chain 3:                0.568 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 2.7e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.27 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 0.31 seconds (Warm-up)
## Chain 4:                0.247 seconds (Sampling)
## Chain 4:                0.557 seconds (Total)
## Chain 4:

summary(bayesian_fit)

##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: GoalDiff ~ 1 + (1 | HomeTeam) + (1 | AwayTeam) 
##    Data: match_data (Number of observations: 380) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Multilevel Hyperparameters:
## ~AwayTeam (Number of levels: 20) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.67      0.16     0.40     1.03 1.00     1529     2116
## 
## ~HomeTeam (Number of levels: 20) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.89      0.18     0.59     1.31 1.00     1690     2240
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept     0.30      0.26    -0.22     0.80 1.00     1731     2390
## 
## Further Distributional Parameters:
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     1.77      0.07     1.65     1.91 1.00     6422     2950
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

plot(bayesian_fit)

away_team_results <- data.frame(ranef(bayesian_fit)$AwayTeam) %>% rename(AwayTeamStr = Estimate.Intercept, 
                                                                         AwayTeamStrError = Est.Error.Intercept) %>% 
  mutate(AwayTeamStr = -AwayTeamStr)
home_team_results <- data.frame(ranef(bayesian_fit)$HomeTeam) %>% rename(HomeTeamStr = Estimate.Intercept,
                                                                         HomeTeamStrError = Est.Error.Intercept)
team_results <- cbind(home_team_results, away_team_results) %>% select(HomeTeamStr, AwayTeamStr)
brms_team_results <- team_results %>% arrange(desc(HomeTeamStr), desc(AwayTeamStr)) %>% mutate(AvgTeamStr = 0.5*(HomeTeamStr+AwayTeamStr))

Compare to previous results and compare to win rate

brms_team_results %>% mutate(Team = rownames(brms_team_results)) %>% left_join(manual_BT_df) %>% select(Team, AvgTeamStr, bt_manual_coeffs) %>% 
  ggplot(aes(y = AvgTeamStr, x = bt_manual_coeffs)) +
  geom_point() +
  geom_text_repel(aes(label = Team)) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dotdash', color = 'red')+
  labs(y = 'BRMS Team Coefficients', x = 'BT Model Coefficients', 
       title = 'Relationship between BT and BRMS (Bayesian) Model Coefficients', subtitle = '2022-2024' ) +
  theme_minimal()

## Joining with `by = join_by(Team)`

## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Values are shrunk towards 0 compared to the BT coefficients.

brms_team_results %>% mutate(Team = rownames(brms_team_results)) %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>%  
  ggplot(aes(x = AvgTeamStr, y = win_rate)) +
  geom_point() +
  geom_text_repel(aes(label = Team)) +
  labs(x = 'BRMS Team Coefficients', y = 'Win Rates', 
       title = 'Relationship between Win Rates and BRMS (Bayesian) Model Coefficients', subtitle = '2022-2024' ) +
  theme_minimal()

## Joining with `by = join_by(Team)`

Correlate closely to base win rates as well.

Try with RStan

library(rstan)

## Loading required package: StanHeaders

## 
## rstan version 2.32.6 (Stan version 2.32.2)

## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
## change `threads_per_chain` option:
## rstan_options(threads_per_chain = 1)

stan_model_file <- 'stanmodel.stan'

## prepare the data as a list for stan 
stan_data <- list(
  N = nrow(match_data), 
  GoalDiff = match_data$GoalDiff, 
  N_home = length(unique(match_data$HomeTeam)),
  N_away = length(unique(match_data$AwayTeam)), 
  home_team = match_data$HomeTeamId, 
  away_team = match_data$AwayTeamId
)

## fir the model
stan_fit <- stan(file = stan_model_file, 
                 data = stan_data, 
                 chains = 4, 
                 iter = 1000, 
                 cores = 4)

## Trying to compile a simple C file

## Running /usr/lib/R/bin/R CMD SHLIB foo.c
## using C compiler: ‘gcc (Ubuntu 13.2.0-23ubuntu4) 13.2.0’
## gcc -I"/usr/share/R/include" -DNDEBUG   -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/Rcpp/include/"  -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/RcppEigen/include/"  -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/RcppEigen/include/unsupported"  -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/BH/include" -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/StanHeaders/include/src/"  -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/StanHeaders/include/"  -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/RcppParallel/include/"  -I"/home/itan/R/x86_64-pc-linux-gnu-library/4.3/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DUSE_STANC3 -DSTRICT_R_HEADERS  -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION  -D_HAS_AUTO_PTR_ETC=0  -include '/home/itan/R/x86_64-pc-linux-gnu-library/4.3/StanHeaders/include/stan/math/prim/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1       -fpic  -g -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -ffile-prefix-map=/build/r-base-FPSnzf/r-base-4.3.3=. -fstack-protector-strong -fstack-clash-protection -Wformat -Werror=format-security -fcf-protection -fdebug-prefix-map=/build/r-base-FPSnzf/r-base-4.3.3=/usr/src/r-base-4.3.3-2build2 -Wdate-time -D_FORTIFY_SOURCE=3  -c foo.c -o foo.o
## In file included from /home/itan/R/x86_64-pc-linux-gnu-library/4.3/RcppEigen/include/Eigen/Core:19,
##                  from /home/itan/R/x86_64-pc-linux-gnu-library/4.3/RcppEigen/include/Eigen/Dense:1,
##                  from /home/itan/R/x86_64-pc-linux-gnu-library/4.3/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22,
##                  from <command-line>:
## /home/itan/R/x86_64-pc-linux-gnu-library/4.3/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: cmath: No such file or directory
##   679 | #include <cmath>
##       |          ^~~~~~~
## compilation terminated.
## make: *** [/usr/lib/R/etc/Makeconf:191: foo.o] Error 1

stan_posterior_samples <- rstan::extract(stan_fit)

## get team strengths
team_home_strs <- apply(stan_posterior_samples$u_home, 2, mean)
team_home_strs_CI <- apply(stan_posterior_samples$u_home, 2, quantile, probs = c(0.025, 0.975))
team_away_strs <- apply(stan_posterior_samples$u_away, 2, mean)
team_away_strs_CI <- apply(stan_posterior_samples$u_away, 2, quantile, probs = c(0.025, 0.975))

## Save into a dataframe 
home_team_effects <- data.frame(home_strs = team_home_strs, 
                                home_lower_CI = team_home_strs_CI['2.5%',], 
                                home_upper_CI = team_home_strs_CI[2, ])
rownames(home_team_effects) <- unique_teams
away_team_effects <- data.frame(away_strs = team_away_strs, 
                                away_lower_CI = team_away_strs_CI['2.5%',], 
                                away_upper_CI = team_away_strs_CI[2, ])
rownames(away_team_effects) <- unique_teams
## check 

home_team_effects %>% arrange(desc(home_strs))

##                    home_strs home_lower_CI home_upper_CI
## Man City          1.20276909     0.4250120     2.0187950
## Liverpool         1.10297438     0.2973999     1.9140953
## Arsenal           1.06172713     0.2570702     1.8988102
## Newcastle         0.91533987     0.1391405     1.7656584
## Aston Villa       0.60335784    -0.1982438     1.4214447
## Chelsea           0.51346493    -0.2321055     1.2663763
## Crystal Palace    0.22083224    -0.5880741     1.0266681
## Tottenham         0.21275674    -0.5747266     0.9541378
## Fulham            0.05922558    -0.7019831     0.8729361
## Everton          -0.06176248    -0.8634722     0.7476043
## West Ham         -0.11031044    -0.9680118     0.7131924
## Brighton         -0.11107885    -0.9094695     0.6949905
## Man United       -0.12297554    -0.9350764     0.6859345
## Bournemouth      -0.28621002    -1.0870858     0.5440928
## Nott'm Forest    -0.36893266    -1.2177348     0.4689164
## Wolves           -0.41764603    -1.2122198     0.3875816
## Brentford        -0.46590711    -1.2571540     0.2781749
## Luton            -0.61441055    -1.4151807     0.1909746
## Burnley          -1.26932887    -2.0880757    -0.4409534
## Sheffield United -1.83941180    -2.7005470    -0.9946844

away_team_effects %>% arrange((away_strs))

##                     away_strs away_lower_CI away_upper_CI
## Arsenal          -1.293987230   -2.09446939   -0.54265558
## Man City         -1.166498711   -1.96042415   -0.42849359
## Liverpool        -0.657741391   -1.40013123    0.09053806
## Tottenham        -0.275754774   -1.04312183    0.44716969
## Brentford        -0.080327329   -0.79991886    0.63264662
## Man United       -0.066249621   -0.76602276    0.68126375
## Chelsea          -0.047634604   -0.79786392    0.69215338
## Newcastle        -0.025322629   -0.74850956    0.73000046
## Aston Villa       0.003536685   -0.72387896    0.79477010
## Brighton          0.162001591   -0.53873221    0.85296282
## Wolves            0.190801713   -0.52387872    0.87551492
## Burnley           0.222749238   -0.52475927    0.98854728
## Bournemouth       0.226366600   -0.49293519    0.95997344
## Crystal Palace    0.244501025   -0.49106656    0.99810560
## Fulham            0.283740911   -0.42081841    1.02308058
## Nott'm Forest     0.330851247   -0.35846920    1.07265265
## Everton           0.343612752   -0.39822523    1.10825908
## West Ham          0.420119344   -0.27326218    1.15315177
## Luton             0.666450944   -0.08391235    1.51089213
## Sheffield United  0.879509749    0.11701554    1.69757091

home_team_effects <- home_team_effects %>%
  rename(
    Strength = home_strs,
    Lower_CI = home_lower_CI,
    Upper_CI = home_upper_CI
  ) %>%
  mutate(Location = "Home")

away_team_effects <- away_team_effects %>%
  rename(
    Strength = away_strs,
    Lower_CI = away_lower_CI,
    Upper_CI = away_upper_CI
  ) %>%
  mutate(Location = "Away")


# Combine the data frames
team_effects <- bind_rows(home_team_effects, away_team_effects)
team_names <- sapply(stringr::str_split(rownames(team_effects), pattern = stringr::fixed('...')), function(x)x[1])
team_effects['Team'] <- team_names
rownames(team_effects) <- NULL

## PLOT preparation
team_order <- team_effects %>% filter(Location == 'Home') %>% arrange(desc(Strength)) %>% pull(Team)

## Factorise the teams for ordering in the plots 
team_effects$Team <- factor(team_effects$Team, levels = team_order)

## PLOT NOW 
ggplot(team_effects, aes(x = Strength, y = Team, color = Location)) +
  geom_point() +
  geom_errorbar(aes(xmin = Lower_CI, xmax = Upper_CI)) +
  theme_minimal() + 
  labs(title = 'Home and Away Strengths', 
       subtitle = 'Away: more negative = stronger')

  theme(legend.position = 'bottom')

## List of 1
##  $ legend.position: chr "bottom"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

Bradley Terry Model for EPL results

2024-10-07