We get the dataset by reading, via read.csv, the URLs that contain the CSV files for the relevant seasons. A small helper function makes it easy to include more seasons.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
fbdata_base_url <- "https://www.football-data.co.uk/mmz4281/%s/E0.csv"
## DOWNLOAD EPL DATA FUNCTION
download_epl_data <- function(years){
combined_df <- data.frame()
for (year in years){
url <- sprintf(fbdata_base_url, year)
year_data <- read.csv(url)
combined_df <- rbind(combined_df, year_data)
Sys.sleep(3)
}
return(combined_df)
}
years <- c('2223', '2324') ## get two seasons of data
combined_23_24 <- download_epl_data(years)
head(colnames(combined_23_24))
## [1] "Div" "Date" "Time" "HomeTeam" "AwayTeam" "FTHG"
match_data <- combined_23_24 %>% select(Date, HomeTeam, AwayTeam, FTHG, FTAG, FTR)
## check
head(match_data)
## Date HomeTeam AwayTeam FTHG FTAG FTR
## 1 11/08/2023 Burnley Man City 0 3 A
## 2 12/08/2023 Arsenal Nott'm Forest 2 1 H
## 3 12/08/2023 Bournemouth West Ham 1 1 D
## 4 12/08/2023 Brighton Luton 4 1 H
## 5 12/08/2023 Everton Fulham 0 1 A
## 6 12/08/2023 Sheffield United Crystal Palace 0 1 A
## Best team at home
home_team_win_rate <- match_data %>% group_by(HomeTeam) %>% summarise(home_win_rate = mean(FTR == 'H'))
away_team_win_rate <- match_data %>% group_by(AwayTeam) %>% summarise(away_win_rate = mean(FTR == 'A'))
home_away_win_rates <- home_team_win_rate %>% left_join(away_team_win_rate,
join_by(HomeTeam == AwayTeam)) %>% rename(Team = HomeTeam) %>%
mutate(home_adv = home_win_rate - away_win_rate) %>%
arrange(desc(home_win_rate))
home_away_win_rates%>%
ggplot(aes(x = reorder(Team, -home_win_rate))) +
geom_point(aes(y = home_win_rate, colour = 'Home Win Rate')) +
geom_point(aes(y = away_win_rate, colour = 'Away Win Rate')) +
geom_segment(aes(x = Team, xend = Team, y = home_win_rate, yend = away_win_rate),
color = "grey", linetype = "dotted", size = 1) +
labs(title = 'Home and Away win rates EPL teams',
x = 'Teams', y = 'Win Rates', color = 'Home/away')+
theme_minimal() +
theme(axis.text.x = element_text(angle = 90))
Man City are the most consistent winners both home and away, which may explain why they are the defending champions. Liverpool and Arsenal show more variance and perform less well in away games. To challenge Man City for the title, it could be imperative for them to win their away games more often.
## Data preparation: convert team names to numeric indices
match_data$HomeTeamId <- as.integer(factor(match_data$HomeTeam))
match_data$AwayTeamId <- as.integer(factor(match_data$AwayTeam))
## verify
match_data %>% filter(HomeTeam == 'Liverpool') %>% pull(HomeTeamId) %>% unique()
## [1] 11
match_data %>% filter(AwayTeam == 'Liverpool') %>% pull(AwayTeamId) %>% unique()
## [1] 11
## Finding the main outcome variable -- goal difference
match_data['GoalDiff'] <- match_data$FTHG - match_data$FTAG
## Visualise
ggplot(match_data) + geom_bar(aes(x = GoalDiff),
fill = 'lightblue') +
scale_x_continuous(breaks = seq(min(match_data$GoalDiff),
max(match_data$GoalDiff),
by = 1)) +
labs(title = 'Distribution of goal difference (Home - Away)')+
theme_bw()
Home minus Away goals
Interestingly, the distribution of goal difference looks roughly normal, although the drop-off is steeper to the left of 0 than to the right.
The Bradley-Terry model is a popular method for estimating team/player strengths from pairwise comparisons (i.e. matches between two players or teams). It assumes that each comparison is an independent data point, which is obviously contentious in most sporting scenarios.
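Concretely, each team \(t\) gets a strength parameter \(\theta_t\), and with a home-advantage intercept \(\theta_0\) (as in the logistic GLM fitted below), the probability that the home side \(h\) beats the away side \(a\) is modelled as:
\[ P(\text{home win}) = \frac{\exp(\theta_0 + \theta_h - \theta_a)}{1 + \exp(\theta_0 + \theta_h - \theta_a)} \]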
unique_teams <- sort(unique(c(combined_23_24$HomeTeam,
combined_23_24$AwayTeam)))
## According to the blog, we need to build a specific design matrix:
# for each row of the match data, a vector of length n_teams
# where the home team's element is 1, the away team's is -1,
# and everything else is 0
## get dataframe for bradley terry model
get_data_vec <- function(home, away, unique_teams){
## init the vector -- indicator entries for the two teams playing each other
vec <- rep(0, length(unique_teams))
## home teams: 1, away teams: -1
vec[unique_teams == home] <- 1
vec[unique_teams == away] <- -1
return(vec)
} ## this function applies to a single row of data and creates a corresponding row -- hence we need to iteratively apply across all rows of the data
X <- apply(match_data, 1,## 1 for row wise applying
function(row) get_data_vec(row['HomeTeam'],
row['AwayTeam'],
unique_teams))
## check
dim(X)
## [1] 20 380
X <- t(X) ## transpose such that every row now is a match
colnames(X) <- unique_teams
## Create Y = binary indicator for home team winning
y <- match_data$GoalDiff > 0 ## home win; draws and away wins are 0
y <- as.integer(y)
## Recall that we have a lot of instances for draws. We can try
# considering draws as a loss for the home team, since they are supposed to have
# home advantage, hence all else equal it is a disappointing result for them
## prepare df
bt_df <- as.data.frame(cbind(X,y))
bt_model <- glm(y ~ ., data = bt_df, family = 'binomial')
## See results!
bt_results_df <- data.frame('coeff' = bt_model$coefficients)
bt_results_df <- bt_results_df %>% arrange(desc(coeff))
bt_results_df['Team'] <- rownames(bt_results_df); rownames(bt_results_df) <- NULL
bt_results_df <- bt_results_df %>% mutate(Team = ifelse(Team == '(Intercept)', 'HomeAdv', Team)) %>%
mutate(Team = stringr::str_replace_all(Team, pattern = stringr::fixed('`'), replacement = '')) %>%
mutate(coeff = ifelse(is.na(coeff), 0, coeff))
library(ggrepel)
bt_results_df %>%
ggplot(aes(x = reorder(Team, -coeff))) + # Reorder by coefficient descending
geom_segment(aes(x = Team, xend = Team, y = 0, yend = coeff),
color = "grey", linetype = "dotted") + # Vertical lines from 0 to coefficient
geom_point(aes(y = coeff), color = "blue", size = 3) + # Highlight points
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + # Rotate team names
labs(title = "Team Strength Coefficients from Bradley-Terry Model",
x = "Teams",
y = "Coefficient (Team Strength)") # Add informative labels
bt_results_df %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>% select(Team, coeff, win_rate) %>%
ggplot(aes(x = win_rate, y = coeff)) +
geom_point() +
geom_text_repel(aes(label = Team)) +
labs(x = 'Win Rate', y = 'BT Model Coefficients',
title = 'Relationship between win base rate and the Bradley-Terry coefficients', subtitle = '2022-2024' ) +
theme_minimal()
## Joining with `by = join_by(Team)`
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).
There is general agreement between the win rates and the BT model coefficients.
The difference is that a raw win rate is agnostic to opponent strength, whereas the BT model takes the strength of each opponent into account.
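As a usage example (a sketch; the team names and the 'HomeAdv' label are those in the bt_results_df table built above), the coefficients translate directly into head-to-head probabilities via the logistic function:
## Sketch: implied probability that Man City (at home) beat Arsenal,
## with draws counted as non-wins, per the outcome definition above
bt_coefs <- setNames(bt_results_df$coeff, bt_results_df$Team)
plogis(bt_coefs['HomeAdv'] + bt_coefs['Man City'] - bt_coefs['Arsenal'])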
## refit the BT model without an intercept, i.e. without a home-advantage term
bt_model_no_incpt <- glm(y ~ . -1, data = bt_df, family = 'binomial')
## See results!
bt_results_df_no_intcpt <- data.frame('coeff' = bt_model_no_incpt$coefficients)
bt_results_df_no_intcpt <- bt_results_df_no_intcpt %>% arrange(desc(coeff))
bt_results_df_no_intcpt['Team'] <- rownames(bt_results_df_no_intcpt); rownames(bt_results_df_no_intcpt) <- NULL
bt_results_df_no_intcpt <- bt_results_df_no_intcpt %>% mutate(Team = ifelse(Team == '(Intercept)', 'HomeAdv', Team)) %>%
mutate(Team = stringr::str_replace_all(Team, pattern = stringr::fixed('`'), replacement = '')) %>%
mutate(coeff = ifelse(is.na(coeff), 0, coeff))
bt_results_df_no_intcpt %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>% select(Team, coeff, win_rate) %>%
ggplot(aes(x = win_rate, y = coeff)) +
geom_point() +
geom_text_repel(aes(label = Team)) +
labs(x = 'Win Rate', y = 'BT Model Coefficients',
title = 'Relationship between win base rate and the Bradley-Terry coefficients', subtitle = '2022-2024' ) +
theme_minimal()
## Joining with `by = join_by(Team)`
## Create map and team indices
teams_index_map <- setNames(seq_along(unique_teams), unique_teams)
match_data <-match_data %>% mutate(
WinnerIdx = ifelse(FTR == 'H', teams_index_map[HomeTeam],
teams_index_map[AwayTeam]),
LoserIdx = ifelse(FTR == 'A', teams_index_map[HomeTeam],
teams_index_map[AwayTeam]))
## Create negative log likelihood function
neg_log_likelihood_func <- function(theta_rest, data) {
theta <- c(theta_rest, 0) ## set last team's theta to 0
winnerIdx <- data$WinnerIdx
loserIdx <- data$LoserIdx
lin_pred <- theta[winnerIdx] - theta[loserIdx]
prob <- (1+exp(-lin_pred))^-1 ##logistic formula
log_prob <- log(prob)
total_log_lik <- sum(log_prob, na.rm = TRUE) ## sum the per-match log-likelihoods
return(-total_log_lik)
}
## Init the thetas that are going to be updated
init_thetas <- rep(0, length(teams_index_map) -1)
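Before optimising, a quick sanity check (a sketch using the objects defined above): at the all-zero starting point every result gets probability 0.5, so the negative log-likelihood should equal the number of matches times log 2.
## Sanity check: both values below should match
neg_log_likelihood_func(init_thetas, match_data)
nrow(match_data) * log(2)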
## OPTIMISE!
manual_BT_results <- optim(par = init_thetas, fn = neg_log_likelihood_func, data = match_data, method = 'L-BFGS-B')
manual_BT_coeffs <- manual_BT_results$par
manual_BT_df <- data.frame(Team = names(teams_index_map),
bt_manual_coeffs = c(manual_BT_coeffs, 0)) %>% arrange(desc(bt_manual_coeffs))
manual_BT_df
## Team bt_manual_coeffs
## 1 Man City 2.45031181
## 2 Arsenal 2.04181355
## 3 Liverpool 1.97081368
## 4 Aston Villa 1.12632319
## 5 Tottenham 0.77543920
## 6 Chelsea 0.69566489
## 7 Newcastle 0.64269216
## 8 Man United 0.52776517
## 9 West Ham 0.29222555
## 10 Brighton 0.29082672
## 11 Crystal Palace 0.20154869
## 12 Everton 0.09064414
## 13 Bournemouth 0.07688813
## 14 Fulham 0.07016092
## 15 Wolves 0.00000000
## 16 Nott'm Forest -0.42440288
## 17 Brentford -0.54727382
## 18 Luton -1.27204220
## 19 Burnley -1.50094394
## 20 Sheffield United -2.25301390
manual_BT_df %>%
ggplot(aes(x = reorder(Team, -bt_manual_coeffs))) + # Reorder by coefficient descending
geom_segment(aes(x = Team, xend = Team, y = 0, yend = bt_manual_coeffs),
color = "grey", linetype = "dotted") + # Vertical lines from 0 to coefficient
geom_point(aes(y = bt_manual_coeffs), color = "blue", size = 3) + # Highlight points
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + # Rotate team names
labs(title = "Team Strength Coefficients from MLE (BT Model)",
x = "Teams",
y = "Coefficient (Team Strength)") # Add informative labels
This is an improvement over the previous GLM-based approach, as it avoids teams ending up with identical coefficients. It also naturally excludes the home-advantage parameter; home advantage is not factored in here.
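As a quick consistency check (a sketch using the data frames built above), the two sets of strengths can be correlated directly:
## correlation between the GLM-based and manually optimised BT coefficients
bt_comparison <- bt_results_df %>% inner_join(manual_BT_df, by = 'Team')
cor(bt_comparison$coeff, bt_comparison$bt_manual_coeffs)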
manual_BT_df %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>% select(Team, bt_manual_coeffs, win_rate) %>%
ggplot(aes(x = win_rate, y = bt_manual_coeffs)) +
geom_point() +
geom_text_repel(aes(label = Team)) +
labs(x = 'Win Rate', y = 'BT Model Coefficients',
title = 'Relationship between win base rate and the Bradley-Terry coefficients (No intercept)', subtitle = '2022-2024' ) +
theme_minimal()
## Joining with `by = join_by(Team)`
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
The agreement is now better. In particular, the model correctly spots
that Arsenal has performed slightly better than Liverpool.
We now turn to a Bayesian hierarchical model for goal difference, where \(\alpha_j\) is team \(j\)'s strength and \(\theta\) is the home advantage. For match \(i\) with home team \(h_i\) and away team \(a_i\):
\[ Y_i = \text{Home Score}_i - \text{Away Score}_i \] \[ Y_i \sim \mathcal{N} \left( (\alpha_{h_i} - \alpha_{a_i}) + \theta, \sigma^2 \right) \]
\[ \alpha_j \sim \mathcal{N}(\mu_{\alpha}, \sigma_{\alpha}^2) \]
\[ \theta \sim \mathcal{N}(0, \sigma_{\theta}^2) \]
\[ \mu_{\alpha} \sim \mathcal{N}(0, 1) \]
\[ \sigma_{\alpha}^2 \sim \text{Half-Cauchy}(0, 1) \]
\[ \sigma^2 \sim \text{Inverse-Gamma}(2, 2) \]
For the Bayesian modelling, we use the brms package.
library(brms) ## package for bayesian modelling
## Loading required package: Rcpp
## Loading 'brms' package (version 2.22.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
##
## Attaching package: 'brms'
## The following object is masked from 'package:stats':
##
## ar
## fitting the bayesian model
bayesian_fit <- brm(
formula = GoalDiff ~ 1 + (1|HomeTeam) + (1| AwayTeam),
data = match_data,
family = gaussian(),
prior = c(
prior(normal(0, 1), class = "Intercept"), # Prior for the home advantage (intercept)
prior(normal(0, 1), class = "sd"), # Prior for mu_alpha (mean of team strengths)
prior(cauchy(0, 1), class = "sd", group = "HomeTeam"), # Prior for sigma_alpha (variance of team strengths)
prior(cauchy(0, 1), class = "sd", group = "AwayTeam"), # Prior for sigma_alpha for away teams
prior(inv_gamma(2, 2), class = "sigma") # Prior for observation error variance (sigma^2)
),
iter = 2000,
chains = 4)
## Compiling Stan program...
summary(bayesian_fit)
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: GoalDiff ~ 1 + (1 | HomeTeam) + (1 | AwayTeam)
## Data: match_data (Number of observations: 380)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Multilevel Hyperparameters:
## ~AwayTeam (Number of levels: 20)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.67 0.16 0.40 1.03 1.00 1529 2116
##
## ~HomeTeam (Number of levels: 20)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.89 0.18 0.59 1.31 1.00 1690 2240
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 0.30 0.26 -0.22 0.80 1.00 1731 2390
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 1.77 0.07 1.65 1.91 1.00 6422 2950
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(bayesian_fit)
away_team_results <- data.frame(ranef(bayesian_fit)$AwayTeam) %>% rename(AwayTeamStr = Estimate.Intercept,
AwayTeamStrError = Est.Error.Intercept) %>%
mutate(AwayTeamStr = -AwayTeamStr) ## flip the sign so that larger = stronger for away teams too
home_team_results <- data.frame(ranef(bayesian_fit)$HomeTeam) %>% rename(HomeTeamStr = Estimate.Intercept,
HomeTeamStrError = Est.Error.Intercept)
team_results <- cbind(home_team_results, away_team_results) %>% select(HomeTeamStr, AwayTeamStr)
brms_team_results <- team_results %>% arrange(desc(HomeTeamStr), desc(AwayTeamStr)) %>% mutate(AvgTeamStr = 0.5*(HomeTeamStr+AwayTeamStr))
brms_team_results %>% mutate(Team = rownames(brms_team_results)) %>% left_join(manual_BT_df) %>% select(Team, AvgTeamStr, bt_manual_coeffs) %>%
ggplot(aes(y = AvgTeamStr, x = bt_manual_coeffs)) +
geom_point() +
geom_text_repel(aes(label = Team)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dotdash', color = 'red')+
labs(y = 'BRMS Team Coefficients', x = 'BT Model Coefficients',
title = 'Relationship between BT and BRMS (Bayesian) Model Coefficients', subtitle = '2022-2024' ) +
theme_minimal()
## Joining with `by = join_by(Team)`
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Values are shrunk towards 0 compared with the BT coefficients, as expected from the partial pooling induced by the hierarchical priors.
brms_team_results %>% mutate(Team = rownames(brms_team_results)) %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>%
ggplot(aes(x = AvgTeamStr, y = win_rate)) +
geom_point() +
geom_text_repel(aes(label = Team)) +
labs(x = 'BRMS Team Coefficients', y = 'Win Rates',
title = 'Relationship between Win Rates and BRMS (Bayesian) Model Coefficients', subtitle = '2022-2024' ) +
theme_minimal()
## Joining with `by = join_by(Team)`
The BRMS coefficients also correlate closely with the base win rates.
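As a usage sketch (a hypothetical fixture; the team names are assumed to match those in match_data), the fitted brms model can also produce a posterior for the expected goal difference of a specific match-up:
## posterior of the expected goal difference for a chosen fixture
new_match <- data.frame(HomeTeam = 'Arsenal', AwayTeam = 'Man City')
post_gd <- posterior_epred(bayesian_fit, newdata = new_match)
quantile(post_gd, probs = c(0.025, 0.5, 0.975))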
library(rstan)
## Loading required package: StanHeaders
##
## rstan version 2.32.6 (Stan version 2.32.2)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
## change `threads_per_chain` option:
## rstan_options(threads_per_chain = 1)
stan_model_file <- 'stanmodel.stan'
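The contents of stanmodel.stan are not shown in this post. A minimal sketch consistent with the priors above and with the parameter names extracted below (u_home, u_away) might look like the following; the home_adv intercept and the exact priors are assumptions, and the actual file may differ.
data {
  int<lower=1> N;                                  // number of matches
  vector[N] GoalDiff;                              // home minus away goals
  int<lower=1> N_home;                             // number of home teams
  int<lower=1> N_away;                             // number of away teams
  array[N] int<lower=1, upper=N_home> home_team;   // home team index per match
  array[N] int<lower=1, upper=N_away> away_team;   // away team index per match
}
parameters {
  vector[N_home] u_home;        // home-team effects
  vector[N_away] u_away;        // away-team effects (more negative = stronger away side)
  real home_adv;                // home advantage (assumed; the original file may omit it)
  real<lower=0> sigma_home;     // sd of home effects
  real<lower=0> sigma_away;     // sd of away effects
  real<lower=0> sigma;          // residual sd
}
model {
  u_home ~ normal(0, sigma_home);
  u_away ~ normal(0, sigma_away);
  home_adv ~ normal(0, 1);
  sigma_home ~ cauchy(0, 1);
  sigma_away ~ cauchy(0, 1);
  sigma ~ inv_gamma(2, 2);
  GoalDiff ~ normal(home_adv + u_home[home_team] + u_away[away_team], sigma);
}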
## prepare the data as a list for stan
stan_data <- list(
N = nrow(match_data),
GoalDiff = match_data$GoalDiff,
N_home = length(unique(match_data$HomeTeam)),
N_away = length(unique(match_data$AwayTeam)),
home_team = match_data$HomeTeamId,
away_team = match_data$AwayTeamId
)
## fit the model
stan_fit <- stan(file = stan_model_file,
data = stan_data,
chains = 4,
iter = 1000,
cores = 4)
stan_posterior_samples <- rstan::extract(stan_fit)
## get team strengths
team_home_strs <- apply(stan_posterior_samples$u_home, 2, mean)
team_home_strs_CI <- apply(stan_posterior_samples$u_home, 2, quantile, probs = c(0.025, 0.975))
team_away_strs <- apply(stan_posterior_samples$u_away, 2, mean)
team_away_strs_CI <- apply(stan_posterior_samples$u_away, 2, quantile, probs = c(0.025, 0.975))
## Save into a dataframe
home_team_effects <- data.frame(home_strs = team_home_strs,
home_lower_CI = team_home_strs_CI['2.5%',],
home_upper_CI = team_home_strs_CI[2, ])
rownames(home_team_effects) <- unique_teams
away_team_effects <- data.frame(away_strs = team_away_strs,
away_lower_CI = team_away_strs_CI['2.5%',],
away_upper_CI = team_away_strs_CI[2, ])
rownames(away_team_effects) <- unique_teams
## check
home_team_effects %>% arrange(desc(home_strs))
## home_strs home_lower_CI home_upper_CI
## Man City 1.20276909 0.4250120 2.0187950
## Liverpool 1.10297438 0.2973999 1.9140953
## Arsenal 1.06172713 0.2570702 1.8988102
## Newcastle 0.91533987 0.1391405 1.7656584
## Aston Villa 0.60335784 -0.1982438 1.4214447
## Chelsea 0.51346493 -0.2321055 1.2663763
## Crystal Palace 0.22083224 -0.5880741 1.0266681
## Tottenham 0.21275674 -0.5747266 0.9541378
## Fulham 0.05922558 -0.7019831 0.8729361
## Everton -0.06176248 -0.8634722 0.7476043
## West Ham -0.11031044 -0.9680118 0.7131924
## Brighton -0.11107885 -0.9094695 0.6949905
## Man United -0.12297554 -0.9350764 0.6859345
## Bournemouth -0.28621002 -1.0870858 0.5440928
## Nott'm Forest -0.36893266 -1.2177348 0.4689164
## Wolves -0.41764603 -1.2122198 0.3875816
## Brentford -0.46590711 -1.2571540 0.2781749
## Luton -0.61441055 -1.4151807 0.1909746
## Burnley -1.26932887 -2.0880757 -0.4409534
## Sheffield United -1.83941180 -2.7005470 -0.9946844
away_team_effects %>% arrange((away_strs))
## away_strs away_lower_CI away_upper_CI
## Arsenal -1.293987230 -2.09446939 -0.54265558
## Man City -1.166498711 -1.96042415 -0.42849359
## Liverpool -0.657741391 -1.40013123 0.09053806
## Tottenham -0.275754774 -1.04312183 0.44716969
## Brentford -0.080327329 -0.79991886 0.63264662
## Man United -0.066249621 -0.76602276 0.68126375
## Chelsea -0.047634604 -0.79786392 0.69215338
## Newcastle -0.025322629 -0.74850956 0.73000046
## Aston Villa 0.003536685 -0.72387896 0.79477010
## Brighton 0.162001591 -0.53873221 0.85296282
## Wolves 0.190801713 -0.52387872 0.87551492
## Burnley 0.222749238 -0.52475927 0.98854728
## Bournemouth 0.226366600 -0.49293519 0.95997344
## Crystal Palace 0.244501025 -0.49106656 0.99810560
## Fulham 0.283740911 -0.42081841 1.02308058
## Nott'm Forest 0.330851247 -0.35846920 1.07265265
## Everton 0.343612752 -0.39822523 1.10825908
## West Ham 0.420119344 -0.27326218 1.15315177
## Luton 0.666450944 -0.08391235 1.51089213
## Sheffield United 0.879509749 0.11701554 1.69757091
home_team_effects <- home_team_effects %>%
rename(
Strength = home_strs,
Lower_CI = home_lower_CI,
Upper_CI = home_upper_CI
) %>%
mutate(Location = "Home")
away_team_effects <- away_team_effects %>%
rename(
Strength = away_strs,
Lower_CI = away_lower_CI,
Upper_CI = away_upper_CI
) %>%
mutate(Location = "Away")
# Combine the data frames
team_effects <- bind_rows(home_team_effects, away_team_effects)
team_names <- sapply(stringr::str_split(rownames(team_effects), pattern = stringr::fixed('...')), function(x)x[1])
team_effects['Team'] <- team_names
rownames(team_effects) <- NULL
## PLOT preparation
team_order <- team_effects %>% filter(Location == 'Home') %>% arrange(desc(Strength)) %>% pull(Team)
## Factorise the teams for ordering in the plots
team_effects$Team <- factor(team_effects$Team, levels = team_order)
## PLOT NOW
ggplot(team_effects, aes(x = Strength, y = Team, color = Location)) +
geom_point() +
geom_errorbar(aes(xmin = Lower_CI, xmax = Upper_CI)) +
theme_minimal() +
labs(title = 'Home and Away Strengths',
subtitle = 'Away: more negative = stronger') +
theme(legend.position = 'bottom')