Data: Football-data.co.uk

Get the dataset by directly reading, via read.csv, the url that contains the csv files for the relevant dates. A function has been created to include more dates.

library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
fbdata_base_url <- "https://www.football-data.co.uk/mmz4281/%s/E0.csv"

## DOWNLOAD EPL DATA FUNCTION 
download_epl_data <- function(years){
  combined_df <- data.frame()
  
  for (year in years){
    url <- sprintf(fbdata_base_url, year)
    year_data <- read.csv(url)
    combined_df <- rbind(year_data)
    
    Sys.sleep(3)
  }
  return(combined_df)
}

years <- c('2223', '2324') ## get two seasons of data
combined_23_24 <- download_epl_data(years)
head(colnames(combined_23_24))
## [1] "Div"      "Date"     "Time"     "HomeTeam" "AwayTeam" "FTHG"
match_data <- combined_23_24 %>% select(Date, HomeTeam, AwayTeam, FTHG, FTAG, FTR)
## check
head(match_data)
##         Date         HomeTeam       AwayTeam FTHG FTAG FTR
## 1 11/08/2023          Burnley       Man City    0    3   A
## 2 12/08/2023          Arsenal  Nott'm Forest    2    1   H
## 3 12/08/2023      Bournemouth       West Ham    1    1   D
## 4 12/08/2023         Brighton          Luton    4    1   H
## 5 12/08/2023          Everton         Fulham    0    1   A
## 6 12/08/2023 Sheffield United Crystal Palace    0    1   A

Basic analysis: Home and Away win rates

## Best team at home 
home_team_win_rate  <- match_data %>% group_by(HomeTeam) %>% summarise(home_win_rate = mean(FTR == 'H'))
away_team_win_rate <- match_data %>% group_by(AwayTeam) %>% summarise(away_win_rate = mean(FTR == 'A'))

home_away_win_rates <-home_team_win_rate %>% left_join(away_team_win_rate, 
                                join_by('HomeTeam' == 'AwayTeam')) %>% rename(Team = HomeTeam) %>% 
  mutate('home_adv' = home_win_rate - away_win_rate) %>% 
  arrange(desc(home_win_rate)) 

home_away_win_rates%>% 
  ggplot(aes(x = reorder(Team, -home_win_rate))) + 
  geom_point(aes(y = home_win_rate, colour = 'Home Win Rate')) +
  geom_point(aes(y = away_win_rate, colour = 'Away Win Rate')) +
geom_segment(aes(x = Team, xend = Team, y = home_win_rate, yend = away_win_rate), 
               color = "grey", linetype = "dotted", size = 1) +
  labs(title = 'Home and Away win rates EPL teams', 
       x = 'Teams', y = 'Win Rates', color = 'Home/away')+
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Man City is the most consistent team home and away in winning, and that could be the reason for them being the defending champions. Liverpool and Arsenal have more variance, and perform less well in away games. For them to contest Man City for the title, it could be imperative for them to win their away games more often.

## Data preparation: convert team names to numeric indices
match_data$HomeTeamId <- as.integer(factor(match_data$HomeTeam))
match_data$AwayTeamId <- as.integer(factor(match_data$AwayTeam))
## verify 
match_data %>% filter(HomeTeam == 'Liverpool') %>% pull(HomeTeamId) %>% unique()
## [1] 11
match_data %>% filter(AwayTeam == 'Liverpool') %>% pull(AwayTeamId) %>% unique()
## [1] 11
## Finding the main outcome variable -- goal difference 
match_data['GoalDiff'] <- match_data$FTHG - match_data$FTAG

Looking at goal difference

## Visualise 
ggplot(match_data) + geom_bar(aes(x = GoalDiff), 
                              fill = 'lightblue') + 
  scale_x_continuous(breaks = seq(min(match_data$GoalDiff),
                                  max(match_data$GoalDiff),
                                  by = 1)) +
  labs(title = 'Distribution of goal difference (Home - Away)')+
  theme_bw()
Home minus Away goals

Home minus Away goals

Interestingly, goal difference is relatively normal looking, although the drop off is more steep when moving left of 0 than right.

1. Terry Bradley Model : win/loss, score not taken into account

A popular method for estimating team/player strengths from analysing pairwise datapoints (i.e. matches between two players). It assumes that each comparison is an independent datapoint, which is obviously contentious in most sporting scenarios.

Link to R Blogpost

unique_teams <- sort(unique(c(combined_23_24$Home, 
                              combined_23_24$Away)))
## According to the blog, we need to get a specific data matrix
# such that for each row of the game data, we have a vector of n_teams
# where the element of the home team is 1 and that of away team is -1
# else its 0

## get dataframe for bradley terry model
get_data_vec <- function(home, away, unique_teams){
  ## init the vec -- matrix with indicator variable for the teams if theyre playing each other 
  vec <- rep(0, length(unique_teams))
  
  ## home teams: 1, away teams: -1
  vec[unique_teams == home] <- 1
  vec[unique_teams == away] <- -1 
  return(vec)
} ## this function applies to a single row of data and creates a corresponding row -- hence we need to iteratively apply across all rows of the data

X <- apply(match_data, 1,## 1 for row wise applying  
           function(row) get_data_vec(row['HomeTeam'],
                                      row['AwayTeam'], 
                                      unique_teams))
## check
dim(X)
## [1]  20 380
X <- t(X) ## transpose such that every row now is a match
colnames(X) <- unique_teams
## Create Y = binary indicator for home team winning 
y <- match_data$GoalDiff > 1
y <- as.integer(y)
## Recall that we have a lot of instances for draws. We can try
# considering draws as a loss for the home team, since they are supposed to have 
# home advantage, hence all else equal it is a disappointing result for them 
## prepare df 
bt_df <- as.data.frame(cbind(X,y))
bt_model <- glm(y ~ ., data = bt_df, family = 'binomial')

## See results!
bt_results_df <- data.frame('coeff' = bt_model$coefficients)
bt_results_df <- bt_results_df %>% arrange(desc(coeff))
bt_results_df['Team'] <- rownames(bt_results_df); rownames(bt_results_df) <- NULL
bt_results_df <- bt_results_df %>% mutate(Team = ifelse(Team == '(Intercept)', 'HomeAdv', Team)) %>% 
  mutate(Team = str_replace(Team, pattern = '`', replacement = '')) %>% 
  mutate(Team = str_replace(Team, pattern = '`', replacement = '')) %>% 
  mutate(coeff = ifelse(is.na(coeff), 0, coeff))
library(ggrepel)
bt_results_df %>%
  ggplot(aes(x = reorder(Team, -coeff))) +  # Reorder by coefficient descending
  geom_segment(aes(x = Team, xend = Team, y = 0, yend = coeff), 
               color = "grey", linetype = "dotted") +  # Vertical lines from 0 to coefficient
  geom_point(aes(y = coeff), color = "blue", size = 3) +  # Highlight points
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +  # Rotate team names
  labs(title = "Team Strength Coefficients from Bradley-Terry Model",
       x = "Teams",
       y = "Coefficient (Team Strength)")  # Add informative labels

Compare results of BT model with base win rates for teams

bt_results_df %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>% select(Team, coeff, win_rate) %>% 
  ggplot(aes(x = win_rate, y = coeff)) + 
  geom_point() +
  geom_text_repel(aes(label = Team)) +
  labs(x = 'Win Rate', y = 'BT Model Coefficients', 
       title = 'Relationship between win base rate and the Terry Bradley coefficients', subtitle = '2022-2024' ) +
  theme_minimal()
## Joining with `by = join_by(Team)`
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).

There is a general agreement between the win rates and the BT Model coefficients.

The difference is that win rates is agnostic, whereas the BT model takes into account the strength of your opponents.

What if we remove the intercept?

## prepare df 
bt_model_no_incpt <- glm(y ~ . -1, data = bt_df, family = 'binomial')

## See results!
bt_results_df_no_intcpt <- data.frame('coeff' = bt_model_no_incpt$coefficients)
bt_results_df_no_intcpt <- bt_results_df_no_intcpt %>% arrange(desc(coeff))
bt_results_df_no_intcpt['Team'] <- rownames(bt_results_df_no_intcpt); rownames(bt_results_df_no_intcpt) <- NULL
bt_results_df_no_intcpt <- bt_results_df_no_intcpt %>% mutate(Team = ifelse(Team == '(Intercept)', 'HomeAdv', Team)) %>% 
  mutate(Team = str_replace(Team, pattern = '`', replacement = '')) %>% 
  mutate(Team = str_replace(Team, pattern = '`', replacement = '')) %>% 
  mutate(coeff = ifelse(is.na(coeff), 0, coeff))
bt_results_df_no_intcpt %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>% select(Team, coeff, win_rate) %>% 
  ggplot(aes(x = win_rate, y = coeff)) + 
  geom_point() +
  geom_text_repel(aes(label = Team)) +
  labs(x = 'Win Rate', y = 'BT Model Coefficients', 
       title = 'Relationship between win base rate and the Terry Bradley coefficients', subtitle = '2022-2024' ) +
  theme_minimal()
## Joining with `by = join_by(Team)`

Manually solve for BT model - via Maximum Likelihood Estimation

## Create map and team indices 
teams_index_map <- setNames(seq_along(unique_teams), unique_teams)
match_data <-match_data %>% mutate(
  WinnerIdx = ifelse(FTR == 'H', teams_index_map[HomeTeam], 
                                         teams_index_map[AwayTeam]), 
  LoserIdx = ifelse(FTR == 'A', teams_index_map[HomeTeam], 
                                         teams_index_map[AwayTeam]))
## Create negative log likelihood function 
neg_log_likelihood_func <- function(theta_rest, data) {
  theta <- c(theta_rest, 0) ## set last team's theta to 0
  winnerIdx <- data$WinnerIdx
  loserIdx <- data$LoserIdx
  lin_pred <- theta[winnerIdx] - theta[loserIdx]
  prob <- (1+exp(-lin_pred))^-1 ##logistic formula 
  log_prob <- log(prob)
  total_log_lik =  sum(log_prob, na.rm = T)## iteratively add to the total log likelihood

  return(-total_log_lik)
}


## Init the thetas that are going to be updated 
init_thetas <- rep(0, length(teams_index_map) -1)

## OPTIMISE!
manual_BT_results <- optim(par = init_thetas, fn = neg_log_likelihood_func, data = match_data, method = 'L-BFGS')
manual_BT_coeffs <- manual_BT_results$par
manual_BT_df <- data.frame(Team = names(teams_index_map), 
                           bt_manual_coeffs = c(manual_BT_coeffs, 0)) %>% arrange(desc(bt_manual_coeffs))
manual_BT_df
##                Team bt_manual_coeffs
## 1          Man City       2.45031181
## 2           Arsenal       2.04181355
## 3         Liverpool       1.97081368
## 4       Aston Villa       1.12632319
## 5         Tottenham       0.77543920
## 6           Chelsea       0.69566489
## 7         Newcastle       0.64269216
## 8        Man United       0.52776517
## 9          West Ham       0.29222555
## 10         Brighton       0.29082672
## 11   Crystal Palace       0.20154869
## 12          Everton       0.09064414
## 13      Bournemouth       0.07688813
## 14           Fulham       0.07016092
## 15           Wolves       0.00000000
## 16    Nott'm Forest      -0.42440288
## 17        Brentford      -0.54727382
## 18            Luton      -1.27204220
## 19          Burnley      -1.50094394
## 20 Sheffield United      -2.25301390
manual_BT_df %>%
  ggplot(aes(x = reorder(Team, -bt_manual_coeffs))) +  # Reorder by coefficient descending
  geom_segment(aes(x = Team, xend = Team, y = 0, yend = bt_manual_coeffs), 
               color = "grey", linetype = "dotted") +  # Vertical lines from 0 to coefficient
  geom_point(aes(y = bt_manual_coeffs), color = "blue", size = 3) +  # Highlight points
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +  # Rotate team names
  labs(title = "Team Strength Coefficients from MLE (BT Model)",
       x = "Teams",
       y = "Coefficient (Team Strength)")  # Add informative labels

This is an improvement from the previous method that uses GLM, as this prevents teams from having the same results. This also naturally excludes the home advantage parameter/ does not factor in home advantage.

Compare results of BT model (MANUAL MLE) with base win rates for teams

manual_BT_df %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>% select(Team, bt_manual_coeffs, win_rate) %>% 
  ggplot(aes(x = win_rate, y = bt_manual_coeffs)) + 
  geom_point() +
  geom_text_repel(aes(label = Team)) +
  labs(x = 'Win Rate', y = 'BT Model Coefficients', 
       title = 'Relationship between win base rate and the Terry Bradley coefficients (No intercept)', subtitle = '2022-2024' ) +
  theme_minimal()
## Joining with `by = join_by(Team)`
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

The agreement is now better. In particular, the model correctly spots that Arsenal has performed slightly better than Liverpool.