Get the dataset by directly reading, via read.csv, the url that contains the csv files for the relevant dates. A function has been created to include more dates.
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
fbdata_base_url <- "https://www.football-data.co.uk/mmz4281/%s/E0.csv"
## DOWNLOAD EPL DATA FUNCTION
download_epl_data <- function(years){
combined_df <- data.frame()
for (year in years){
url <- sprintf(fbdata_base_url, year)
year_data <- read.csv(url)
combined_df <- rbind(year_data)
Sys.sleep(3)
}
return(combined_df)
}
years <- c('2223', '2324') ## get two seasons of data
combined_23_24 <- download_epl_data(years)
head(colnames(combined_23_24))
## [1] "Div" "Date" "Time" "HomeTeam" "AwayTeam" "FTHG"
match_data <- combined_23_24 %>% select(Date, HomeTeam, AwayTeam, FTHG, FTAG, FTR)
## check
head(match_data)
## Date HomeTeam AwayTeam FTHG FTAG FTR
## 1 11/08/2023 Burnley Man City 0 3 A
## 2 12/08/2023 Arsenal Nott'm Forest 2 1 H
## 3 12/08/2023 Bournemouth West Ham 1 1 D
## 4 12/08/2023 Brighton Luton 4 1 H
## 5 12/08/2023 Everton Fulham 0 1 A
## 6 12/08/2023 Sheffield United Crystal Palace 0 1 A
## Best team at home
home_team_win_rate <- match_data %>% group_by(HomeTeam) %>% summarise(home_win_rate = mean(FTR == 'H'))
away_team_win_rate <- match_data %>% group_by(AwayTeam) %>% summarise(away_win_rate = mean(FTR == 'A'))
home_away_win_rates <-home_team_win_rate %>% left_join(away_team_win_rate,
join_by('HomeTeam' == 'AwayTeam')) %>% rename(Team = HomeTeam) %>%
mutate('home_adv' = home_win_rate - away_win_rate) %>%
arrange(desc(home_win_rate))
home_away_win_rates%>%
ggplot(aes(x = reorder(Team, -home_win_rate))) +
geom_point(aes(y = home_win_rate, colour = 'Home Win Rate')) +
geom_point(aes(y = away_win_rate, colour = 'Away Win Rate')) +
geom_segment(aes(x = Team, xend = Team, y = home_win_rate, yend = away_win_rate),
color = "grey", linetype = "dotted", size = 1) +
labs(title = 'Home and Away win rates EPL teams',
x = 'Teams', y = 'Win Rates', color = 'Home/away')+
theme_minimal() +
theme(axis.text.x = element_text(angle = 90))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Man City is the most consistent team home and away in winning, and that could be the reason for them being the defending champions. Liverpool and Arsenal have more variance, and perform less well in away games. For them to contest Man City for the title, it could be imperative for them to win their away games more often.
## Data preparation: convert team names to numeric indices
match_data$HomeTeamId <- as.integer(factor(match_data$HomeTeam))
match_data$AwayTeamId <- as.integer(factor(match_data$AwayTeam))
## verify
match_data %>% filter(HomeTeam == 'Liverpool') %>% pull(HomeTeamId) %>% unique()
## [1] 11
match_data %>% filter(AwayTeam == 'Liverpool') %>% pull(AwayTeamId) %>% unique()
## [1] 11
## Finding the main outcome variable -- goal difference
match_data['GoalDiff'] <- match_data$FTHG - match_data$FTAG
## Visualise
ggplot(match_data) + geom_bar(aes(x = GoalDiff),
fill = 'lightblue') +
scale_x_continuous(breaks = seq(min(match_data$GoalDiff),
max(match_data$GoalDiff),
by = 1)) +
labs(title = 'Distribution of goal difference (Home - Away)')+
theme_bw()
Home minus Away goals
Interestingly, goal difference is relatively normal looking, although the drop off is more steep when moving left of 0 than right.
A popular method for estimating team/player strengths from analysing pairwise datapoints (i.e. matches between two players). It assumes that each comparison is an independent datapoint, which is obviously contentious in most sporting scenarios.
unique_teams <- sort(unique(c(combined_23_24$Home,
combined_23_24$Away)))
## According to the blog, we need to get a specific data matrix
# such that for each row of the game data, we have a vector of n_teams
# where the element of the home team is 1 and that of away team is -1
# else its 0
## get dataframe for bradley terry model
get_data_vec <- function(home, away, unique_teams){
## init the vec -- matrix with indicator variable for the teams if theyre playing each other
vec <- rep(0, length(unique_teams))
## home teams: 1, away teams: -1
vec[unique_teams == home] <- 1
vec[unique_teams == away] <- -1
return(vec)
} ## this function applies to a single row of data and creates a corresponding row -- hence we need to iteratively apply across all rows of the data
X <- apply(match_data, 1,## 1 for row wise applying
function(row) get_data_vec(row['HomeTeam'],
row['AwayTeam'],
unique_teams))
## check
dim(X)
## [1] 20 380
X <- t(X) ## transpose such that every row now is a match
colnames(X) <- unique_teams
## Create Y = binary indicator for home team winning
y <- match_data$GoalDiff > 1
y <- as.integer(y)
## Recall that we have a lot of instances for draws. We can try
# considering draws as a loss for the home team, since they are supposed to have
# home advantage, hence all else equal it is a disappointing result for them
## prepare df
bt_df <- as.data.frame(cbind(X,y))
bt_model <- glm(y ~ ., data = bt_df, family = 'binomial')
## See results!
bt_results_df <- data.frame('coeff' = bt_model$coefficients)
bt_results_df <- bt_results_df %>% arrange(desc(coeff))
bt_results_df['Team'] <- rownames(bt_results_df); rownames(bt_results_df) <- NULL
bt_results_df <- bt_results_df %>% mutate(Team = ifelse(Team == '(Intercept)', 'HomeAdv', Team)) %>%
mutate(Team = str_replace(Team, pattern = '`', replacement = '')) %>%
mutate(Team = str_replace(Team, pattern = '`', replacement = '')) %>%
mutate(coeff = ifelse(is.na(coeff), 0, coeff))
library(ggrepel)
bt_results_df %>%
ggplot(aes(x = reorder(Team, -coeff))) + # Reorder by coefficient descending
geom_segment(aes(x = Team, xend = Team, y = 0, yend = coeff),
color = "grey", linetype = "dotted") + # Vertical lines from 0 to coefficient
geom_point(aes(y = coeff), color = "blue", size = 3) + # Highlight points
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + # Rotate team names
labs(title = "Team Strength Coefficients from Bradley-Terry Model",
x = "Teams",
y = "Coefficient (Team Strength)") # Add informative labels
bt_results_df %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>% select(Team, coeff, win_rate) %>%
ggplot(aes(x = win_rate, y = coeff)) +
geom_point() +
geom_text_repel(aes(label = Team)) +
labs(x = 'Win Rate', y = 'BT Model Coefficients',
title = 'Relationship between win base rate and the Terry Bradley coefficients', subtitle = '2022-2024' ) +
theme_minimal()
## Joining with `by = join_by(Team)`
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).
There is a general agreement between the win rates and the BT Model
coefficients.
The difference is that win rates is agnostic, whereas the BT model takes into account the strength of your opponents.
## prepare df
bt_model_no_incpt <- glm(y ~ . -1, data = bt_df, family = 'binomial')
## See results!
bt_results_df_no_intcpt <- data.frame('coeff' = bt_model_no_incpt$coefficients)
bt_results_df_no_intcpt <- bt_results_df_no_intcpt %>% arrange(desc(coeff))
bt_results_df_no_intcpt['Team'] <- rownames(bt_results_df_no_intcpt); rownames(bt_results_df_no_intcpt) <- NULL
bt_results_df_no_intcpt <- bt_results_df_no_intcpt %>% mutate(Team = ifelse(Team == '(Intercept)', 'HomeAdv', Team)) %>%
mutate(Team = str_replace(Team, pattern = '`', replacement = '')) %>%
mutate(Team = str_replace(Team, pattern = '`', replacement = '')) %>%
mutate(coeff = ifelse(is.na(coeff), 0, coeff))
bt_results_df_no_intcpt %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>% select(Team, coeff, win_rate) %>%
ggplot(aes(x = win_rate, y = coeff)) +
geom_point() +
geom_text_repel(aes(label = Team)) +
labs(x = 'Win Rate', y = 'BT Model Coefficients',
title = 'Relationship between win base rate and the Terry Bradley coefficients', subtitle = '2022-2024' ) +
theme_minimal()
## Joining with `by = join_by(Team)`
## Create map and team indices
teams_index_map <- setNames(seq_along(unique_teams), unique_teams)
match_data <-match_data %>% mutate(
WinnerIdx = ifelse(FTR == 'H', teams_index_map[HomeTeam],
teams_index_map[AwayTeam]),
LoserIdx = ifelse(FTR == 'A', teams_index_map[HomeTeam],
teams_index_map[AwayTeam]))
## Create negative log likelihood function
neg_log_likelihood_func <- function(theta_rest, data) {
theta <- c(theta_rest, 0) ## set last team's theta to 0
winnerIdx <- data$WinnerIdx
loserIdx <- data$LoserIdx
lin_pred <- theta[winnerIdx] - theta[loserIdx]
prob <- (1+exp(-lin_pred))^-1 ##logistic formula
log_prob <- log(prob)
total_log_lik = sum(log_prob, na.rm = T)## iteratively add to the total log likelihood
return(-total_log_lik)
}
## Init the thetas that are going to be updated
init_thetas <- rep(0, length(teams_index_map) -1)
## OPTIMISE!
manual_BT_results <- optim(par = init_thetas, fn = neg_log_likelihood_func, data = match_data, method = 'L-BFGS')
manual_BT_coeffs <- manual_BT_results$par
manual_BT_df <- data.frame(Team = names(teams_index_map),
bt_manual_coeffs = c(manual_BT_coeffs, 0)) %>% arrange(desc(bt_manual_coeffs))
manual_BT_df
## Team bt_manual_coeffs
## 1 Man City 2.45031181
## 2 Arsenal 2.04181355
## 3 Liverpool 1.97081368
## 4 Aston Villa 1.12632319
## 5 Tottenham 0.77543920
## 6 Chelsea 0.69566489
## 7 Newcastle 0.64269216
## 8 Man United 0.52776517
## 9 West Ham 0.29222555
## 10 Brighton 0.29082672
## 11 Crystal Palace 0.20154869
## 12 Everton 0.09064414
## 13 Bournemouth 0.07688813
## 14 Fulham 0.07016092
## 15 Wolves 0.00000000
## 16 Nott'm Forest -0.42440288
## 17 Brentford -0.54727382
## 18 Luton -1.27204220
## 19 Burnley -1.50094394
## 20 Sheffield United -2.25301390
manual_BT_df %>%
ggplot(aes(x = reorder(Team, -bt_manual_coeffs))) + # Reorder by coefficient descending
geom_segment(aes(x = Team, xend = Team, y = 0, yend = bt_manual_coeffs),
color = "grey", linetype = "dotted") + # Vertical lines from 0 to coefficient
geom_point(aes(y = bt_manual_coeffs), color = "blue", size = 3) + # Highlight points
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + # Rotate team names
labs(title = "Team Strength Coefficients from MLE (BT Model)",
x = "Teams",
y = "Coefficient (Team Strength)") # Add informative labels
This is an improvement from the previous method that uses GLM, as this
prevents teams from having the same results. This also naturally
excludes the home advantage parameter/ does not factor in home
advantage.
manual_BT_df %>% left_join(home_away_win_rates) %>% mutate(win_rate = 0.5*(home_win_rate + away_win_rate))%>% select(Team, bt_manual_coeffs, win_rate) %>%
ggplot(aes(x = win_rate, y = bt_manual_coeffs)) +
geom_point() +
geom_text_repel(aes(label = Team)) +
labs(x = 'Win Rate', y = 'BT Model Coefficients',
title = 'Relationship between win base rate and the Terry Bradley coefficients (No intercept)', subtitle = '2022-2024' ) +
theme_minimal()
## Joining with `by = join_by(Team)`
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
The agreement is now better. In particular, the model correctly spots
that Arsenal has performed slightly better than Liverpool.