We’ll walk through the analysis of the Bundesliga 22-23 football match results. We’ll load the data, preprocess it, calculate Elo ratings, and generate the points for each team in a final standings format.
We start by loading the required libraries, including tidyverse for data manipulation and worldfootballR for accessing football match data.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(worldfootballR)
## Warning: package 'worldfootballR' was built under R version 4.2.3
We load the Bundesliga 2023 data, specifying the country, gender, tier, and season-end year.
Bundesliga2023 <- worldfootballR::fb_match_results(
country = 'GER',
gender = 'M',
tier = "1st",
season_end_year = 2023
)
We examine the structure of the loaded data.
str(Bundesliga2023)
## 'data.frame': 308 obs. of 20 variables:
## $ Competition_Name: chr "Fußball-Bundesliga" "Fußball-Bundesliga" "Fußball-Bundesliga" "Fußball-Bundesliga" ...
## $ Gender : chr "M" "M" "M" "M" ...
## $ Country : chr "GER" "GER" "GER" "GER" ...
## $ Season_End_Year : int 2023 2023 2023 2023 2023 2023 2023 2023 2023 2023 ...
## $ Round : chr "Regular season" "Regular season" "Regular season" "Regular season" ...
## $ Wk : chr "1" "1" "1" "1" ...
## $ Day : chr "Fri" "Sat" "Sat" "Sat" ...
## $ Date : Date, format: "2022-08-05" "2022-08-06" ...
## $ Time : chr "20:30" "15:30" "15:30" "15:30" ...
## $ Home : chr "Eint Frankfurt" "Wolfsburg" "Augsburg" "Union Berlin" ...
## $ HomeGoals : num 1 2 0 3 3 1 1 1 3 0 ...
## $ Home_xG : num 0.9 1 0.7 1 1.9 0.9 1.8 0.8 4.1 0.7 ...
## $ Away : chr "Bayern Munich" "Werder Bremen" "Freiburg" "Hertha BSC" ...
## $ AwayGoals : num 6 2 4 1 1 2 0 1 1 3 ...
## $ Away_xG : num 4 0.8 1.3 0.3 0.1 1.4 1.3 1.6 0.2 3.2 ...
## $ Attendance : num 51500 28015 25310 22012 49659 ...
## $ Venue : chr "Deutsche Bank Park" "Volkswagen Arena" "WWK Arena" "Stadion An der Alten Försterei" ...
## $ Referee : chr "Deniz Aytekin" "Sascha Stegemann" "Bastian Dankert" "Marco Fritz" ...
## $ Notes : chr "" "" "" "" ...
## $ MatchURL : chr "https://fbref.com/en/matches/6a1c18c4/Eintracht-Frankfurt-Bayern-Munich-August-5-2022-Bundesliga" "https://fbref.com/en/matches/1591be13/Wolfsburg-Werder-Bremen-August-6-2022-Bundesliga" "https://fbref.com/en/matches/3c5c48d7/Augsburg-Freiburg-August-6-2022-Bundesliga" "https://fbref.com/en/matches/58e704e1/Union-Berlin-Hertha-BSC-August-6-2022-Bundesliga" ...
We convert the ‘Wk’ column to a numeric data type.
Bundesliga2023$Wk <- as.numeric(Bundesliga2023$Wk)
We rename columns to make them more understandable and descriptive.
Bundesliga2023 <- Bundesliga2023 %>%
rename(
team = Home,
opp = Away,
teamScore = HomeGoals,
oppScore = AwayGoals,
round = Wk,
Match_ID = Round
)
We only want to include the German Regular Season, not the DFB Pokal rounds
Bundesliga2023 <- Bundesliga2023 %>%
filter(Match_ID == "Regular season")
Tidying up the data frame
Bundesliga2023 <- Bundesliga2023 %>%
select('round','team','opp', 'teamScore', 'oppScore')
Setting up the foundation for the Elo Model
Bundesliga2023 <- Bundesliga2023 %>%
mutate(score = ifelse(teamScore > oppScore, 1,
ifelse(teamScore < oppScore, 0, 0.5)))
bn_new <- Bundesliga2023 %>%
mutate(normalized = purrr::map2_chr(team, opp, ~paste(sort(c(.x, .y)), collapse = ""))) %>%
group_by(round, normalized) %>%
summarise(team = dplyr::first(team),
opp = dplyr::first(opp)) %>%
select(-normalized)
## `summarise()` has grouped output by 'round'. You can override using the
## `.groups` argument.
bn_new <- merge(bn_new, Bundesliga2023)
bn_new <- bn_new %>%
arrange(round)
This produces our Elo Ratings for each team.
BN1_elo <- PlayerRatings::elo(
bn_new[c('round', 'team','opp','score')],
init = 2200,
k = 27,
history = T)
Create a new Data frame with all the values included from the Elo Model in the previous section
BN1_elo <- list(
"Dortmund" = list(Rating = 2324, Games = 34, Win = 22, Draw = 5, Loss = 7, Lag = 0),
"Bayern Munich" = list(Rating = 2319, Games = 34, Win = 21, Draw = 8, Loss = 5, Lag = 0),
"RB Leipzig" = list(Rating = 2309, Games = 34, Win = 20, Draw = 6, Loss = 8, Lag = 0),
"Union Berlin" = list(Rating = 2258, Games = 34, Win = 18, Draw = 8, Loss = 8, Lag = 0),
"Freiburg" = list(Rating = 2245, Games = 34, Win = 17, Draw = 8, Loss = 9, Lag = 0),
"Leverkusen" = list(Rating = 2225, Games = 34, Win = 14, Draw = 8, Loss = 12, Lag = 0),
"Wolfsburg" = list(Rating = 2211, Games = 34, Win = 13, Draw = 10, Loss = 11, Lag = 0),
"Eint Frankfurt" = list(Rating = 2210, Games = 34, Win = 13, Draw = 11, Loss = 10, Lag = 0),
"Mainz 05" = list(Rating = 2191, Games = 34, Win = 12, Draw = 10, Loss = 12, Lag = 0),
"Köln" = list(Rating = 2186, Games = 34, Win = 10, Draw = 12, Loss = 12, Lag = 0),
"M'Gladbach" = list(Rating = 2180, Games = 34, Win = 11, Draw = 10, Loss = 13, Lag = 0),
"Bochum" = list(Rating = 2154, Games = 34, Win = 10, Draw = 5, Loss = 19, Lag = 0),
"Stuttgart" = list(Rating = 2150, Games = 34, Win = 7, Draw = 12, Loss = 15, Lag = 0),
"Schalke 04" = list(Rating = 2147, Games = 34, Win = 7, Draw = 10, Loss = 17, Lag = 0),
"Hoffenheim" = list(Rating = 2143, Games = 34, Win = 10, Draw = 6, Loss = 18, Lag = 0),
"Augsburg" = list(Rating = 2122, Games = 34, Win = 9, Draw = 7, Loss = 18, Lag = 0),
"Werder Bremen" = list(Rating = 2116, Games = 34, Win = 10, Draw = 6, Loss = 18, Lag = 0),
"Hertha BSC" = list(Rating = 2109, Games = 34, Win = 7, Draw = 8, Loss = 19, Lag = 0)
)
BN1_elo_DF <- do.call(rbind, lapply(names(BN1_elo), function(team) {
data.frame(
Player = team,
Rating = BN1_elo[[team]]$Rating,
Games = BN1_elo[[team]]$Games,
Win = BN1_elo[[team]]$Win,
Draw = BN1_elo[[team]]$Draw,
Loss = BN1_elo[[team]]$Loss,
Lag = BN1_elo[[team]]$Lag
)
}))
rownames(BN1_elo_DF) <- NULL
BN1_elo_DF <- BN1_elo_DF %>%
select('Player' , 'Rating', 'Games', 'Win', 'Draw', 'Loss')
BN1_elo_DF <- BN1_elo_DF %>%
rename(Team = Player)
BN1_elo_DF$Win = as.numeric(BN1_elo_DF$Win)
BN1_elo_DF$Draw = as.numeric(BN1_elo_DF$Draw)
BN1_elo_DF$Points <- 3*BN1_elo_DF$Win + 1*BN1_elo_DF$Draw
BN1_elo_DF
## Team Rating Games Win Draw Loss Points
## 1 Dortmund 2324 34 22 5 7 71
## 2 Bayern Munich 2319 34 21 8 5 71
## 3 RB Leipzig 2309 34 20 6 8 66
## 4 Union Berlin 2258 34 18 8 8 62
## 5 Freiburg 2245 34 17 8 9 59
## 6 Leverkusen 2225 34 14 8 12 50
## 7 Wolfsburg 2211 34 13 10 11 49
## 8 Eint Frankfurt 2210 34 13 11 10 50
## 9 Mainz 05 2191 34 12 10 12 46
## 10 Köln 2186 34 10 12 12 42
## 11 M'Gladbach 2180 34 11 10 13 43
## 12 Bochum 2154 34 10 5 19 35
## 13 Stuttgart 2150 34 7 12 15 33
## 14 Schalke 04 2147 34 7 10 17 31
## 15 Hoffenheim 2143 34 10 6 18 36
## 16 Augsburg 2122 34 9 7 18 34
## 17 Werder Bremen 2116 34 10 6 18 36
## 18 Hertha BSC 2109 34 7 8 19 29