### Packages
library(tidyverse)
library(knitr)
library(lubridate)
library(DT)
library(plotly)
### Importing data
df <- read.csv("C:/Users/matth/OneDrive/Documents/INFO_H510/spi_matches.csv")
### Subsetting to only include the top 5 leagues
df_top_leagues <- df |>
filter(league %in% c("Barclays Premier League", "French Ligue 1", "Italy Serie A", "Spanish Primera Division", "German Bundesliga"))
In this section, we explore how match characteristics vary when
grouping the data in different ways.
Each grouping summarizes match-level outcomes and allows us to reason
about frequency, probability, and rarity of different match types.
First, we will look at groupings by league and explore how many matches occur in each league throughout the dataset.
### Grouped data frame by league
league_summary <- df_top_leagues |>
# Group by league
group_by(league) |>
summarise(
matches = n(),
# Average and median total goals
avg_total_goals = mean(score1 + score2, na.rm = TRUE),
median_total_goals = median(score1 + score2, na.rm = TRUE)
) |>
arrange(matches) |>
# Add probabilities
mutate(probability = matches / sum(matches))
league_summary
## # A tibble: 5 × 5
## league matches avg_total_goals median_total_goals probability
## <chr> <int> <dbl> <dbl> <dbl>
## 1 German Bundesliga 1530 3.02 3 0.168
## 2 Barclays Premier League 1900 2.74 3 0.208
## 3 French Ligue 1 1900 2.64 3 0.208
## 4 Italy Serie A 1900 2.88 3 0.208
## 5 Spanish Primera Divisi… 1900 2.64 2 0.208
### Plot the number of matches for each league with counts and color
ggplot(league_summary, aes(x = reorder(league, matches), y = matches, fill = league)) +
geom_col() +
geom_text(aes(label = matches),
hjust = -0.1, # nudges the text slightly past the bar
size = 3) +
coord_flip() +
labs(
title = "Number of Matches by League",
x = "League",
y = "Match Count",
fill = "League"
) +
theme_minimal() +
theme(legend.position = "none") # optional: hides the redundant legend
From the table and plot, we can see that the Primera Division, Serie A, Ligue 1, and Premier League have the same amount of games (1900). The Bundesliga has only 1530 games, though. This is because the Bundesliga has less teams (18) than the other 4 leagues (20), so they play 4 less games in a season. Thus, the probability that we select a game from one of the other 4 leagues is around .208, while the probability we select a Bundesliga game is 0.168.
Hypothesis: The Bundesliga has less games in the dataset because there are less teams that play in the league in each season.
Next, we will look at grouping by what month the match occurs in.
### Group by month of the match
month_summary <- df_top_leagues |>
# Extract month
mutate(month = month(date, label = TRUE)) |>
# Group by month of the match
group_by(month) |>
summarise(
matches = n(),
avg_total_goals = mean(score1 + score2, na.rm = TRUE)
) |>
arrange(matches) |>
# Probabilities
mutate(probability = matches / sum(matches))
month_summary
## # A tibble: 12 × 4
## month matches avg_total_goals probability
## <ord> <int> <dbl> <dbl>
## 1 Jun 152 2.77 0.0166
## 2 Jul 211 2.88 0.0231
## 3 Aug 514 2.67 0.0563
## 4 Mar 755 2.77 0.0827
## 5 May 756 3.08 0.0828
## 6 Nov 824 2.84 0.0903
## 7 Oct 865 2.78 0.0947
## 8 Jan 920 2.73 0.101
## 9 Sep 942 2.77 0.103
## 10 Apr 1022 2.79 0.112
## 11 Feb 1026 2.66 0.112
## 12 Dec 1143 2.71 0.125
### Plot of match frequency per month
ggplot(month_summary, aes(x = month, y = matches, fill = matches)) +
geom_col() +
# Add number labels
geom_text(aes(label = matches),
vjust = -0.5, size = 3) + # counts above bars
# Gradient for match count
scale_fill_gradient(low = "lightblue", high = "steelblue") + # gradient by frequency
labs(
title = "Match Frequency by Month",
x = "Month",
y = "Match Count",
fill = "Matches"
) +
theme_minimal()
From the visuals, we can see that the least amount of games occur in the summer months, while the most occur in December. This is interesting, though, because all 5 leagues generally run from mid-August to mid-May, so one would not expect there to be any games in June or July. However, when the COVID pandemic hit, the last third to quarter of each league’s games were postponed until the summer. In terms of the high quantity of December games, most leagues generally have a higher number of games around the “festive” period, where teams may play league games 2-3 times a week rather than just on the weekends.
Hypothesis: The reason for the existence of games in June and July is because of the COVID pandemic, so the only June/July games should be in 2020. Also, the reason December has more games is due to higher fixture congestion, so December should have the highest amount of “double match” weeks.
# Reshape Home/Away
importance_long <- df_top_leagues |>
select(league, importance1, importance2, score1, score2) |>
# Rename as home and away instead of team1 and team2
mutate(
home_away = "Home",
importance = importance1,
goals = score1
) |>
bind_rows(
df_top_leagues |>
mutate(
home_away = "Away",
importance = importance2,
goals = score2
) |>
select(league, home_away, importance, goals)
) |>
filter(!is.na(importance))
# Bin importance into 5 intervals (0-20, 20-40, etc.)
importance_long <- importance_long |>
mutate(
importance_bin = cut(
importance,
breaks = seq(0, 100, 20),
include.lowest = TRUE,
right = FALSE,
labels = c("0-20", "20-40", "40-60", "60-80", "80-100")
)
)
# Group by bin and home/away
importance_summary <- importance_long |>
# Grouping by importance and home/away
group_by(importance_bin, home_away) |>
summarise(
matches = n(),
avg_goals = mean(goals, na.rm = TRUE),
median_goals = median(goals, na.rm = TRUE),
.groups = "drop"
) |>
arrange(importance_bin) |>
# Probabilities
mutate(probability = matches / sum(matches))
importance_summary
## # A tibble: 10 × 6
## importance_bin home_away matches avg_goals median_goals probability
## <fct> <chr> <int> <dbl> <dbl> <dbl>
## 1 0-20 Away 3050 1.17 1 0.173
## 2 0-20 Home 2953 1.48 1 0.167
## 3 20-40 Away 2518 1.09 1 0.143
## 4 20-40 Home 2481 1.37 1 0.140
## 5 40-60 Away 1824 1.26 1 0.103
## 6 40-60 Home 1861 1.46 1 0.105
## 7 60-80 Away 984 1.57 1 0.0557
## 8 60-80 Home 1015 1.99 2 0.0575
## 9 80-100 Away 455 1.66 1 0.0258
## 10 80-100 Home 521 2.11 2 0.0295
# Bar chart of match counts by importance bins
ggplot(importance_summary, aes(x = importance_bin, y = matches, fill = home_away)) +
# separate Home vs Away
geom_col(position = position_dodge(width = 0.8)) +
geom_text(aes(label = matches),
position = position_dodge(width = 0.8),
vjust = -0.5, size = 3) +
labs(
title = "Number of Matches by Importance Bin and Team Location",
x = "Match Importance Bin",
y = "Match Count",
fill = "Team Location"
) +
# Color by home/away
scale_fill_manual(values = c("Home" = "steelblue", "Away" = "salmon")) +
theme_minimal()
From the plot, we can see that the highest frequency of matches occurred where the match importance was between 0 and 20, indicating that we are most likely to select a game that is generally “unimportant” over any other bin of match importance. The most important (80-100) games are the least frequent. Another interesting trend here is that the average goals for the more important games is generally higher for the most important games, and increases as importance increases for both the home and away team.
Hypothesis: Match Importance is a significant variable in terms of predicting how many goals a team will score in a match. The slope for this parameter would be positive (increase match importance = increase goals scored).
Next, we will look at what teams are appearing in each league. Obviously, there will be many teams that never play in a certain league, as they are from a different country. What will be more interesting is seeing the teams that have been in the league in all 5 seasons or only 4, 3, 2, or 1 of the seasons due to promotion/relegation.
# Count seasons each team appeared in each league
team_league_summary <- df_top_leagues |>
select(season, league, team1) |>
distinct() |> # unique team-season pairs
group_by(league, team1) |>
summarise(
seasons_played = n(),
.groups = "drop"
) |>
mutate(
category = case_when(
seasons_played == 5 ~ "5 Seasons",
seasons_played == 4 ~ "4 Seasons",
seasons_played == 3 ~ "3 Seasons",
seasons_played == 2 ~ "2 Seasons",
seasons_played == 1 ~ "1 Season"
)
)
# Create list of all teams (home + away) for missing
all_teams <- unique(c(df_top_leagues$team1, df_top_leagues$team2))
# Find teams that never appeared in a league
missing_teams <- expand_grid(
league = unique(df_top_leagues$league),
team1 = all_teams
) |>
anti_join(team_league_summary, by = c("league", "team1")) |>
mutate(category = "0 Seasons")
# Combine
team_league_full <- bind_rows(team_league_summary, missing_teams)
### Build a temporary "untidy" dataframe for a table visualization
team_league_wide <- team_league_full |>
group_by(league, category) |>
summarise(
teams = paste(head(team1, 20), collapse = ", "), # cap at 20 teams per cell
.groups = "drop"
) |>
pivot_wider(
names_from = category,
values_from = teams
) |>
arrange(league)
# Table showing league x team combinations
datatable(
team_league_wide,
filter = "top",
options = list(pageLength = 5),
rownames = FALSE
)
### Interactive Bar plot to see count with teams that appear in the league 0, 1, 2, ... times
# Get counts
team_league_counts <- team_league_full |>
group_by(league, category) |>
summarise(
n_teams = n(),
.groups = "drop"
)
# Unique leagues
leagues <- unique(team_league_counts$league)
# Initialize plotly object
p <- plot_ly()
# Add one trace per league
for (i in seq_along(leagues)) {
league_data <- team_league_counts |>
filter(league == leagues[i])
p <- add_trace(
p,
x = league_data$category,
y = league_data$n_teams,
type = 'bar',
name = leagues[i],
text = league_data$n_teams, # counts
textposition = 'outside', # force outside bars
insidetextanchor = 'middle', # ensure centered
textfont = list(size = 12),
marker = list(color = c("steelblue", "forestgreen", "gold", "orange", "red", "grey")),
visible = ifelse(i == 1, TRUE, FALSE)
)
}
# Create dropdown buttons for leagues
buttons <- lapply(seq_along(leagues), function(i) {
list(
method = "update",
args = list(
list(visible = sapply(seq_along(leagues), function(j) j == i))
),
label = leagues[i]
)
})
# Layout with dropdown
p <- layout(
p,
title = "Number of Teams per Participation Category",
xaxis = list(title = "Seasons Appeared"),
yaxis = list(title = "Number of Teams"),
updatemenus = list(
list(
y = 1.1,
buttons = buttons
)
),
showlegend = FALSE
)
p
As expected, most of the leagues have the highest count of teams in the “never appeared” category, since these teams would be playing in another country’s league. Each of the leagues have 6 or less teams in the 1-4 seasons category, with 12 to 15 teams in the “all 5 seasons” category. The Premier League and Primera Division have the most even distribution among the teams not in the 0 or 5 seasons category, and interestingly no team has appeared only 4 times in the Serie A among these 5 seasons.
It would be very interesting to explore win/point totals with the teams that only appear in the league in 1-4 seasons, for the seasons they were in the league. I would hypothesize that the teams in the 1 season category (e.g. Cardiff City) would generally have less points per season than those in the other categories (e.g. Watford). Specifically, I would hypothesize that in the season the teams in the other categories got relegated (if applicable), the teams in the 1 season category would have fewer wins/points due to greater team quality differences between them and the other teams in the league. My logic is that if they were “better”, they would be in the league more. I would be interested to see if that holds.