# Libraries
library(tidyverse)
library(tibble)
library(knitr)
library(kableExtra)
### Importing data
df <- read.csv("C:/Users/matth/OneDrive/Documents/INFO_H510/spi_matches.csv")
### Subsetting to only include the top 5 leagues
df_top_leagues <- df |>
filter(league %in% c("Barclays Premier League", "French Ligue 1", "Italy Serie A", "Spanish Primera Division", "German Bundesliga"))
I am looking at trends within the data based on what we see when we take random samples of different data proportions from the dataset. These will be done with replacement, so we can simulate a data collection process of selecting some number or ratio of games across different seasons. This lets us look at how randomness affects any conclusions I may draw from my analysis.
As previously stated, these will be drawn with replacement. At each step, I will draw 4 samples of different fractions of the data, starting with 25%. I will look at goal/goal-related metric trends as well as league and season proportions. These will be compared across samples that take 10%, 25%, and 75% of the data. This will give us a good luck at multiple different sample sizes, leaves the potential for duplicates, and we can explore how sample size affects the summary statistics I will explore.
### Sampling (25%)
set.seed(1427)
# Parameters
sample_frac <- 0.25
n_samples <- 4
sample_size <- floor(sample_frac * nrow(df_top_leagues))
# Empty df for samples
df_samples <- tibble()
# sampling
for (sample_i in 1:n_samples) {
df_i <- df_top_leagues |>
sample_n(
size = sample_size,
replace = TRUE
) |>
mutate(sample_num = sample_i)
df_samples <- bind_rows(df_samples, df_i)
}
# quick sanity check
df_samples |> count(sample_num)
## # A tibble: 4 Ă— 2
## sample_num n
## <int> <int>
## 1 1 2282
## 2 2 2282
## 3 3 2282
## 4 4 2282
### 10% Sampling
# Parameters
sample_frac <- 0.10
sample_size <- floor(sample_frac * nrow(df_top_leagues))
# Empty dataframe
df_samples_10 <- tibble()
# sampling
for (sample_i in 1:n_samples) {
df_i <- df_top_leagues |>
sample_n(
size = sample_size,
replace = TRUE
) |>
mutate(sample_num = sample_i)
df_samples_10 <- bind_rows(df_samples_10, df_i)
}
# sanity check
df_samples_10 |> count(sample_num)
## # A tibble: 4 Ă— 2
## sample_num n
## <int> <int>
## 1 1 913
## 2 2 913
## 3 3 913
## 4 4 913
### 10% Sampling
# Parameters
sample_frac <- 0.75
sample_size <- floor(sample_frac * nrow(df_top_leagues))
# Empty dataframe
df_samples_75 <- tibble()
# sampling
for (sample_i in 1:n_samples) {
df_i <- df_top_leagues |>
sample_n(
size = sample_size,
replace = TRUE
) |>
mutate(sample_num = sample_i)
df_samples_75 <- bind_rows(df_samples_75, df_i)
}
# sanity check
df_samples_75 |> count(sample_num)
## # A tibble: 4 Ă— 2
## sample_num n
## <int> <int>
## 1 1 6847
## 2 2 6847
## 3 3 6847
## 4 4 6847
Here, we take a quick sanity check of the samples to ensure we collected even samples. Across the 10%, 25%, and 75% samples we appear to have even samples in each group.
### Check distribution of leagues by sub sample
league_dist <- df_samples |>
count(sample_num, league) |>
group_by(sample_num) |>
mutate(prop = n / sum(n)) |>
ungroup()
league_dist
## # A tibble: 20 Ă— 4
## sample_num league n prop
## <int> <chr> <int> <dbl>
## 1 1 Barclays Premier League 477 0.209
## 2 1 French Ligue 1 479 0.210
## 3 1 German Bundesliga 370 0.162
## 4 1 Italy Serie A 483 0.212
## 5 1 Spanish Primera Division 473 0.207
## 6 2 Barclays Premier League 498 0.218
## 7 2 French Ligue 1 462 0.202
## 8 2 German Bundesliga 380 0.167
## 9 2 Italy Serie A 492 0.216
## 10 2 Spanish Primera Division 450 0.197
## 11 3 Barclays Premier League 452 0.198
## 12 3 French Ligue 1 434 0.190
## 13 3 German Bundesliga 424 0.186
## 14 3 Italy Serie A 497 0.218
## 15 3 Spanish Primera Division 475 0.208
## 16 4 Barclays Premier League 454 0.199
## 17 4 French Ligue 1 490 0.215
## 18 4 German Bundesliga 374 0.164
## 19 4 Italy Serie A 478 0.209
## 20 4 Spanish Primera Division 486 0.213
### Distribution plot
ggplot(league_dist, aes(x = league, y = prop, fill = factor(sample_num))) +
geom_col(position = "dodge") +
labs(
title = "League Composition Across Random Subsamples (25%)",
x = "League",
y = "Proportion of Matches",
fill = "Sample"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
On a broad level, the league distribution is pretty similar across the 4 subsamples. There are some small differences, but these can be expected under a random sampling process. The only league that clearly has less matches is the Bundesliga, but there are less Bundesliga games in the full data since they only play 34 matches per season rather than the 38 that the other 4 leagues play. The samples roughly appear to match their actual proportion in the dataset.
### Season Distribution
season_dist <- df_samples |>
count(sample_num, season) |>
group_by(sample_num) |>
mutate(prop = n / sum(n)) |>
ungroup()
season_dist
## # A tibble: 20 Ă— 4
## sample_num season n prop
## <int> <int> <int> <dbl>
## 1 1 2016 451 0.198
## 2 1 2017 494 0.216
## 3 1 2018 454 0.199
## 4 1 2019 426 0.187
## 5 1 2020 457 0.200
## 6 2 2016 464 0.203
## 7 2 2017 458 0.201
## 8 2 2018 434 0.190
## 9 2 2019 464 0.203
## 10 2 2020 462 0.202
## 11 3 2016 465 0.204
## 12 3 2017 456 0.200
## 13 3 2018 430 0.188
## 14 3 2019 448 0.196
## 15 3 2020 483 0.212
## 16 4 2016 441 0.193
## 17 4 2017 472 0.207
## 18 4 2018 470 0.206
## 19 4 2019 450 0.197
## 20 4 2020 449 0.197
ggplot(season_dist, aes(x = season, y = prop, fill = factor(sample_num))) +
geom_col(position = "dodge") +
labs(
title = "Season Composition Across Random Subsamples (25%)",
x = "Season",
y = "Proportion of Matches",
fill = "Sample"
) +
theme_minimal()
Similar to with the leagues sampling, there do not appear to be any seasons with clean over or under-sampled games. The 2017 season seems to be the most sampled of the 5 seasons, with all samples having at least 20% of the games from that season (outside of sample 3 which is at 19.98%), but none have a large oversampling from this season. Otherwise, the proportions within the samples roughly align with the actual proportions of the full dataset (20% for each season).
league_dist <- df_samples_10 |>
count(sample_num, league) |>
group_by(sample_num) |>
mutate(prop = n / sum(n)) |>
ungroup()
league_dist
## # A tibble: 20 Ă— 4
## sample_num league n prop
## <int> <chr> <int> <dbl>
## 1 1 Barclays Premier League 166 0.182
## 2 1 French Ligue 1 190 0.208
## 3 1 German Bundesliga 162 0.177
## 4 1 Italy Serie A 193 0.211
## 5 1 Spanish Primera Division 202 0.221
## 6 2 Barclays Premier League 190 0.208
## 7 2 French Ligue 1 185 0.203
## 8 2 German Bundesliga 142 0.156
## 9 2 Italy Serie A 200 0.219
## 10 2 Spanish Primera Division 196 0.215
## 11 3 Barclays Premier League 200 0.219
## 12 3 French Ligue 1 194 0.212
## 13 3 German Bundesliga 150 0.164
## 14 3 Italy Serie A 177 0.194
## 15 3 Spanish Primera Division 192 0.210
## 16 4 Barclays Premier League 188 0.206
## 17 4 French Ligue 1 169 0.185
## 18 4 German Bundesliga 174 0.191
## 19 4 Italy Serie A 176 0.193
## 20 4 Spanish Primera Division 206 0.226
### Distribution plot
ggplot(league_dist, aes(x = league, y = prop, fill = factor(sample_num))) +
geom_col(position = "dodge") +
labs(
title = "League Composition Across Random Subsamples (10%)",
x = "League",
y = "Proportion of Matches",
fill = "Sample"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
For the 10% sampling, there again don’t appear to be many strong differences in the league proportions. While the differences are a bit larger here than in the 25% sampling, none appear to be too far off the actual proportion. The 3rd sample for the Bundesliga/Premier League are the only 2 that have close to suspect values, with the Bundesliga around 15% and Premier League above 23%, but again these do not appear to be too poor.
### Season Distribution
season_dist <- df_samples_10 |>
count(sample_num, season) |>
group_by(sample_num) |>
mutate(prop = n / sum(n)) |>
ungroup()
season_dist
## # A tibble: 20 Ă— 4
## sample_num season n prop
## <int> <int> <int> <dbl>
## 1 1 2016 184 0.202
## 2 1 2017 176 0.193
## 3 1 2018 201 0.220
## 4 1 2019 179 0.196
## 5 1 2020 173 0.189
## 6 2 2016 164 0.180
## 7 2 2017 205 0.225
## 8 2 2018 172 0.188
## 9 2 2019 173 0.189
## 10 2 2020 199 0.218
## 11 3 2016 173 0.189
## 12 3 2017 197 0.216
## 13 3 2018 171 0.187
## 14 3 2019 182 0.199
## 15 3 2020 190 0.208
## 16 4 2016 187 0.205
## 17 4 2017 174 0.191
## 18 4 2018 175 0.192
## 19 4 2019 188 0.206
## 20 4 2020 189 0.207
### Distribution plot
ggplot(season_dist, aes(x = season, y = prop, fill = factor(sample_num))) +
geom_col(position = "dodge") +
labs(
title = "Season Composition Across Random Subsamples (10%)",
x = "Season",
y = "Proportion of Matches",
fill = "Sample"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Similar to with leagues here, there are slightly bigger deviations from the expected proportion, but they are not too large. Sample 3 again has 1 potentially “suspect” value, with over 23% of games coming from the 2019 season, but this isn’t too far off the expected 20%.
league_dist <- df_samples_75 |>
count(sample_num, league) |>
group_by(sample_num) |>
mutate(prop = n / sum(n)) |>
ungroup()
league_dist
## # A tibble: 20 Ă— 4
## sample_num league n prop
## <int> <chr> <int> <dbl>
## 1 1 Barclays Premier League 1503 0.220
## 2 1 French Ligue 1 1399 0.204
## 3 1 German Bundesliga 1158 0.169
## 4 1 Italy Serie A 1392 0.203
## 5 1 Spanish Primera Division 1395 0.204
## 6 2 Barclays Premier League 1424 0.208
## 7 2 French Ligue 1 1404 0.205
## 8 2 German Bundesliga 1153 0.168
## 9 2 Italy Serie A 1434 0.209
## 10 2 Spanish Primera Division 1432 0.209
## 11 3 Barclays Premier League 1425 0.208
## 12 3 French Ligue 1 1409 0.206
## 13 3 German Bundesliga 1151 0.168
## 14 3 Italy Serie A 1399 0.204
## 15 3 Spanish Primera Division 1463 0.214
## 16 4 Barclays Premier League 1403 0.205
## 17 4 French Ligue 1 1459 0.213
## 18 4 German Bundesliga 1153 0.168
## 19 4 Italy Serie A 1433 0.209
## 20 4 Spanish Primera Division 1399 0.204
### Distribution plot
ggplot(league_dist, aes(x = league, y = prop, fill = factor(sample_num))) +
geom_col(position = "dodge") +
labs(
title = "League Composition Across Random Subsamples (75%)",
x = "League",
y = "Proportion of Matches",
fill = "Sample"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
This sample is the closest to what we would expect out of the 3 sampling groups. However, the 25% sampling did a similarly good job. There are no major deviations from expected for any of the 4 samples or 5 leagues.
### Season Distribution
season_dist <- df_samples_75 |>
count(sample_num, season) |>
group_by(sample_num) |>
mutate(prop = n / sum(n)) |>
ungroup()
season_dist
## # A tibble: 20 Ă— 4
## sample_num season n prop
## <int> <int> <int> <dbl>
## 1 1 2016 1329 0.194
## 2 1 2017 1374 0.201
## 3 1 2018 1422 0.208
## 4 1 2019 1318 0.192
## 5 1 2020 1404 0.205
## 6 2 2016 1340 0.196
## 7 2 2017 1337 0.195
## 8 2 2018 1370 0.200
## 9 2 2019 1399 0.204
## 10 2 2020 1401 0.205
## 11 3 2016 1404 0.205
## 12 3 2017 1342 0.196
## 13 3 2018 1328 0.194
## 14 3 2019 1383 0.202
## 15 3 2020 1390 0.203
## 16 4 2016 1356 0.198
## 17 4 2017 1362 0.199
## 18 4 2018 1389 0.203
## 19 4 2019 1311 0.191
## 20 4 2020 1429 0.209
### Distribution plot
ggplot(season_dist, aes(x = season, y = prop, fill = factor(sample_num))) +
geom_col(position = "dodge") +
labs(
title = "Season Composition Across Random Subsamples (75%)",
x = "Season",
y = "Proportion of Matches",
fill = "Sample"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Just like with leagues, this sample is the closest to what we would expect out of the 3 sampling groups. However, the 25% sampling did a similarly good job. There are no major deviations from expected for any of the 4 samples or 5 seasons.
# Adding a "sample size" label and compiling
df_all_samples <- bind_rows(
df_samples_10 |> mutate(sample_size = "10%"),
df_samples |> mutate(sample_size = "25%"),
df_samples_75 |> mutate(sample_size = "75%")
)
# Computing average home and away goals for each subsample
goal_summary_by_sample <- df_all_samples |>
group_by(sample_size, sample_num) |>
summarise(
home_mean_goals = mean(score1, na.rm = TRUE),
away_mean_goals = mean(score2, na.rm = TRUE),
n_matches = n(),
goal_diff = mean(score1 - score2, na.rm = TRUE),
.groups = "drop"
) |>
arrange(sample_size, sample_num)
# Computing averages for goals scored and score differential across samples
goal_summary_by_sample |>
mutate(
home_mean_goals = round(home_mean_goals, 2),
away_mean_goals = round(away_mean_goals, 2),
goal_diff = round(goal_diff, 2)
) |>
# Cleanly formatted table
kable(
caption = "Average Home vs Away Goals by Subsample and Sample Size",
col.names = c("Sample Size", "Subsample #", "Home Avg Goals", "Away Avg Goals", "Matches", "Home-Away Diff")
) |>
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed")) %>%
row_spec(0, bold = TRUE)
| Sample Size | Subsample # | Home Avg Goals | Away Avg Goals | Matches | Home-Away Diff |
|---|---|---|---|---|---|
| 10% | 1 | 1.51 | 1.26 | 913 | 0.25 |
| 10% | 2 | 1.53 | 1.26 | 913 | 0.27 |
| 10% | 3 | 1.57 | 1.16 | 913 | 0.40 |
| 10% | 4 | 1.56 | 1.24 | 913 | 0.32 |
| 25% | 1 | 1.53 | 1.23 | 2282 | 0.30 |
| 25% | 2 | 1.57 | 1.27 | 2282 | 0.30 |
| 25% | 3 | 1.56 | 1.23 | 2282 | 0.32 |
| 25% | 4 | 1.59 | 1.21 | 2282 | 0.38 |
| 75% | 1 | 1.54 | 1.23 | 6847 | 0.31 |
| 75% | 2 | 1.55 | 1.26 | 6847 | 0.29 |
| 75% | 3 | 1.56 | 1.22 | 6847 | 0.34 |
| 75% | 4 | 1.54 | 1.25 | 6847 | 0.30 |
# Reshaping for visualization
df_long <- df_all_samples |>
select(sample_size, sample_num, score1, score2) |>
pivot_longer(
cols = c(score1, score2),
names_to = "location",
values_to = "goals"
) |>
mutate(location = ifelse(location == "score1", "Home", "Away"))
# Box and whisker plots
ggplot(df_long, aes(x = location, y = goals, fill = location)) +
geom_boxplot(alpha = 0.7) +
facet_grid(sample_size ~ sample_num) +
labs(
title = "Goal Distributions by Subsample and Sample Size",
x = "Team Location",
y = "Goals Scored"
) +
theme_minimal() +
scale_fill_manual(values = c("Home" = "steelblue", "Away" = "firebrick"))
Much like with the leagues and seasons, there really isn’t a lot of variability for home/away goals scored, or score differential within the 4 samples of each group. The 10% sample size samples had the most variation, with average home goals ranging from 1.52 to 1.58, away goals from 1.19 to 1.25, and score differential from 0.31 to 0.33. Interestingly, the 25% sample size groups had the most variation in score differential. The 4th sample has an average score differential/home advantage of 0.38 goals, while the other 3 samples in the group are 0.3 or 0.32. While not a huge outlier, it is important to note, especially when a majority of soccer matches are decided by 0-2 goals. We must acknowledge the possible influence of sampling variance as a small factor in any trends we notice, especially if we either take a sample of our match dataset or try to apply out 5 season data set to a larger context.
In terms of the medians and boxplot distributions, there are not many differences of note. The 75% sampling seems like it catches more outliers, but has a larger sample size to go with it. We must be vigilant of the presence of outliers in our analyses, especially when we have a few scores where a team scored 6+ goals or the differential is very high.