# Libraries
library(tidyverse)
library(tibble)
library(knitr)
library(kableExtra)

### Importing data
df <- read.csv("C:/Users/matth/OneDrive/Documents/INFO_H510/spi_matches.csv")
### Subsetting to only include the top 5 leagues
df_top_leagues <- df |>
  filter(league %in% c("Barclays Premier League", "French Ligue 1", "Italy Serie A", "Spanish Primera Division", "German Bundesliga"))

Sampling Process

I am looking at trends within the data based on what we see when we take random samples of different data proportions from the dataset. These will be done with replacement, so we can simulate a data collection process of selecting some number or ratio of games across different seasons. This lets us look at how randomness affects any conclusions I may draw from my analysis.

As previously stated, these will be drawn with replacement. At each step, I will draw 4 samples of different fractions of the data, starting with 25%. I will look at goal/goal-related metric trends as well as league and season proportions. These will be compared across samples that take 10%, 25%, and 75% of the data. This will give us a good luck at multiple different sample sizes, leaves the potential for duplicates, and we can explore how sample size affects the summary statistics I will explore.

### Sampling (25%)
set.seed(1427)

# Parameters
sample_frac <- 0.25
n_samples <- 4
sample_size <- floor(sample_frac * nrow(df_top_leagues))

# Empty df for samples
df_samples <- tibble()

# sampling
for (sample_i in 1:n_samples) {
  df_i <- df_top_leagues |>
    sample_n(
      size = sample_size,
      replace = TRUE
    ) |>
    mutate(sample_num = sample_i)
  
  df_samples <- bind_rows(df_samples, df_i)
}

# quick sanity check
df_samples |> count(sample_num)

## # A tibble: 4 × 2
##   sample_num     n
##        <int> <int>
## 1          1  2282
## 2          2  2282
## 3          3  2282
## 4          4  2282

### 10% Sampling
# Parameters
sample_frac <- 0.10
sample_size <- floor(sample_frac * nrow(df_top_leagues))

# Empty dataframe
df_samples_10 <- tibble()

# sampling
for (sample_i in 1:n_samples) {
  df_i <- df_top_leagues |>
    sample_n(
      size = sample_size,
      replace = TRUE
    ) |>
    mutate(sample_num = sample_i)
  
  df_samples_10 <- bind_rows(df_samples_10, df_i)
}

# sanity check
df_samples_10 |> count(sample_num)

## # A tibble: 4 × 2
##   sample_num     n
##        <int> <int>
## 1          1   913
## 2          2   913
## 3          3   913
## 4          4   913

### 10% Sampling
# Parameters
sample_frac <- 0.75
sample_size <- floor(sample_frac * nrow(df_top_leagues))

# Empty dataframe
df_samples_75 <- tibble()

# sampling
for (sample_i in 1:n_samples) {
  df_i <- df_top_leagues |>
    sample_n(
      size = sample_size,
      replace = TRUE
    ) |>
    mutate(sample_num = sample_i)
  
  df_samples_75 <- bind_rows(df_samples_75, df_i)
}

# sanity check
df_samples_75 |> count(sample_num)

## # A tibble: 4 × 2
##   sample_num     n
##        <int> <int>
## 1          1  6847
## 2          2  6847
## 3          3  6847
## 4          4  6847

Here, we take a quick sanity check of the samples to ensure we collected even samples. Across the 10%, 25%, and 75% samples we appear to have even samples in each group.

League and Season Distributions

League and Season Distribution: 25% Sampling

### Check distribution of leagues by sub sample
league_dist <- df_samples |>
  count(sample_num, league) |>
  group_by(sample_num) |>
  mutate(prop = n / sum(n)) |>
  ungroup()

league_dist

## # A tibble: 20 × 4
##    sample_num league                       n  prop
##         <int> <chr>                    <int> <dbl>
##  1          1 Barclays Premier League    477 0.209
##  2          1 French Ligue 1             479 0.210
##  3          1 German Bundesliga          370 0.162
##  4          1 Italy Serie A              483 0.212
##  5          1 Spanish Primera Division   473 0.207
##  6          2 Barclays Premier League    498 0.218
##  7          2 French Ligue 1             462 0.202
##  8          2 German Bundesliga          380 0.167
##  9          2 Italy Serie A              492 0.216
## 10          2 Spanish Primera Division   450 0.197
## 11          3 Barclays Premier League    452 0.198
## 12          3 French Ligue 1             434 0.190
## 13          3 German Bundesliga          424 0.186
## 14          3 Italy Serie A              497 0.218
## 15          3 Spanish Primera Division   475 0.208
## 16          4 Barclays Premier League    454 0.199
## 17          4 French Ligue 1             490 0.215
## 18          4 German Bundesliga          374 0.164
## 19          4 Italy Serie A              478 0.209
## 20          4 Spanish Primera Division   486 0.213

### Distribution plot
ggplot(league_dist, aes(x = league, y = prop, fill = factor(sample_num))) +
  geom_col(position = "dodge") +
  labs(
    title = "League Composition Across Random Subsamples (25%)",
    x = "League",
    y = "Proportion of Matches",
    fill = "Sample"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

On a broad level, the league distribution is pretty similar across the 4 subsamples. There are some small differences, but these can be expected under a random sampling process. The only league that clearly has less matches is the Bundesliga, but there are less Bundesliga games in the full data since they only play 34 matches per season rather than the 38 that the other 4 leagues play. The samples roughly appear to match their actual proportion in the dataset.

Season Distribution: 25% Sampling

### Season Distribution
season_dist <- df_samples |>
  count(sample_num, season) |>
  group_by(sample_num) |>
  mutate(prop = n / sum(n)) |>
  ungroup()

season_dist

## # A tibble: 20 × 4
##    sample_num season     n  prop
##         <int>  <int> <int> <dbl>
##  1          1   2016   451 0.198
##  2          1   2017   494 0.216
##  3          1   2018   454 0.199
##  4          1   2019   426 0.187
##  5          1   2020   457 0.200
##  6          2   2016   464 0.203
##  7          2   2017   458 0.201
##  8          2   2018   434 0.190
##  9          2   2019   464 0.203
## 10          2   2020   462 0.202
## 11          3   2016   465 0.204
## 12          3   2017   456 0.200
## 13          3   2018   430 0.188
## 14          3   2019   448 0.196
## 15          3   2020   483 0.212
## 16          4   2016   441 0.193
## 17          4   2017   472 0.207
## 18          4   2018   470 0.206
## 19          4   2019   450 0.197
## 20          4   2020   449 0.197

ggplot(season_dist, aes(x = season, y = prop, fill = factor(sample_num))) +
  geom_col(position = "dodge") +
  labs(
    title = "Season Composition Across Random Subsamples (25%)",
    x = "Season",
    y = "Proportion of Matches",
    fill = "Sample"
  ) +
  theme_minimal()

Similar to with the leagues sampling, there do not appear to be any seasons with clean over or under-sampled games. The 2017 season seems to be the most sampled of the 5 seasons, with all samples having at least 20% of the games from that season (outside of sample 3 which is at 19.98%), but none have a large oversampling from this season. Otherwise, the proportions within the samples roughly align with the actual proportions of the full dataset (20% for each season).

Leagues: 10% Sampling

league_dist <- df_samples_10 |>
  count(sample_num, league) |>
  group_by(sample_num) |>
  mutate(prop = n / sum(n)) |>
  ungroup()

league_dist

## # A tibble: 20 × 4
##    sample_num league                       n  prop
##         <int> <chr>                    <int> <dbl>
##  1          1 Barclays Premier League    166 0.182
##  2          1 French Ligue 1             190 0.208
##  3          1 German Bundesliga          162 0.177
##  4          1 Italy Serie A              193 0.211
##  5          1 Spanish Primera Division   202 0.221
##  6          2 Barclays Premier League    190 0.208
##  7          2 French Ligue 1             185 0.203
##  8          2 German Bundesliga          142 0.156
##  9          2 Italy Serie A              200 0.219
## 10          2 Spanish Primera Division   196 0.215
## 11          3 Barclays Premier League    200 0.219
## 12          3 French Ligue 1             194 0.212
## 13          3 German Bundesliga          150 0.164
## 14          3 Italy Serie A              177 0.194
## 15          3 Spanish Primera Division   192 0.210
## 16          4 Barclays Premier League    188 0.206
## 17          4 French Ligue 1             169 0.185
## 18          4 German Bundesliga          174 0.191
## 19          4 Italy Serie A              176 0.193
## 20          4 Spanish Primera Division   206 0.226

### Distribution plot
ggplot(league_dist, aes(x = league, y = prop, fill = factor(sample_num))) +
  geom_col(position = "dodge") +
  labs(
    title = "League Composition Across Random Subsamples (10%)",
    x = "League",
    y = "Proportion of Matches",
    fill = "Sample"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

For the 10% sampling, there again don’t appear to be many strong differences in the league proportions. While the differences are a bit larger here than in the 25% sampling, none appear to be too far off the actual proportion. The 3rd sample for the Bundesliga/Premier League are the only 2 that have close to suspect values, with the Bundesliga around 15% and Premier League above 23%, but again these do not appear to be too poor.

Season: 10%

### Season Distribution
season_dist <- df_samples_10 |>
  count(sample_num, season) |>
  group_by(sample_num) |>
  mutate(prop = n / sum(n)) |>
  ungroup()

season_dist

## # A tibble: 20 × 4
##    sample_num season     n  prop
##         <int>  <int> <int> <dbl>
##  1          1   2016   184 0.202
##  2          1   2017   176 0.193
##  3          1   2018   201 0.220
##  4          1   2019   179 0.196
##  5          1   2020   173 0.189
##  6          2   2016   164 0.180
##  7          2   2017   205 0.225
##  8          2   2018   172 0.188
##  9          2   2019   173 0.189
## 10          2   2020   199 0.218
## 11          3   2016   173 0.189
## 12          3   2017   197 0.216
## 13          3   2018   171 0.187
## 14          3   2019   182 0.199
## 15          3   2020   190 0.208
## 16          4   2016   187 0.205
## 17          4   2017   174 0.191
## 18          4   2018   175 0.192
## 19          4   2019   188 0.206
## 20          4   2020   189 0.207

### Distribution plot
ggplot(season_dist, aes(x = season, y = prop, fill = factor(sample_num))) +
  geom_col(position = "dodge") +
  labs(
    title = "Season Composition Across Random Subsamples (10%)",
    x = "Season",
    y = "Proportion of Matches",
    fill = "Sample"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Similar to with leagues here, there are slightly bigger deviations from the expected proportion, but they are not too large. Sample 3 again has 1 potentially “suspect” value, with over 23% of games coming from the 2019 season, but this isn’t too far off the expected 20%.

Leagues: 75%

league_dist <- df_samples_75 |>
  count(sample_num, league) |>
  group_by(sample_num) |>
  mutate(prop = n / sum(n)) |>
  ungroup()

league_dist

## # A tibble: 20 × 4
##    sample_num league                       n  prop
##         <int> <chr>                    <int> <dbl>
##  1          1 Barclays Premier League   1503 0.220
##  2          1 French Ligue 1            1399 0.204
##  3          1 German Bundesliga         1158 0.169
##  4          1 Italy Serie A             1392 0.203
##  5          1 Spanish Primera Division  1395 0.204
##  6          2 Barclays Premier League   1424 0.208
##  7          2 French Ligue 1            1404 0.205
##  8          2 German Bundesliga         1153 0.168
##  9          2 Italy Serie A             1434 0.209
## 10          2 Spanish Primera Division  1432 0.209
## 11          3 Barclays Premier League   1425 0.208
## 12          3 French Ligue 1            1409 0.206
## 13          3 German Bundesliga         1151 0.168
## 14          3 Italy Serie A             1399 0.204
## 15          3 Spanish Primera Division  1463 0.214
## 16          4 Barclays Premier League   1403 0.205
## 17          4 French Ligue 1            1459 0.213
## 18          4 German Bundesliga         1153 0.168
## 19          4 Italy Serie A             1433 0.209
## 20          4 Spanish Primera Division  1399 0.204

### Distribution plot
ggplot(league_dist, aes(x = league, y = prop, fill = factor(sample_num))) +
  geom_col(position = "dodge") +
  labs(
    title = "League Composition Across Random Subsamples (75%)",
    x = "League",
    y = "Proportion of Matches",
    fill = "Sample"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

This sample is the closest to what we would expect out of the 3 sampling groups. However, the 25% sampling did a similarly good job. There are no major deviations from expected for any of the 4 samples or 5 leagues.

Season: 75%

### Season Distribution
season_dist <- df_samples_75 |>
  count(sample_num, season) |>
  group_by(sample_num) |>
  mutate(prop = n / sum(n)) |>
  ungroup()

season_dist

## # A tibble: 20 × 4
##    sample_num season     n  prop
##         <int>  <int> <int> <dbl>
##  1          1   2016  1329 0.194
##  2          1   2017  1374 0.201
##  3          1   2018  1422 0.208
##  4          1   2019  1318 0.192
##  5          1   2020  1404 0.205
##  6          2   2016  1340 0.196
##  7          2   2017  1337 0.195
##  8          2   2018  1370 0.200
##  9          2   2019  1399 0.204
## 10          2   2020  1401 0.205
## 11          3   2016  1404 0.205
## 12          3   2017  1342 0.196
## 13          3   2018  1328 0.194
## 14          3   2019  1383 0.202
## 15          3   2020  1390 0.203
## 16          4   2016  1356 0.198
## 17          4   2017  1362 0.199
## 18          4   2018  1389 0.203
## 19          4   2019  1311 0.191
## 20          4   2020  1429 0.209

### Distribution plot
ggplot(season_dist, aes(x = season, y = prop, fill = factor(sample_num))) +
  geom_col(position = "dodge") +
  labs(
    title = "Season Composition Across Random Subsamples (75%)",
    x = "Season",
    y = "Proportion of Matches",
    fill = "Sample"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Just like with leagues, this sample is the closest to what we would expect out of the 3 sampling groups. However, the 25% sampling did a similarly good job. There are no major deviations from expected for any of the 4 samples or 5 seasons.

Goals Scored

# Adding a "sample size" label and compiling
df_all_samples <- bind_rows(
  df_samples_10 |> mutate(sample_size = "10%"),
  df_samples    |> mutate(sample_size = "25%"),
  df_samples_75 |> mutate(sample_size = "75%")
)

# Computing average home and away goals for each subsample
goal_summary_by_sample <- df_all_samples |>
  group_by(sample_size, sample_num) |>
  summarise(
    home_mean_goals = mean(score1, na.rm = TRUE),
    away_mean_goals = mean(score2, na.rm = TRUE),
    n_matches = n(),
    goal_diff = mean(score1 - score2, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(sample_size, sample_num)


# Computing averages for goals scored and score differential across samples
goal_summary_by_sample |>
  mutate(
    home_mean_goals = round(home_mean_goals, 2),
    away_mean_goals = round(away_mean_goals, 2),
    goal_diff = round(goal_diff, 2)
  ) |>
  
  # Cleanly formatted table
  kable(
    caption = "Average Home vs Away Goals by Subsample and Sample Size",
    col.names = c("Sample Size", "Subsample #", "Home Avg Goals", "Away Avg Goals", "Matches", "Home-Away Diff")
  ) |>
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(0, bold = TRUE)

Average Home vs Away Goals by Subsample and Sample Size
Sample Size	Subsample #	Home Avg Goals	Away Avg Goals	Matches	Home-Away Diff
10%	1	1.51	1.26	913	0.25
10%	2	1.53	1.26	913	0.27
10%	3	1.57	1.16	913	0.40
10%	4	1.56	1.24	913	0.32
25%	1	1.53	1.23	2282	0.30
25%	2	1.57	1.27	2282	0.30
25%	3	1.56	1.23	2282	0.32
25%	4	1.59	1.21	2282	0.38
75%	1	1.54	1.23	6847	0.31
75%	2	1.55	1.26	6847	0.29
75%	3	1.56	1.22	6847	0.34
75%	4	1.54	1.25	6847	0.30

# Reshaping for visualization
df_long <- df_all_samples |>
  select(sample_size, sample_num, score1, score2) |>
  pivot_longer(
    cols = c(score1, score2),
    names_to = "location",
    values_to = "goals"
  ) |>
  mutate(location = ifelse(location == "score1", "Home", "Away"))

# Box and whisker plots
ggplot(df_long, aes(x = location, y = goals, fill = location)) +
  geom_boxplot(alpha = 0.7) +
  facet_grid(sample_size ~ sample_num) +
  labs(
    title = "Goal Distributions by Subsample and Sample Size",
    x = "Team Location",
    y = "Goals Scored"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("Home" = "steelblue", "Away" = "firebrick"))

Much like with the leagues and seasons, there really isn’t a lot of variability for home/away goals scored, or score differential within the 4 samples of each group. The 10% sample size samples had the most variation, with average home goals ranging from 1.52 to 1.58, away goals from 1.19 to 1.25, and score differential from 0.31 to 0.33. Interestingly, the 25% sample size groups had the most variation in score differential. The 4th sample has an average score differential/home advantage of 0.38 goals, while the other 3 samples in the group are 0.3 or 0.32. While not a huge outlier, it is important to note, especially when a majority of soccer matches are decided by 0-2 goals. We must acknowledge the possible influence of sampling variance as a small factor in any trends we notice, especially if we either take a sample of our match dataset or try to apply out 5 season data set to a larger context.

In terms of the medians and boxplot distributions, there are not many differences of note. The 75% sampling seems like it catches more outliers, but has a larger sample size to go with it. We must be vigilant of the presence of outliers in our analyses, especially when we have a few scores where a team scored 6+ goals or the differential is very high.

MK_Week4_DataDive

2026-02-03