Data Dive 4

Brody Funk - Premier League data 2024-25 season

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
pl <- read_csv("C:/Users/bfunk/Downloads/E0.csv")
## Rows: 380 Columns: 106
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): Div, Date, HomeTeam, AwayTeam, FTR, HTR, Referee
## dbl  (98): FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY,...
## time  (1): Time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Using helper. I went with .25 and 5 samples after I realized .2 was too small for my dataset

# change this number, and consider how it affects the sub-sample analysis
sample_frac = 0.25

# number of samples to scrutinize
n_samples = 5

df_samples = tibble()  # empty dataframe to append to

for (sample_i in 1:n_samples) {
  pl_i <- pl |>
    sample_n(size = sample_frac * nrow(pl), replace = TRUE) |>
    mutate(sample_num = sample_i)  # add a column indicating sample number
  
  df_samples = bind_rows(df_samples, pl_i)
}
df_samples
## # A tibble: 475 × 107
##    Div   Date       Time   HomeTeam AwayTeam  FTHG  FTAG FTR    HTHG  HTAG HTR  
##    <chr> <chr>      <time> <chr>    <chr>    <dbl> <dbl> <chr> <dbl> <dbl> <chr>
##  1 E0    17/09/2022 15:00  Newcast… Bournem…     1     1 D         0     0 D    
##  2 E0    11/02/2023 17:30  Bournem… Newcast…     1     1 D         1     1 D    
##  3 E0    22/05/2023 20:00  Newcast… Leicest…     0     0 D         0     0 D    
##  4 E0    22/05/2023 20:00  Newcast… Leicest…     0     0 D         0     0 D    
##  5 E0    26/12/2022 15:00  Everton  Wolves       1     2 A         1     1 D    
##  6 E0    23/04/2023 14:00  Bournem… West Ham     0     4 A         0     3 A    
##  7 E0    28/05/2023 16:30  Crystal… Nott'm …     1     1 D         0     1 A    
##  8 E0    22/04/2023 12:30  Fulham   Leeds        2     1 H         0     0 D    
##  9 E0    01/03/2023 19:45  Arsenal  Everton      4     0 H         2     0 H    
## 10 E0    13/08/2022 15:00  Brighton Newcast…     0     0 D         0     0 D    
## # ℹ 465 more rows
## # ℹ 96 more variables: Referee <chr>, HS <dbl>, AS <dbl>, HST <dbl>, AST <dbl>,
## #   HF <dbl>, AF <dbl>, HC <dbl>, AC <dbl>, HY <dbl>, AY <dbl>, HR <dbl>,
## #   AR <dbl>, B365H <dbl>, B365D <dbl>, B365A <dbl>, BWH <dbl>, BWD <dbl>,
## #   BWA <dbl>, IWH <dbl>, IWD <dbl>, IWA <dbl>, PSH <dbl>, PSD <dbl>,
## #   PSA <dbl>, WHH <dbl>, WHD <dbl>, WHA <dbl>, VCH <dbl>, VCD <dbl>,
## #   VCA <dbl>, MaxH <dbl>, MaxD <dbl>, MaxA <dbl>, AvgH <dbl>, AvgD <dbl>, …

A quick preview showing how all of the separate samples split up by result

df_samples |>
  group_by(sample_num, FTR) |>
  summarise(n = n())
## `summarise()` has grouped output by 'sample_num'. You can override using the
## `.groups` argument.
## # A tibble: 15 × 3
## # Groups:   sample_num [5]
##    sample_num FTR       n
##         <int> <chr> <int>
##  1          1 A        27
##  2          1 D        26
##  3          1 H        42
##  4          2 A        26
##  5          2 D        18
##  6          2 H        51
##  7          3 A        32
##  8          3 D        21
##  9          3 H        42
## 10          4 A        26
## 11          4 D        23
## 12          4 H        46
## 13          5 A        30
## 14          5 D        15
## 15          5 H        50

Next was to add some columns I have been using in previous data dives

s1 <- df_samples |> filter(sample_num == 1)
s1 <- s1 |>
  mutate(TG = FTHG + FTAG) |>
  mutate(YC = HY + AY)|>
  mutate(RC = HR + AR)

Looking at how the first subsection of my sample reflects referee goals, something I have been looking into in previous data dives differs when cut by sample. With much less games to go off of it is hard to say that this tells me anything since there are so few instances of any given ref. In categories like these it is best to look at as much data as possible.

s1 |>
  group_by(Referee) |>
  summarise(avg_total_goals = mean(TG))
## # A tibble: 19 × 2
##    Referee      avg_total_goals
##    <chr>                  <dbl>
##  1 A Madley                1.43
##  2 A Marriner              0   
##  3 A Taylor                2.62
##  4 C Kavanagh              1   
##  5 C Pawson                3.43
##  6 D Coote                 2.8 
##  7 D England               3   
##  8 G Scott                 2.5 
##  9 J Brooks                3.67
## 10 J Gillett               3.67
## 11 M Oliver                4.17
## 12 M Salisbury             3.6 
## 13 P Bankes                1.75
## 14 P Tierney               3.1 
## 15 R Jones                 3   
## 16 S Attwell               3.5 
## 17 S Hooper                2.79
## 18 T Bramall               1.25
## 19 T Harrington            4

Looking at how often each result happen within each sample. 4 has by far the highest win rate and 5 with draws.

ftr2 <- df_samples |>
  group_by(sample_num) |>
  summarise(
    home = mean(FTR == "H"),
    draw = mean(FTR == "D"),
    away = mean(FTR == "A"),
  )

ftr2
## # A tibble: 5 × 4
##   sample_num  home  draw  away
##        <int> <dbl> <dbl> <dbl>
## 1          1 0.442 0.274 0.284
## 2          2 0.537 0.189 0.274
## 3          3 0.442 0.221 0.337
## 4          4 0.484 0.242 0.274
## 5          5 0.526 0.158 0.316

Now comparing these results to the full dataset. The samples do an okay job of representing the result data.

ftr3 <- pl |>
  summarise(
    home = mean(FTR == "H"),
    draw = mean(FTR == "D"),
    away = mean(FTR == "A"),
  )

ftr3
## # A tibble: 1 × 3
##    home  draw  away
##   <dbl> <dbl> <dbl>
## 1 0.484 0.229 0.287

Using sapply to isolate the metrics I can quantize I looked at the average for each column in the first sample I created

s1n <- sapply(s1, is.numeric)
s1m <- colMeans(s1[, s1n])
s1m
##        FTHG        FTAG        HTHG        HTAG          HS          AS 
##  1.57894737  1.23157895  0.78947368  0.54736842 13.37894737 11.04210526 
##         HST         AST          HF          AF          HC          AC 
##  4.81052632  3.80000000 10.78947368 10.83157895  4.98947368  4.55789474 
##          HY          AY          HR          AR       B365H       B365D 
##  1.67368421  1.70526316  0.01052632  0.02105263  2.80094737  4.21063158 
##       B365A         BWH         BWD         BWA         IWH         IWD 
##  4.42652632  2.75568421  4.17821053  4.60473684  2.78757895  4.18105263 
##         IWA         PSH         PSD         PSA         WHH         WHD 
##  4.39484211  2.86778947  4.33989474  4.75915789  2.78610526  4.08094737 
##         WHA         VCH         VCD         VCA        MaxH        MaxD 
##  4.64494737  2.79957895  4.05147368  4.67389474  2.96357895  4.52568421 
##        MaxA        AvgH        AvgD        AvgA    B365>2.5    B365<2.5 
##  5.14800000  2.82452632  4.25315789  4.65936842  1.81368421  2.12600000 
##       P>2.5       P<2.5     Max>2.5     Max<2.5     Avg>2.5     Avg<2.5 
##          NA          NA  1.86789474  2.21094737  1.80400000  2.12368421 
##         AHh     B365AHH     B365AHA        PAHH        PAHA      MaxAHH 
## -0.32105263  1.97221053  1.92747368  1.98526316  1.93347368  2.01294737 
##      MaxAHA      AvgAHH      AvgAHA      B365CH      B365CD      B365CA 
##  1.96684211  1.95863158  1.91157895  2.69136842  4.15294737  4.42336842 
##        BWCH        BWCD        BWCA        IWCH        IWCD        IWCA 
##  2.68873684  4.10557895  4.43694737  2.71368421  4.13052632  4.38926316 
##        PSCH        PSCD        PSCA        WHCH        WHCD        WHCA 
##  2.81715789  4.31715789  4.75589474  2.74042105  4.00031579  4.65736842 
##        VCCH        VCCD        VCCA       MaxCH       MaxCD       MaxCA 
##  2.74831579  4.09168421  4.75705263  2.95642105  4.48568421  5.16936842 
##       AvgCH       AvgCD       AvgCA   B365C>2.5   B365C<2.5      PC>2.5 
##  2.76894737  4.22778947  4.64968421  1.82578947  2.13157895          NA 
##      PC<2.5    MaxC>2.5    MaxC<2.5    AvgC>2.5    AvgC<2.5        AHCh 
##          NA  1.90010526  2.25315789  1.81726316  2.13021053 -0.33684211 
##    B365CAHH    B365CAHA       PCAHH       PCAHA     MaxCAHH     MaxCAHA 
##  1.96810526  1.93094737  1.98810526  1.94021053  2.03400000  1.98757895 
##     AvgCAHH     AvgCAHA  sample_num          TG          YC          RC 
##  1.95915789  1.91757895  1.00000000  2.81052632  3.37894737  0.03157895

for comparative purposes I did the same for the entire dataset

pln <- sapply(pl, is.numeric)
plm <- colMeans(pl[, pln])
plm
##        FTHG        FTAG        HTHG        HTAG          HS          AS 
##  1.63421053  1.21842105  0.75789474  0.56315789 13.95263158 11.31052632 
##         HST         AST          HF          AF          HC          AC 
##  4.90789474  3.89473684 10.59736842 10.93157895  5.63684211  4.47105263 
##          HY          AY          HR          AR       B365H       B365D 
##  1.67105263  1.91578947  0.04736842  0.02631579  2.83521053  4.15794737 
##       B365A         BWH         BWD         BWA         IWH         IWD 
##  4.42071053  2.80060526  4.14352632  4.46447368  2.82068421  4.14644737 
##         IWA         PSH         PSD         PSA         WHH         WHD 
##  4.35234211  2.92400000  4.29002632  4.66205263  2.84428947  4.03636842 
##         WHA         VCH         VCD         VCA        MaxH        MaxD 
##  4.53371053  2.85907895  4.02115789  4.62560526  3.01921053  4.45400000 
##        MaxA        AvgH        AvgD        AvgA    B365>2.5    B365<2.5 
##  4.99947368  2.87210526  4.21557895  4.56705263  1.83300000  2.09557895 
##       P>2.5       P<2.5     Max>2.5     Max<2.5     Avg>2.5     Avg<2.5 
##          NA          NA  1.88865789  2.17663158  1.82252632  2.09294737 
##         AHh     B365AHH     B365AHA        PAHH        PAHA      MaxAHH 
## -0.29078947  1.95610526  1.94315789  1.97378947  1.94684211  1.99750000 
##      MaxAHA      AvgAHH      AvgAHA      B365CH      B365CD      B365CA 
##  1.98257895  1.94350000  1.92721053  2.81818421  4.12660526  4.31226316 
##        BWCH        BWCD        BWCA        IWCH        IWCD        IWCA 
##  2.81247368  4.08128947  4.29257895  2.82931579  4.07855263  4.26173684 
##        PSCH        PSCD        PSCA        WHCH        WHCD        WHCA 
##  2.92986842  4.26939474  4.58534211  2.87078947  3.97626316  4.49910526 
##        VCCH        VCCD        VCCA       MaxCH       MaxCD       MaxCA 
##  2.87136842  4.05381579  4.59234211  3.09418421  4.43594737  4.98826316 
##       AvgCH       AvgCD       AvgCA   B365C>2.5   B365C<2.5      PC>2.5 
##  2.88613158  4.18634211  4.49247368  1.84160526  2.10289474          NA 
##      PC<2.5    MaxC>2.5    MaxC<2.5    AvgC>2.5    AvgC<2.5        AHCh 
##          NA  1.92107895  2.22302632  1.83715789  2.10060526 -0.28157895 
##    B365CAHH    B365CAHA       PCAHH       PCAHA     MaxCAHH     MaxCAHA 
##  1.95344737  1.94873684  1.97118421  1.95763158  2.01928947  2.01060526 
##     AvgCAHH     AvgCAHA 
##  1.94276316  1.93344737

For the next graph I set up the calculations to determine the difference between the sample and the actual data.

pl_cols <- names(pl)[pln]
s1_cols <- names(s1)[s1n]
num_cols <- intersect(pl_cols, s1_cols)
plm <- colMeans(pl[, num_cols])
s1m <- colMeans(s1[, num_cols])
dif_means <- plm - s1m
dif_means
##         FTHG         FTAG         HTHG         HTAG           HS           AS 
##  0.055263158 -0.013157895 -0.031578947  0.015789474  0.573684211  0.268421053 
##          HST          AST           HF           AF           HC           AC 
##  0.097368421  0.094736842 -0.192105263  0.100000000  0.647368421 -0.086842105 
##           HY           AY           HR           AR        B365H        B365D 
## -0.002631579  0.210526316  0.036842105  0.005263158  0.034263158 -0.052684211 
##        B365A          BWH          BWD          BWA          IWH          IWD 
## -0.005815789  0.044921053 -0.034684211 -0.140263158  0.033105263 -0.034605263 
##          IWA          PSH          PSD          PSA          WHH          WHD 
## -0.042500000  0.056210526 -0.049868421 -0.097105263  0.058184211 -0.044578947 
##          WHA          VCH          VCD          VCA         MaxH         MaxD 
## -0.111236842  0.059500000 -0.030315789 -0.048289474  0.055631579 -0.071684211 
##         MaxA         AvgH         AvgD         AvgA     B365>2.5     B365<2.5 
## -0.148526316  0.047578947 -0.037578947 -0.092315789  0.019315789 -0.030421053 
##        P>2.5        P<2.5      Max>2.5      Max<2.5      Avg>2.5      Avg<2.5 
##           NA           NA  0.020763158 -0.034315789  0.018526316 -0.030736842 
##          AHh      B365AHH      B365AHA         PAHH         PAHA       MaxAHH 
##  0.030263158 -0.016105263  0.015684211 -0.011473684  0.013368421 -0.015447368 
##       MaxAHA       AvgAHH       AvgAHA       B365CH       B365CD       B365CA 
##  0.015736842 -0.015131579  0.015631579  0.126815789 -0.026342105 -0.111105263 
##         BWCH         BWCD         BWCA         IWCH         IWCD         IWCA 
##  0.123736842 -0.024289474 -0.144368421  0.115631579 -0.051973684 -0.127526316 
##         PSCH         PSCD         PSCA         WHCH         WHCD         WHCA 
##  0.112710526 -0.047763158 -0.170552632  0.130368421 -0.024052632 -0.158263158 
##         VCCH         VCCD         VCCA        MaxCH        MaxCD        MaxCA 
##  0.123052632 -0.037868421 -0.164710526  0.137763158 -0.049736842 -0.181105263 
##        AvgCH        AvgCD        AvgCA    B365C>2.5    B365C<2.5       PC>2.5 
##  0.117184211 -0.041447368 -0.157210526  0.015815789 -0.028684211           NA 
##       PC<2.5     MaxC>2.5     MaxC<2.5     AvgC>2.5     AvgC<2.5         AHCh 
##           NA  0.020973684 -0.030131579  0.019894737 -0.029605263  0.055263158 
##     B365CAHH     B365CAHA        PCAHH        PCAHA      MaxCAHH      MaxCAHA 
## -0.014657895  0.017789474 -0.016921053  0.017421053 -0.014710526  0.023026316 
##      AvgCAHH      AvgCAHA 
## -0.016394737  0.015868421

Most of the betting data was pretty skewed and made the chart unreadable. This is what I landed on as the best way to display the differences between my sample and my actual data. For the most part everything ended up pretty accurate and close so I would say the sample at a broad leveld ddid a good job of representing my data.

cmp <- tibble(col = names(dif_means), diff = as.numeric(dif_means)) |>
  mutate(abs_diff = abs(diff))|>
  slice(1:17)

ggplot(cmp, aes(x = reorder(col, diff), y = diff)) +
  geom_col() +
  coord_flip() +
  labs(x = "Column", y = "Mean(pl) - Mean(pl sample 1)", title = "Difference between full dataset and sample"
       )

I wanted to see how a smaller sample would look

# change this number, and consider how it affects the sub-sample analysis
sample_frac = 0.1

# number of samples to scrutinize
n_samples = 5

df_samples = tibble()  # empty dataframe to append to

for (sample_i in 1:n_samples) {
  pl_i <- pl |>
    sample_n(size = sample_frac * nrow(pl), replace = TRUE) |>
    mutate(sample_num = sample_i)  # add a column indicating sample number
  
  df_samples = bind_rows(df_samples, pl_i)
}

s2.1 <- df_samples |> filter(sample_num == 1)
s2.1 <- s2.1 |>
  mutate(TG = FTHG + FTAG) |>
  mutate(YC = HY + AY)|>
  mutate(RC = HR + AR)
s2.1 |>
  group_by(Referee) |>
  summarise(avg_total_goals = mean(TG))
## # A tibble: 18 × 2
##    Referee      avg_total_goals
##    <chr>                  <dbl>
##  1 A Madley                2.33
##  2 A Taylor                4   
##  3 C Kavanagh              3.4 
##  4 C Pawson                2   
##  5 D Bond                  2   
##  6 D Coote                 1.67
##  7 D England               2   
##  8 J Brooks                4   
##  9 J Gillett               3.67
## 10 M Oliver                2.67
## 11 M Salisbury             2.75
## 12 P Bankes                3   
## 13 P Tierney               3.5 
## 14 R Jones                 4   
## 15 S Attwell               1   
## 16 S Hooper                3   
## 17 T Harrington            1   
## 18 T Robinson              3

I ran through the same code as the previous sample nothing jumped out for me here

ftr4 <- df_samples |>
  group_by(sample_num) |>
  summarise(
    home = mean(FTR == "H"),
    draw = mean(FTR == "D"),
    away = mean(FTR == "A"),
  )

ftr4
## # A tibble: 5 × 4
##   sample_num  home  draw  away
##        <int> <dbl> <dbl> <dbl>
## 1          1 0.5   0.237 0.263
## 2          2 0.395 0.263 0.342
## 3          3 0.474 0.289 0.237
## 4          4 0.526 0.211 0.263
## 5          5 0.474 0.289 0.237

I also used the same sapply method to see how accurate this new subsample is to the full dataset

s2.1n <- sapply(s2.1, is.numeric)
s2.1m <- colMeans(s2.1[, s2.1n])
s2.1m
##        FTHG        FTAG        HTHG        HTAG          HS          AS 
##  1.57894737  1.13157895  0.60526316  0.65789474 13.92105263 10.23684211 
##         HST         AST          HF          AF          HC          AC 
##  5.23684211  3.92105263 11.23684211 10.89473684  7.02631579  3.97368421 
##          HY          AY          HR          AR       B365H       B365D 
##  1.71052632  1.84210526  0.00000000  0.05263158  2.59210526  3.77236842 
##       B365A         BWH         BWD         BWA         IWH         IWD 
##  3.75447368  2.55842105  3.74026316  3.80052632  2.58815789  3.75657895 
##         IWA         PSH         PSD         PSA         WHH         WHD 
##  3.81394737  2.66736842  3.86289474  3.95184211  2.57947368  3.68815789 
##         WHA         VCH         VCD         VCA        MaxH        MaxD 
##  3.81473684  2.59947368  3.63263158  3.93736842  2.73263158  3.99000000 
##        MaxA        AvgH        AvgD        AvgA    B365>2.5    B365<2.5 
##  4.11105263  2.61842105  3.80921053  3.89421053  1.92078947  1.95078947 
##       P>2.5       P<2.5     Max>2.5     Max<2.5     Avg>2.5     Avg<2.5 
##  1.95184211  1.98105263  1.98473684  2.02078947  1.91263158  1.94947368 
##         AHh     B365AHH     B365AHA        PAHH        PAHA      MaxAHH 
## -0.25000000  1.95473684  1.94394737  1.97552632  1.94789474  1.99684211 
##      MaxAHA      AvgAHH      AvgAHA      B365CH      B365CD      B365CA 
##  1.98500000  1.94157895  1.92947368  2.59342105  3.78315789  3.82710526 
##        BWCH        BWCD        BWCA        IWCH        IWCD        IWCA 
##  2.58210526  3.72447368  3.81894737  2.59315789  3.75526316  3.81973684 
##        PSCH        PSCD        PSCA        WHCH        WHCD        WHCA 
##  2.67131579  3.87131579  4.02105263  2.61578947  3.65657895  3.90710526 
##        VCCH        VCCD        VCCA       MaxCH       MaxCD       MaxCA 
##  2.60500000  3.68526316  4.00710526  2.78868421  3.98157895  4.32526316 
##       AvgCH       AvgCD       AvgCA   B365C>2.5   B365C<2.5      PC>2.5 
##  2.63631579  3.81684211  3.93842105  1.90605263  1.96131579  1.94947368 
##      PC<2.5    MaxC>2.5    MaxC<2.5    AvgC>2.5    AvgC<2.5        AHCh 
##  1.98947368  2.00736842  2.06263158  1.91605263  1.95868421 -0.28289474 
##    B365CAHH    B365CAHA       PCAHH       PCAHA     MaxCAHH     MaxCAHA 
##  1.98000000  1.90631579  2.00421053  1.92447368  2.05631579  1.97315789 
##     AvgCAHH     AvgCAHA  sample_num          TG          YC          RC 
##  1.97894737  1.89763158  1.00000000  2.71052632  3.55263158  0.05263158
s2.1_cols <- names(s2.1)[s2.1n]
num_cols <- intersect(pl_cols, s2.1_cols)
s2.1m <- colMeans(s2.1[, num_cols])
dif2_means <- plm - s2.1m
cmp2 <- tibble(col = names(dif2_means), diff = as.numeric(dif2_means)) |>
  mutate(abs_diff = abs(diff))|>
  slice(1:17)
cmp2 <- tibble(col = names(dif2_means), diff = as.numeric(dif2_means)) |>
  mutate(abs_diff = abs(diff))|>
  slice(1:17)

ggplot(cmp2, aes(x = reorder(col, diff), y = diff)) +
  geom_col() +
  coord_flip() +
  labs(x = "Column", y = "Mean(pl) - Mean(pl sample 2.1)", title = "Difference between full dataset and sample"
       )

As I would expect the averages from the smaller sample are a little more off but I also found it was a decent representation of my data. For that reason I did not feel like I needed to go up and that I am happy with .25 and 5 sub samples.