datadive2

Data dive week 3

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

pl <- read_csv("C:/Users/bfunk/Downloads/E0.csv")

## Rows: 380 Columns: 106
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): Div, Date, HomeTeam, AwayTeam, FTR, HTR, Referee
## dbl  (98): FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY,...
## time  (1): Time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

pl <- pl |>
  mutate(TG = FTHG + FTAG) |>
  mutate(YC = HY + AY)|>
  mutate(RC = HR + AR)

The referee column seemed like an interesting column based on last weeks data dive, I was interested to see how much consistency there was between refs and if they really impact how matches play out. The referee column was my first grouping. I compared the stats that they are most likely to influence

ref_stats <- pl |>
  group_by(Referee) |>
  summarise(matches = n(),
            avg_total_goals = mean(TG),
            avg_yellow_cards = mean(YC),
            avg_red_cards = mean(RC))
ref_stats

## # A tibble: 23 × 5
##    Referee    matches avg_total_goals avg_yellow_cards avg_red_cards
##    <chr>        <int>           <dbl>            <dbl>         <dbl>
##  1 A Madley        24            2.88             3.46        0.125 
##  2 A Marriner      13            3.23             3.54        0.154 
##  3 A Taylor        30            2.9              3.77        0.1   
##  4 C Kavanagh      13            2.69             3           0     
##  5 C Pawson        21            2.67             3.76        0.0476
##  6 D Bond           4            2                4           0.25  
##  7 D Coote         21            2.95             3.67        0.143 
##  8 D England       18            3.39             3.56        0.111 
##  9 G Scott          6            2.83             2.17        0.333 
## 10 J Brooks        19            1.95             4.26        0.0526
## # ℹ 13 more rows

yc_by_ref <- pl |>
  filter(!is.na(Referee)) |>
  group_by(Referee) |>
  summarise(
    matches = n(),
    avg_home_yc = mean(HY),
    avg_away_yc = mean(AY),
    h_vs_a = avg_home_yc - avg_away_yc,
  ) |>
  filter(matches >= 10)
yc_by_ref

## # A tibble: 16 × 5
##    Referee     matches avg_home_yc avg_away_yc h_vs_a
##    <chr>         <int>       <dbl>       <dbl>  <dbl>
##  1 A Madley         24        1.42        2.04 -0.625
##  2 A Marriner       13        2.08        1.46  0.615
##  3 A Taylor         30        2           1.77  0.233
##  4 C Kavanagh       13        1           2    -1    
##  5 C Pawson         21        2.05        1.71  0.333
##  6 D Coote          21        1.71        1.95 -0.238
##  7 D England        18        1.28        2.28 -1    
##  8 J Brooks         19        2           2.26 -0.263
##  9 J Gillett        17        1.47        1.18  0.294
## 10 M Oliver         30        1.73        1.17  0.567
## 11 M Salisbury      15        2           2.33 -0.333
## 12 P Bankes         21        1.67        2.67 -1    
## 13 P Tierney        30        1.9         1.9   0    
## 14 R Jones          26        1.69        2.35 -0.654
## 15 S Attwell        25        1.28        1.96 -0.68 
## 16 S Hooper         29        1.38        2.03 -0.655

This query shows what refs end up favoring the home team

The next two groupings look at how teams preform as both the home and away team seperately

home_stats <- pl |>
  group_by(HomeTeam) |>
  summarise(
    matches = n(),
    home_goals_per_match = mean(FTHG),
    away_goals_allowed_per_match = mean(FTAG),
    home_win_rate = mean(FTR == "H"),
    shots_per_game = mean(HS),
    shots_on_target_per_game = mean(HST),
    on_target_rate = mean(HST / HS))
home_stats

## # A tibble: 20 × 8
##    HomeTeam    matches home_goals_per_match away_goals_allowed_p…¹ home_win_rate
##    <chr>         <int>                <dbl>                  <dbl>         <dbl>
##  1 Arsenal          19                2.79                   1.32          0.737
##  2 Aston Villa      19                1.74                   1.11          0.632
##  3 Bournemouth      19                1.05                   1.47          0.316
##  4 Brentford        19                1.84                   0.947         0.526
##  5 Brighton         19                1.95                   1.11          0.526
##  6 Chelsea          19                1.05                   1             0.316
##  7 Crystal Pa…      19                1.11                   1.21          0.368
##  8 Everton          19                0.842                  1.42          0.316
##  9 Fulham           19                1.63                   1.53          0.421
## 10 Leeds            19                1.37                   1.95          0.263
## 11 Leicester        19                1.21                   1.42          0.263
## 12 Liverpool        19                2.42                   0.895         0.684
## 13 Man City         19                3.16                   0.895         0.895
## 14 Man United       19                1.89                   0.526         0.789
## 15 Newcastle        19                1.89                   0.737         0.579
## 16 Nott'm For…      19                1.42                   1.26          0.421
## 17 Southampton      19                1                      1.95          0.105
## 18 Tottenham        19                1.95                   1.32          0.632
## 19 West Ham         19                1.37                   1.26          0.421
## 20 Wolves           19                1                      1.05          0.474
## # ℹ abbreviated name: ¹away_goals_allowed_per_match
## # ℹ 3 more variables: shots_per_game <dbl>, shots_on_target_per_game <dbl>,
## #   on_target_rate <dbl>

away_stats <- pl |>
  group_by(AwayTeam) |>
  summarise(
    matches = n(),
    away_goals_per_match = mean(FTAG),
    home_goals_allowed_per_match = mean(FTHG),
    home_win_rate = mean(FTR == "A"),
    shots_per_game = mean(AS),
    shots_on_target_per_game = mean(AST),
    on_target_rate = mean(AST / AS))
away_stats

## # A tibble: 20 × 8
##    AwayTeam    matches away_goals_per_match home_goals_allowed_p…¹ home_win_rate
##    <chr>         <int>                <dbl>                  <dbl>         <dbl>
##  1 Arsenal          19                1.84                   0.947        0.632 
##  2 Aston Villa      19                0.947                  1.32         0.316 
##  3 Bournemouth      19                0.895                  2.26         0.263 
##  4 Brentford        19                1.21                   1.47         0.263 
##  5 Brighton         19                1.84                   1.68         0.421 
##  6 Chelsea          19                0.947                  1.47         0.263 
##  7 Crystal Pa…      19                1                      1.37         0.211 
##  8 Everton          19                0.947                  1.58         0.105 
##  9 Fulham           19                1.26                   1.26         0.368 
## 10 Leeds            19                1.16                   2.16         0.105 
## 11 Leicester        19                1.47                   2.16         0.211 
## 12 Liverpool        19                1.53                   1.58         0.316 
## 13 Man City         19                1.79                   0.842        0.579 
## 14 Man United       19                1.16                   1.74         0.421 
## 15 Newcastle        19                1.68                   1            0.421 
## 16 Nott'm For…      19                0.579                  2.32         0.0526
## 17 Southampton      19                0.895                  1.89         0.211 
## 18 Tottenham        19                1.74                   2            0.316 
## 19 West Ham         19                0.842                  1.63         0.158 
## 20 Wolves           19                0.632                  2            0.105 
## # ℹ abbreviated name: ¹home_goals_allowed_per_match
## # ℹ 3 more variables: shots_per_game <dbl>, shots_on_target_per_game <dbl>,
## #   on_target_rate <dbl>

pl2 <- pl |>
  mutate(
    given_home_odds = 1 / AvgH,
    given_draw_odds = 1 / AvgD,
    given_away_odds = 1 / AvgA
  )

The way the odds are presented seem arbitrary or hard to understand. Especially when considering the English use a different system. For this reason I converted the odds into the probability they imply. Then before bucketing them I wanted to look at a histogram showing where most of these odds fall.

pl2 |>
  ggplot(aes(x = given_home_odds)) +
  geom_histogram(bins = 20)

pl2 <- pl2 |>
  mutate(
    given_home_odds_bracket = cut(
      given_home_odds,
      breaks = c(0, 0.2, 0.4, 0.6, 0.8, Inf),
      labels = c(
        "<20%", "20%-40%", "40%-60%", "60%-80%",
        "80%+"
      )
    )
  )

odds1 <- pl2 |>
  group_by(given_home_odds_bracket) |>
  summarise(
    matches = n(),
    home_win_rate = mean(FTR == "H"),
    draw_rate = mean(FTR == "D"),
    away_win_rate = mean(FTR == "A"),
    home_yellows = mean(HY)
  )
odds1

## # A tibble: 5 × 6
##   given_home_odds_bracket matches home_win_rate draw_rate away_win_rate
##   <fct>                     <int>         <dbl>     <dbl>         <dbl>
## 1 <20%                         41         0.220     0.244        0.537 
## 2 20%-40%                     122         0.352     0.246        0.402 
## 3 40%-60%                     122         0.484     0.254        0.262 
## 4 60%-80%                      76         0.763     0.171        0.0658
## 5 80%+                         19         0.789     0.158        0.0526
## # ℹ 1 more variable: home_yellows <dbl>

Results of bucketing

Building a helper to make finding rarest sample easiest

tag_rarest <- function(df){
  df |> mutate(p = matches/sum(matches),
               tag = if_else(p == min(p), "RAREST", ""))
}

goals_by_bracket <- pl |>
  mutate(goal_bracket = cut(TG,
    breaks = c(-Inf, 0, 1, 2, 3, 4, Inf),
    labels = c("0", "1", "2", "3", "4", "5+")
  )) |>
  group_by(goal_bracket) |>
  summarise(matches = n(), avg_yc = mean(YC), avg_rc = mean(RC), .groups="drop")

goals_by_bracket

## # A tibble: 6 × 4
##   goal_bracket matches avg_yc avg_rc
##   <fct>          <int>  <dbl>  <dbl>
## 1 0                 23   4.22 0.0870
## 2 1                 70   4.11 0.0286
## 3 2                 87   3.63 0.0690
## 4 3                 79   3.48 0.139 
## 5 4                 57   3.42 0.0526
## 6 5+                64   3    0.0625

0 goal games are the rarest

ref_stats2 <- tag_rarest(ref_stats)
odds1_2    <- tag_rarest(odds1)
goals2     <- tag_rarest(goals_by_bracket)

ref_stats2 |> filter(tag == "RAREST")

## # A tibble: 2 × 7
##   Referee  matches avg_total_goals avg_yellow_cards avg_red_cards       p tag   
##   <chr>      <int>           <dbl>            <dbl>         <dbl>   <dbl> <chr> 
## 1 J Smith        1               4                3             0 0.00263 RAREST
## 2 R Madley       1               2                6             1 0.00263 RAREST

odds1_2    |> filter(tag == "RAREST")

## # A tibble: 1 × 8
##   given_home_odds_bracket matches home_win_rate draw_rate away_win_rate
##   <fct>                     <int>         <dbl>     <dbl>         <dbl>
## 1 80%+                         19         0.789     0.158        0.0526
## # ℹ 3 more variables: home_yellows <dbl>, p <dbl>, tag <chr>

goals2     |> filter(tag == "RAREST")

## # A tibble: 1 × 6
##   goal_bracket matches avg_yc avg_rc      p tag   
##   <fct>          <int>  <dbl>  <dbl>  <dbl> <chr> 
## 1 0                 23   4.22 0.0870 0.0605 RAREST

Observations -for referee’s getting few games they might be replacement referees or outside their primary league -I would figure that a lower odds bracket would be the rarest for home win odds but 80% odds is pretty extreme -goalless games are unlikely in general because typically one of the two teams will not be satisfied with one point and play riskier towards the end of the games Testable hypothesis -the rarer refs only appear in a short time range (small PL stint) -rarer odd tiers are less likely to draw due to mismatches -high scoring matches are rarer because the shot conversion rate of those matches are unsustainable

ref_stats2 |>
  ggplot(aes(x = reorder(Referee, matches), y = matches)) +
  geom_col() +
  coord_flip() +
  labs(title = "Matches per Referee", x = "Referee", y = "Matches")

Full list of refs and how many matches they’ve refereed

pl2 |>
  count(given_home_odds_bracket, FTR) |>
  ggplot(aes(x = given_home_odds_bracket, y = n, fill = FTR)) +
  geom_col(position = "fill") +
  labs(title = "Match outcome by home win odds",
       x = "Home win odds",
       y = "Proportion of matches") +
  scale_fill_discrete(labels = c(H = "Home win", D = "Draw", A = "Away win"))

This graph shows the relationship between home win odds and results. As expected the results are pretty in line, however slighly lower do to the nature of betting odds.

goals2 |>
  ggplot(aes(x = goal_bracket, y = matches)) +
  geom_col() +
  labs(title = "How matches with x goals",
       x = "Total goals",
       y = "Matches")

Showing the distribution of goals scored in every PL match

ref_ftr <- pl |>
  count(Referee, FTR, name = "n") |>
  arrange(desc(n))
ref_ftr

## # A tibble: 63 × 3
##    Referee   FTR       n
##    <chr>     <chr> <int>
##  1 M Oliver  H        18
##  2 S Attwell H        18
##  3 R Jones   H        17
##  4 S Hooper  H        16
##  5 P Tierney H        15
##  6 A Taylor  A        13
##  7 P Bankes  H        13
##  8 A Madley  H        11
##  9 A Taylor  H        10
## 10 C Pawson  A        10
## # ℹ 53 more rows

ref_ftr_grid <- pl |>
  count(Referee, FTR, name="n") |>
  complete(Referee, FTR, fill = list(n = 0)) 
ref_ftr_grid

## # A tibble: 69 × 3
##    Referee    FTR       n
##    <chr>      <chr> <int>
##  1 A Madley   A         7
##  2 A Madley   D         6
##  3 A Madley   H        11
##  4 A Marriner A         5
##  5 A Marriner D         3
##  6 A Marriner H         5
##  7 A Taylor   A        13
##  8 A Taylor   D         7
##  9 A Taylor   H        10
## 10 C Kavanagh A         7
## # ℹ 59 more rows

Cross grid showing how often each referee gave what result

mc_ref_ftr  <- ref_ftr_grid |> arrange(desc(n))
lc_ref_ftr <- ref_ftr_grid |> filter(n > 0) |> arrange(n) 
mc_ref_ftr

## # A tibble: 69 × 3
##    Referee   FTR       n
##    <chr>     <chr> <int>
##  1 M Oliver  H        18
##  2 S Attwell H        18
##  3 R Jones   H        17
##  4 S Hooper  H        16
##  5 P Tierney H        15
##  6 A Taylor  A        13
##  7 P Bankes  H        13
##  8 A Madley  H        11
##  9 A Taylor  H        10
## 10 C Pawson  A        10
## # ℹ 59 more rows

lc_ref_ftr

## # A tibble: 63 × 3
##    Referee      FTR       n
##    <chr>        <chr> <int>
##  1 D Bond       D         1
##  2 D Bond       H         1
##  3 G Scott      H         1
##  4 J Smith      D         1
##  5 P Bankes     A         1
##  6 R Madley     D         1
##  7 T Harrington A         1
##  8 T Harrington D         1
##  9 C Kavanagh   D         2
## 10 D Bond       A         2
## # ℹ 53 more rows

Here are the least common combinations. Some explained by low matches reffed

ggplot(ref_ftr, aes(x = Referee, y = n, fill = FTR)) +
  geom_col() +
  coord_flip() +
  labs(title = "Referee vs Full Time Result", x = "Referee", y = "Match count") +
  scale_fill_discrete(labels = c(H = "Home win", D = "Draw", A = "Away win"))

A clean visualization showing how each referee’s matches end. A good way to compare the results to see if there are any outliers.

pl3 <- pl2 |>
  mutate(
    Odds1 = given_home_odds, 
    Odds1_bracket = cut(
      Odds1,
      breaks = c(0, 0.2, 0.4, 0.6, 0.8, Inf),
      labels = c("<20%", "20%-40%", "40%-60%", "60%-80%", "80%+"),
      right = TRUE
    )
  )

Odds1_FTR_grid <- pl3 |>
  count(Odds1_bracket, FTR, name = "n") |>
  complete(Odds1_bracket, FTR, fill = list(n = 0)) |>
  mutate(missing = (n == 0))

Odds1_FTR_grid

## # A tibble: 15 × 4
##    Odds1_bracket FTR       n missing
##    <fct>         <chr> <int> <lgl>  
##  1 <20%          A        22 FALSE  
##  2 <20%          D        10 FALSE  
##  3 <20%          H         9 FALSE  
##  4 20%-40%       A        49 FALSE  
##  5 20%-40%       D        30 FALSE  
##  6 20%-40%       H        43 FALSE  
##  7 40%-60%       A        32 FALSE  
##  8 40%-60%       D        31 FALSE  
##  9 40%-60%       H        59 FALSE  
## 10 60%-80%       A         5 FALSE  
## 11 60%-80%       D        13 FALSE  
## 12 60%-80%       H        58 FALSE  
## 13 80%+          A         1 FALSE  
## 14 80%+          D         3 FALSE  
## 15 80%+          H        15 FALSE

ggplot(Odds1_FTR_grid, aes(x = FTR, y = Odds1_bracket, fill = n)) +
  geom_tile() +
  labs(fill = "Matches", title = "Home Win Probability vs FTR",
       x = "Full Time Result", y = "Home win odds")

After bucketing win probabilities and turning the data into categorical I could show the relationship between betting odds and actual results.

datadive2

2026-01-30