data dive

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the match data into `pl`.
# NOTE(review): absolute, machine-specific path; this only runs where the file
# exists at exactly this location.
pl <- read_csv("C:/Users/bfunk/Downloads/E0.csv")
## Rows: 380 Columns: 106
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): Div, Date, HomeTeam, AwayTeam, FTR, HTR, Referee
## dbl  (98): FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY,...
## time  (1): Time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

The dataset separates stats for home and away teams; combining them lets me see the total number of cards each referee gives out, among other variables.

# Match-level totals: add the home and away columns together.
#   TG = total goals, YC = total yellow cards, RC = total red cards.
pl <- pl |>
  mutate(
    TG = FTHG + FTAG,
    YC = HY + AY,
    RC = HR + AR
  )

Looking at the average total goals scored per match for each referee: how often a ref stops the game, or how physical they let the players be, can affect the scoreline.

# Mean total goals (TG) per match, one row per referee.
pl |>
  group_by(Referee) |>
  summarise(
    avg_total_goals = mean(TG),
    .groups = "drop"
  )
## # A tibble: 23 × 2
##    Referee    avg_total_goals
##    <chr>                <dbl>
##  1 A Madley              2.88
##  2 A Marriner            3.23
##  3 A Taylor              2.9 
##  4 C Kavanagh            2.69
##  5 C Pawson              2.67
##  6 D Bond                2   
##  7 D Coote               2.95
##  8 D England             3.39
##  9 G Scott               2.83
## 10 J Brooks              1.95
## # ℹ 13 more rows

summary of card data by ref

# Mean yellow and red cards per match for every referee; kept in `cards`
# because the card plots further down read from it.
cards <- pl |>
  group_by(Referee) |>
  summarise(
    avg_yellow_cards = mean(YC),
    avg_red_cards = mean(RC),
    .groups = "drop"
  )
cards
## # A tibble: 23 × 3
##    Referee    avg_yellow_cards avg_red_cards
##    <chr>                 <dbl>         <dbl>
##  1 A Madley               3.46        0.125 
##  2 A Marriner             3.54        0.154 
##  3 A Taylor               3.77        0.1   
##  4 C Kavanagh             3           0     
##  5 C Pawson               3.76        0.0476
##  6 D Bond                 4           0.25  
##  7 D Coote                3.67        0.143 
##  8 D England              3.56        0.111 
##  9 G Scott                2.17        0.333 
## 10 J Brooks               4.26        0.0526
## # ℹ 13 more rows

How many games each referee officiated

# Number of matches (rows) per referee.
count(pl, Referee)
## # A tibble: 23 × 2
##    Referee        n
##    <chr>      <int>
##  1 A Madley      24
##  2 A Marriner    13
##  3 A Taylor      30
##  4 C Kavanagh    13
##  5 C Pawson      21
##  6 D Bond         4
##  7 D Coote       21
##  8 D England     18
##  9 G Scott        6
## 10 J Brooks      19
## # ℹ 13 more rows

Most cards given in a match by ref, along with goals

# Single-match maxima per referee for yellow cards, red cards and total goals.
# The .names template reproduces the default column names: max(YC), max(RC),
# max(TG).
pl |>
  group_by(Referee) |>
  summarise(across(c(YC, RC, TG), max, .names = "max({.col})"))
## # A tibble: 23 × 4
##    Referee    `max(YC)` `max(RC)` `max(TG)`
##    <chr>          <dbl>     <dbl>     <dbl>
##  1 A Madley           8         2         7
##  2 A Marriner         8         1         5
##  3 A Taylor           7         1         6
##  4 C Kavanagh         9         0         5
##  5 C Pawson           7         1         7
##  6 D Bond             8         1         3
##  7 D Coote            9         2         7
##  8 D England          8         1         8
##  9 G Scott            5         1         5
## 10 J Brooks           8         1         6
## # ℹ 13 more rows

More general statistics grouped by referee

# Mean of every numeric column, one row per referee.
# na.rm = TRUE is needed because a few odds columns (`P>2.5`, `P<2.5`,
# `PC>2.5`, `PC<2.5`) each contain a missing value; without it the mean for the
# affected referee would be NA.
pl |>
  group_by(Referee) |>
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
## # A tibble: 23 × 102
##    Referee     FTHG  FTAG  HTHG  HTAG    HS    AS   HST   AST    HF    AF    HC
##    <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 A Madley    1.67 1.21  0.75  0.417  15.8  9.96  5.38  3.75 10.6  12.1   6.29
##  2 A Marriner  1.62 1.62  0.615 0.692  16.6 10.8   5.85  4.08  9.69  9.46  6.46
##  3 A Taylor    1.23 1.67  0.5   0.833  12.8 11.9   4.03  4.8   9.8  10.0   5.37
##  4 C Kavanagh  1.31 1.38  0.769 0.769  13.9 11.5   5.08  3.85 10.3  10.4   6.77
##  5 C Pawson    1.05 1.62  0.714 0.857  12   11.4   3.81  4.10 11.3  12.6   4.19
##  6 D Bond      1    1     0.75  0.5    13.5 13.8   3.75  3.75 10.8   9     5.75
##  7 D Coote     1.76 1.19  1.05  0.714  13.9 12.2   4.52  4.38 11.2  12.2   5.43
##  8 D England   1.94 1.44  0.833 0.778  14.9  9.78  5.44  3.56 10.2  11     6.83
##  9 G Scott     1.33 1.5   0.833 1      10.2 13.8   3.83  4.83  9.67 10.2   4   
## 10 J Brooks    1.26 0.684 0.526 0.316  13.2  9.84  4.05  3.16 10.3  11.8   6.74
## # ℹ 13 more rows
## # ℹ 90 more variables: AC <dbl>, HY <dbl>, AY <dbl>, HR <dbl>, AR <dbl>,
## #   B365H <dbl>, B365D <dbl>, B365A <dbl>, BWH <dbl>, BWD <dbl>, BWA <dbl>,
## #   IWH <dbl>, IWD <dbl>, IWA <dbl>, PSH <dbl>, PSD <dbl>, PSA <dbl>,
## #   WHH <dbl>, WHD <dbl>, WHA <dbl>, VCH <dbl>, VCD <dbl>, VCA <dbl>,
## #   MaxH <dbl>, MaxD <dbl>, MaxA <dbl>, AvgH <dbl>, AvgD <dbl>, AvgA <dbl>,
## #   `B365>2.5` <dbl>, `B365<2.5` <dbl>, `P>2.5` <dbl>, `P<2.5` <dbl>, …

Comparing now to the average

# League-wide mean of every numeric column (the baseline for the per-referee
# table). na.rm = TRUE keeps the odds columns that contain a missing value
# (`P>2.5`, `P<2.5`, `PC>2.5`, `PC<2.5`) from collapsing to NA.
pl |>
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
## # A tibble: 1 × 101
##    FTHG  FTAG  HTHG  HTAG    HS    AS   HST   AST    HF    AF    HC    AC    HY
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  1.63  1.22 0.758 0.563  14.0  11.3  4.91  3.89  10.6  10.9  5.64  4.47  1.67
## # ℹ 88 more variables: AY <dbl>, HR <dbl>, AR <dbl>, B365H <dbl>, B365D <dbl>,
## #   B365A <dbl>, BWH <dbl>, BWD <dbl>, BWA <dbl>, IWH <dbl>, IWD <dbl>,
## #   IWA <dbl>, PSH <dbl>, PSD <dbl>, PSA <dbl>, WHH <dbl>, WHD <dbl>,
## #   WHA <dbl>, VCH <dbl>, VCD <dbl>, VCA <dbl>, MaxH <dbl>, MaxD <dbl>,
## #   MaxA <dbl>, AvgH <dbl>, AvgD <dbl>, AvgA <dbl>, `B365>2.5` <dbl>,
## #   `B365<2.5` <dbl>, `P>2.5` <dbl>, `P<2.5` <dbl>, `Max>2.5` <dbl>,
## #   `Max<2.5` <dbl>, `Avg>2.5` <dbl>, `Avg<2.5` <dbl>, AHh <dbl>, …
# Per-column summary of every column in pl, including the derived TG, YC and RC.
summary(pl)
##      Div                Date                Time                
##  Length:380         Length:380         Min.   :12:00:00.000000  
##  Class :character   Class :character   1st Qu.:15:00:00.000000  
##  Mode  :character   Mode  :character   Median :15:00:00.000000  
##                                        Mean   :16:12:26.052632  
##                                        3rd Qu.:17:30:00.000000  
##                                        Max.   :20:15:00.000000  
##                                                                 
##    HomeTeam           AwayTeam              FTHG            FTAG      
##  Length:380         Length:380         Min.   :0.000   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:1.000   1st Qu.:0.000  
##  Mode  :character   Mode  :character   Median :1.000   Median :1.000  
##                                        Mean   :1.634   Mean   :1.218  
##                                        3rd Qu.:2.000   3rd Qu.:2.000  
##                                        Max.   :9.000   Max.   :6.000  
##                                                                       
##      FTR                 HTHG             HTAG            HTR           
##  Length:380         Min.   :0.0000   Min.   :0.0000   Length:380        
##  Class :character   1st Qu.:0.0000   1st Qu.:0.0000   Class :character  
##  Mode  :character   Median :1.0000   Median :0.0000   Mode  :character  
##                     Mean   :0.7579   Mean   :0.5632                     
##                     3rd Qu.:1.0000   3rd Qu.:1.0000                     
##                     Max.   :5.0000   Max.   :3.0000                     
##                                                                         
##    Referee                HS              AS             HST        
##  Length:380         Min.   : 1.00   Min.   : 1.00   Min.   : 0.000  
##  Class :character   1st Qu.:10.00   1st Qu.: 8.00   1st Qu.: 3.000  
##  Mode  :character   Median :14.00   Median :11.00   Median : 5.000  
##                     Mean   :13.95   Mean   :11.31   Mean   : 4.908  
##                     3rd Qu.:17.00   3rd Qu.:15.00   3rd Qu.: 7.000  
##                     Max.   :33.00   Max.   :30.00   Max.   :15.000  
##                                                                     
##       AST               HF             AF              HC        
##  Min.   : 0.000   Min.   : 3.0   Min.   : 3.00   Min.   : 0.000  
##  1st Qu.: 2.000   1st Qu.: 8.0   1st Qu.: 8.00   1st Qu.: 3.000  
##  Median : 4.000   Median :10.0   Median :11.00   Median : 5.000  
##  Mean   : 3.895   Mean   :10.6   Mean   :10.93   Mean   : 5.637  
##  3rd Qu.: 5.000   3rd Qu.:13.0   3rd Qu.:13.00   3rd Qu.: 8.000  
##  Max.   :11.000   Max.   :23.0   Max.   :24.00   Max.   :17.000  
##                                                                  
##        AC               HY              AY              HR         
##  Min.   : 0.000   Min.   :0.000   Min.   :0.000   Min.   :0.00000  
##  1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.00000  
##  Median : 4.000   Median :2.000   Median :2.000   Median :0.00000  
##  Mean   : 4.471   Mean   :1.671   Mean   :1.916   Mean   :0.04737  
##  3rd Qu.: 6.000   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:0.00000  
##  Max.   :19.000   Max.   :6.000   Max.   :7.000   Max.   :2.00000  
##                                                                    
##        AR              B365H            B365D            B365A       
##  Min.   :0.00000   Min.   : 1.060   Min.   : 3.000   Min.   : 1.250  
##  1st Qu.:0.00000   1st Qu.: 1.650   1st Qu.: 3.500   1st Qu.: 2.150  
##  Median :0.00000   Median : 2.250   Median : 3.750   Median : 3.200  
##  Mean   :0.02632   Mean   : 2.835   Mean   : 4.158   Mean   : 4.421  
##  3rd Qu.:0.00000   3rd Qu.: 3.300   3rd Qu.: 4.330   3rd Qu.: 5.250  
##  Max.   :1.00000   Max.   :12.000   Max.   :13.000   Max.   :29.000  
##                                                                      
##       BWH              BWD              BWA              IWH        
##  Min.   : 1.070   Min.   : 3.000   Min.   : 1.250   Min.   : 1.070  
##  1st Qu.: 1.650   1st Qu.: 3.400   1st Qu.: 2.150   1st Qu.: 1.650  
##  Median : 2.250   Median : 3.700   Median : 3.150   Median : 2.300  
##  Mean   : 2.801   Mean   : 4.144   Mean   : 4.464   Mean   : 2.821  
##  3rd Qu.: 3.300   3rd Qu.: 4.348   3rd Qu.: 5.000   3rd Qu.: 3.300  
##  Max.   :12.000   Max.   :12.500   Max.   :41.000   Max.   :12.000  
##                                                                     
##       IWD              IWA              PSH              PSD        
##  Min.   : 3.050   Min.   : 1.250   Min.   : 1.070   Min.   : 3.110  
##  1st Qu.: 3.450   1st Qu.: 2.200   1st Qu.: 1.680   1st Qu.: 3.550  
##  Median : 3.700   Median : 3.150   Median : 2.330   Median : 3.835  
##  Mean   : 4.146   Mean   : 4.352   Mean   : 2.924   Mean   : 4.290  
##  3rd Qu.: 4.400   3rd Qu.: 5.250   3rd Qu.: 3.430   3rd Qu.: 4.490  
##  Max.   :12.000   Max.   :28.000   Max.   :12.650   Max.   :13.820  
##                                                                     
##       PSA              WHH              WHD              WHA        
##  Min.   : 1.240   Min.   : 1.050   Min.   : 3.000   Min.   : 1.240  
##  1st Qu.: 2.217   1st Qu.: 1.645   1st Qu.: 3.400   1st Qu.: 2.150  
##  Median : 3.320   Median : 2.250   Median : 3.600   Median : 3.200  
##  Mean   : 4.662   Mean   : 2.844   Mean   : 4.036   Mean   : 4.534  
##  3rd Qu.: 5.380   3rd Qu.: 3.300   3rd Qu.: 4.200   3rd Qu.: 5.062  
##  Max.   :41.000   Max.   :13.000   Max.   :13.000   Max.   :34.000  
##                                                                     
##       VCH              VCD              VCA              MaxH       
##  Min.   : 1.060   Min.   : 2.900   Min.   : 1.220   Min.   : 1.090  
##  1st Qu.: 1.620   1st Qu.: 3.300   1st Qu.: 2.138   1st Qu.: 1.710  
##  Median : 2.250   Median : 3.600   Median : 3.165   Median : 2.380  
##  Mean   : 2.859   Mean   : 4.021   Mean   : 4.626   Mean   : 3.019  
##  3rd Qu.: 3.300   3rd Qu.: 4.200   3rd Qu.: 5.250   3rd Qu.: 3.520  
##  Max.   :13.000   Max.   :11.500   Max.   :31.000   Max.   :13.500  
##                                                                     
##       MaxD             MaxA             AvgH             AvgD       
##  Min.   : 3.230   Min.   : 1.270   Min.   : 1.070   Min.   : 3.080  
##  1st Qu.: 3.640   1st Qu.: 2.240   1st Qu.: 1.667   1st Qu.: 3.480  
##  Median : 3.950   Median : 3.325   Median : 2.300   Median : 3.760  
##  Mean   : 4.454   Mean   : 4.999   Mean   : 2.872   Mean   : 4.216  
##  3rd Qu.: 4.662   3rd Qu.: 5.550   3rd Qu.: 3.368   3rd Qu.: 4.423  
##  Max.   :16.000   Max.   :50.000   Max.   :12.480   Max.   :12.840  
##                                                                     
##       AvgA           B365>2.5        B365<2.5         P>2.5      
##  Min.   : 1.250   Min.   :1.220   Min.   :1.530   Min.   :1.310  
##  1st Qu.: 2.178   1st Qu.:1.660   1st Qu.:1.870   1st Qu.:1.680  
##  Median : 3.195   Median :1.860   Median :2.020   Median :1.860  
##  Mean   : 4.567   Mean   :1.833   Mean   :2.096   Mean   :1.858  
##  3rd Qu.: 5.192   3rd Qu.:2.020   3rd Qu.:2.200   3rd Qu.:2.045  
##  Max.   :37.570   Max.   :2.500   Max.   :4.330   Max.   :2.550  
##                                                   NA's   :1      
##      P<2.5          Max>2.5         Max<2.5         Avg>2.5     
##  Min.   :1.560   Min.   :1.270   Min.   :1.600   Min.   :1.240  
##  1st Qu.:1.860   1st Qu.:1.710   1st Qu.:1.900   1st Qu.:1.650  
##  Median :2.050   Median :1.885   Median :2.080   Median :1.820  
##  Mean   :2.121   Mean   :1.889   Mean   :2.177   Mean   :1.823  
##  3rd Qu.:2.295   3rd Qu.:2.070   3rd Qu.:2.350   3rd Qu.:1.992  
##  Max.   :3.460   Max.   :2.560   Max.   :4.330   Max.   :2.470  
##  NA's   :1                                                      
##     Avg<2.5           AHh             B365AHH         B365AHA     
##  Min.   :1.540   Min.   :-3.0000   Min.   :1.700   Min.   :1.750  
##  1st Qu.:1.830   1st Qu.:-0.7500   1st Qu.:1.880   1st Qu.:1.877  
##  Median :2.010   Median :-0.2500   Median :1.965   Median :1.940  
##  Mean   :2.093   Mean   :-0.2908   Mean   :1.956   Mean   :1.943  
##  3rd Qu.:2.270   3rd Qu.: 0.2500   3rd Qu.:2.030   3rd Qu.:2.020  
##  Max.   :4.070   Max.   : 1.7500   Max.   :2.160   Max.   :2.140  
##                                                                   
##       PAHH            PAHA           MaxAHH          MaxAHA     
##  Min.   :1.760   Min.   :1.760   Min.   :1.810   Min.   :1.810  
##  1st Qu.:1.890   1st Qu.:1.880   1st Qu.:1.920   1st Qu.:1.910  
##  Median :1.980   Median :1.930   Median :2.000   Median :1.980  
##  Mean   :1.974   Mean   :1.947   Mean   :1.998   Mean   :1.983  
##  3rd Qu.:2.050   3rd Qu.:2.020   3rd Qu.:2.070   3rd Qu.:2.060  
##  Max.   :2.190   Max.   :2.190   Max.   :2.200   Max.   :2.190  
##                                                                 
##      AvgAHH          AvgAHA          B365CH           B365CD      
##  Min.   :1.760   Min.   :1.760   Min.   : 1.080   Min.   : 3.000  
##  1st Qu.:1.870   1st Qu.:1.860   1st Qu.: 1.620   1st Qu.: 3.500  
##  Median :1.945   Median :1.920   Median : 2.250   Median : 3.750  
##  Mean   :1.944   Mean   :1.927   Mean   : 2.818   Mean   : 4.127  
##  3rd Qu.:2.010   3rd Qu.:2.000   3rd Qu.: 3.300   3rd Qu.: 4.330  
##  Max.   :2.130   Max.   :2.130   Max.   :13.000   Max.   :12.000  
##                                                                   
##      B365CA            BWCH             BWCD             BWCA       
##  Min.   : 1.220   Min.   : 1.090   Min.   : 2.950   Min.   : 1.260  
##  1st Qu.: 2.200   1st Qu.: 1.630   1st Qu.: 3.400   1st Qu.: 2.150  
##  Median : 3.150   Median : 2.250   Median : 3.700   Median : 3.150  
##  Mean   : 4.312   Mean   : 2.812   Mean   : 4.081   Mean   : 4.293  
##  3rd Qu.: 5.250   3rd Qu.: 3.250   3rd Qu.: 4.330   3rd Qu.: 5.250  
##  Max.   :23.000   Max.   :11.500   Max.   :12.000   Max.   :26.000  
##                                                                     
##       IWCH             IWCD             IWCA             PSCH       
##  Min.   : 1.080   Min.   : 3.000   Min.   : 1.250   Min.   : 1.080  
##  1st Qu.: 1.650   1st Qu.: 3.400   1st Qu.: 2.200   1st Qu.: 1.660  
##  Median : 2.300   Median : 3.650   Median : 3.150   Median : 2.325  
##  Mean   : 2.829   Mean   : 4.079   Mean   : 4.262   Mean   : 2.930  
##  3rd Qu.: 3.300   3rd Qu.: 4.300   3rd Qu.: 5.250   3rd Qu.: 3.400  
##  Max.   :11.000   Max.   :12.000   Max.   :24.000   Max.   :12.000  
##                                                                     
##       PSCD             PSCA             WHCH             WHCD       
##  Min.   : 3.050   Min.   : 1.260   Min.   : 1.060   Min.   : 3.000  
##  1st Qu.: 3.510   1st Qu.: 2.265   1st Qu.: 1.630   1st Qu.: 3.300  
##  Median : 3.815   Median : 3.270   Median : 2.250   Median : 3.600  
##  Mean   : 4.269   Mean   : 4.585   Mean   : 2.871   Mean   : 3.976  
##  3rd Qu.: 4.420   3rd Qu.: 5.525   3rd Qu.: 3.325   3rd Qu.: 4.200  
##  Max.   :13.500   Max.   :36.000   Max.   :12.000   Max.   :11.000  
##                                                                     
##       WHCA             VCCH             VCCD             VCCA       
##  Min.   : 1.240   Min.   : 1.060   Min.   : 2.880   Min.   : 1.220  
##  1st Qu.: 2.150   1st Qu.: 1.600   1st Qu.: 3.300   1st Qu.: 2.188  
##  Median : 3.200   Median : 2.250   Median : 3.600   Median : 3.200  
##  Mean   : 4.499   Mean   : 2.871   Mean   : 4.054   Mean   : 4.592  
##  3rd Qu.: 5.250   3rd Qu.: 3.300   3rd Qu.: 4.200   3rd Qu.: 5.500  
##  Max.   :34.000   Max.   :13.000   Max.   :13.000   Max.   :41.000  
##                                                                     
##      MaxCH            MaxCD            MaxCA            AvgCH       
##  Min.   : 1.090   Min.   : 3.120   Min.   : 1.280   Min.   : 1.080  
##  1st Qu.: 1.728   1st Qu.: 3.600   1st Qu.: 2.295   1st Qu.: 1.655  
##  Median : 2.420   Median : 3.915   Median : 3.435   Median : 2.315  
##  Mean   : 3.094   Mean   : 4.436   Mean   : 4.988   Mean   : 2.886  
##  3rd Qu.: 3.620   3rd Qu.: 4.612   3rd Qu.: 5.812   3rd Qu.: 3.340  
##  Max.   :14.400   Max.   :15.000   Max.   :41.000   Max.   :12.150  
##                                                                     
##      AvgCD            AvgCA          B365C>2.5       B365C<2.5    
##  Min.   : 3.030   Min.   : 1.250   Min.   :1.250   Min.   :1.440  
##  1st Qu.: 3.460   1st Qu.: 2.208   1st Qu.:1.620   1st Qu.:1.857  
##  Median : 3.750   Median : 3.235   Median :1.895   Median :2.000  
##  Mean   : 4.186   Mean   : 4.492   Mean   :1.842   Mean   :2.103  
##  3rd Qu.: 4.362   3rd Qu.: 5.340   3rd Qu.:2.040   3rd Qu.:2.300  
##  Max.   :12.660   Max.   :32.000   Max.   :2.750   Max.   :4.000  
##                                                                   
##      PC>2.5          PC<2.5         MaxC>2.5        MaxC<2.5    
##  Min.   :1.330   Min.   :1.480   Min.   :1.270   Min.   :1.520  
##  1st Qu.:1.680   1st Qu.:1.860   1st Qu.:1.718   1st Qu.:1.920  
##  Median :1.890   Median :2.010   Median :1.930   Median :2.100  
##  Mean   :1.873   Mean   :2.126   Mean   :1.921   Mean   :2.223  
##  3rd Qu.:2.050   3rd Qu.:2.320   3rd Qu.:2.120   3rd Qu.:2.450  
##  Max.   :2.860   Max.   :3.450   Max.   :2.870   Max.   :4.550  
##  NA's   :1       NA's   :1                                      
##     AvgC>2.5        AvgC<2.5          AHCh            B365CAHH    
##  Min.   :1.240   Min.   :1.470   Min.   :-2.7500   Min.   :1.650  
##  1st Qu.:1.640   1st Qu.:1.837   1st Qu.:-0.7500   1st Qu.:1.880  
##  Median :1.850   Median :1.980   Median :-0.2500   Median :1.960  
##  Mean   :1.837   Mean   :2.101   Mean   :-0.2816   Mean   :1.953  
##  3rd Qu.:2.002   3rd Qu.:2.290   3rd Qu.: 0.2500   3rd Qu.:2.040  
##  Max.   :2.690   Max.   :4.060   Max.   : 1.7500   Max.   :2.350  
##                                                                   
##     B365CAHA         PCAHH           PCAHA          MaxCAHH     
##  Min.   :1.580   Min.   :1.710   Min.   :1.610   Min.   :1.810  
##  1st Qu.:1.870   1st Qu.:1.880   1st Qu.:1.880   1st Qu.:1.930  
##  Median :1.950   Median :1.960   Median :1.950   Median :2.010  
##  Mean   :1.949   Mean   :1.971   Mean   :1.958   Mean   :2.019  
##  3rd Qu.:2.030   3rd Qu.:2.050   3rd Qu.:2.040   3rd Qu.:2.100  
##  Max.   :2.200   Max.   :2.480   Max.   :2.270   Max.   :2.520  
##                                                                 
##     MaxCAHA         AvgCAHH         AvgCAHA            TG       
##  Min.   :1.650   Min.   :1.760   Min.   :1.600   Min.   :0.000  
##  1st Qu.:1.930   1st Qu.:1.860   1st Qu.:1.860   1st Qu.:2.000  
##  Median :2.010   Median :1.940   Median :1.930   Median :3.000  
##  Mean   :2.011   Mean   :1.943   Mean   :1.933   Mean   :2.853  
##  3rd Qu.:2.090   3rd Qu.:2.020   3rd Qu.:2.010   3rd Qu.:4.000  
##  Max.   :2.290   Max.   :2.420   Max.   :2.120   Max.   :9.000  
##                                                                 
##        YC              RC         
##  Min.   :0.000   Min.   :0.00000  
##  1st Qu.:2.000   1st Qu.:0.00000  
##  Median :3.000   Median :0.00000  
##  Mean   :3.587   Mean   :0.07368  
##  3rd Qu.:5.000   3rd Qu.:0.00000  
##  Max.   :9.000   Max.   :2.00000  
## 

A couple more standard metrics

# Default quantiles (0%, 25%, 50%, 75%, 100%) of total goals per match.
quantile(pl$TG)
##   0%  25%  50%  75% 100% 
##    0    2    3    4    9
# Same quantiles for total yellow cards per match.
quantile(pl$YC)
##   0%  25%  50%  75% 100% 
##    0    2    3    5    9
# Same quantiles for the HS column.
quantile(pl$HS)
##   0%  25%  50%  75% 100% 
##    1   10   14   17   33
# Same quantiles for the AS column.
quantile(pl$AS)
##   0%  25%  50%  75% 100% 
##    1    8   11   15   30
# Number of distinct referees in the data.
n_distinct(pl$Referee)
## [1] 23

Do any referees favor home teams?

# Share of matches won by the home / away side under each referee.
# Referees with fewer than 10 matches are dropped; rows are sorted from the
# highest home win rate down.
win_rate_by_ref <- pl |>
  group_by(Referee) |>
  summarise(
    matches = n(),
    home_win_rate = mean(FTR == "H"),
    away_win_rate = mean(FTR == "A"),
    .groups = "drop"
  ) |>
  filter(matches >= 10) |>
  arrange(desc(home_win_rate))
win_rate_by_ref
## # A tibble: 16 × 4
##    Referee     matches home_win_rate away_win_rate
##    <chr>         <int>         <dbl>         <dbl>
##  1 S Attwell        25         0.72         0.12  
##  2 R Jones          26         0.654        0.269 
##  3 P Bankes         21         0.619        0.0476
##  4 M Oliver         30         0.6          0.233 
##  5 S Hooper         29         0.552        0.172 
##  6 M Salisbury      15         0.533        0.333 
##  7 P Tierney        30         0.5          0.3   
##  8 J Brooks         19         0.474        0.211 
##  9 A Madley         24         0.458        0.292 
## 10 D England        18         0.444        0.278 
## 11 D Coote          21         0.429        0.333 
## 12 A Marriner       13         0.385        0.385 
## 13 A Taylor         30         0.333        0.433 
## 14 C Pawson         21         0.333        0.476 
## 15 C Kavanagh       13         0.308        0.538 
## 16 J Gillett        17         0.294        0.412

While comparing to the overall average

# League-wide home and away win rates across all matches.
summarise(
  pl,
  home_win_rate = mean(FTR == "H"),
  away_win_rate = mean(FTR == "A")
)
## # A tibble: 1 × 2
##   home_win_rate away_win_rate
##           <dbl>         <dbl>
## 1         0.484         0.287
# Home win rate per referee, limited to referees with at least 10 matches.
ref_home <- pl |>
  group_by(Referee) |>
  summarise(
    matches = n(),
    home_win_rate = mean(FTR == "H"),
    .groups = "drop"
  ) |>
  filter(matches >= 10)

This graph shows that some referees give the home side a better chance of walking away with three points. Attwell, Jones, Bankes, and Oliver are the most notable names here.

# Horizontal bar chart of home win rate per referee (referees with >= 10
# matches). ggplot orders a character axis alphabetically regardless of the
# row order of the data, so reorder() is used to sort the bars by the value
# being plotted; after coord_flip() the highest rate sits at the top.
ggplot(
  data = win_rate_by_ref,
  aes(x = reorder(Referee, home_win_rate), y = home_win_rate)
) +
  geom_col() +
  coord_flip() +
  # Fix the axis to the full 0-1 range so bars read as proportions.
  scale_y_continuous(limits = c(0, 1)) +
  labs(
    title = "Home win rate by referee",
    x = "Referee",
    y = "Home win rate"
  )

Which Referees give out the most red cards? Yellow cards?

# Horizontal bar chart of average red cards per match for each referee.
# Bars are sorted by the plotted value (reorder) rather than alphabetically,
# so the referees who show the most reds appear at the top after the flip.
ggplot(
  data = cards,
  aes(x = reorder(Referee, avg_red_cards), y = avg_red_cards)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Red Cards per Match by Referee",
    x = "Referee",
    y = "Red Cards per Match"
  )

I will have to explore Madley’s RC number more. Scott and Bond’s numbers are interesting.

# Horizontal bar chart of average yellow cards per match for each referee.
# Bars are sorted by the plotted value (reorder) rather than alphabetically,
# so the referees who show the most yellows appear at the top after the flip.
ggplot(
  data = cards,
  aes(x = reorder(Referee, avg_yellow_cards), y = avg_yellow_cards)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Yellow Cards per Match by referee",
    x = "Referee",
    y = "Yellow Cards per Match"
  )

# Scatter of average yellow vs. average red cards per match, one point per
# referee, coloured by referee.
ggplot(cards, aes(x = avg_yellow_cards, y = avg_red_cards)) +
  geom_point(aes(color = Referee)) +
  labs(
    title = "Red and Yellow cards",
    x = "Yellow cards",
    y = "Red cards"
  ) +
  theme_classic()

# Average yellow cards shown to the home and away sides under each referee,
# plus the home-minus-away gap (h_vs_a). Rows with a missing referee are
# removed first, and referees with fewer than 10 matches are dropped.
yc_by_ref <- pl |>
  filter(!is.na(Referee)) |>
  group_by(Referee) |>
  summarise(
    matches = n(),
    avg_home_yc = mean(HY),
    avg_away_yc = mean(AY),
    .groups = "drop"
  ) |>
  mutate(h_vs_a = avg_home_yc - avg_away_yc) |>
  filter(matches >= 10)
yc_by_ref
## # A tibble: 16 × 5
##    Referee     matches avg_home_yc avg_away_yc h_vs_a
##    <chr>         <int>       <dbl>       <dbl>  <dbl>
##  1 A Madley         24        1.42        2.04 -0.625
##  2 A Marriner       13        2.08        1.46  0.615
##  3 A Taylor         30        2           1.77  0.233
##  4 C Kavanagh       13        1           2    -1    
##  5 C Pawson         21        2.05        1.71  0.333
##  6 D Coote          21        1.71        1.95 -0.238
##  7 D England        18        1.28        2.28 -1    
##  8 J Brooks         19        2           2.26 -0.263
##  9 J Gillett        17        1.47        1.18  0.294
## 10 M Oliver         30        1.73        1.17  0.567
## 11 M Salisbury      15        2           2.33 -0.333
## 12 P Bankes         21        1.67        2.67 -1    
## 13 P Tierney        30        1.9         1.9   0    
## 14 R Jones          26        1.69        2.35 -0.654
## 15 S Attwell        25        1.28        1.96 -0.68 
## 16 S Hooper         29        1.38        2.03 -0.655

Do any referees favor the home team when punishing players? Does crowd influence have an effect on referees?

# Home-minus-away yellow-card gap per referee, bars sorted by the gap.
# h_vs_a = avg home yellows - avg away yellows, so after coord_flip() bars to
# the left (negative) mean the away side is booked more, and bars to the right
# (positive) mean the home side is booked more. The original title stated the
# direction the other way round; it is corrected and moved to a subtitle.
ggplot(yc_by_ref, aes(x = reorder(Referee, h_vs_a), y = h_vs_a)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Avg yellow cards: Home minus Away",
    subtitle = "Left: more yellows for away side; right: more for home side",
    x = "Referee",
    y = "Avg Home Yellows - Avg Away Yellows (per match)"
  )

Most referees do end up favoring the home team when giving out cards. Some people say that they succumb to the crowd; others may argue that away teams are usually on defense more as a tactical choice, which leads to more fouls and, in turn, more cards.

Histograms of the home and away betting odds distributions

# Distribution of the B365H column across matches, drawn with 10 bins.
ggplot(pl, aes(x = B365H)) +
  geom_histogram(bins = 10) +
  labs(
    x = "Decimal odds",
    y = "Match count",
    title = "Histogram of Bet365 Home Win Odds"
  )

# Distribution of the B365A column across matches, drawn with 5 bins.
ggplot(pl, aes(x = B365A)) +
  geom_histogram(bins = 5) +
  labs(
    x = "Decimal odds",
    y = "Match count",
    title = "Histogram of Bet365 Away Win Odds"
  )

The histogram bins were hard to figure out. Most of the options made it unreadable, but you can still see the distribution of odds here.

The box plot showing how betting odds line up with results is a clearer look at the betting data. I will have to think of ways to incorporate this part of the dataset.

# Box plot of the B365H odds split by full-time result (FTR), one filled box
# per result.
ggplot(pl, aes(x = FTR, y = B365H, fill = FTR)) +
  geom_boxplot() +
  labs(
    title = "Distribution of home odds by match result",
    x = "Result (Home win, Draw, or Away win)",
    y = "Bet365 home odds"
  )

.