library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the E0.csv export into `pl`; column types are guessed by readr.
pl <- read_csv(file = "C:/Users/bfunk/Downloads/E0.csv")
## Rows: 380 Columns: 106
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Div, Date, HomeTeam, AwayTeam, FTR, HTR, Referee
## dbl (98): FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY,...
## time (1): Time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
The dataset separates stats for home and away teams; combining them lets me see the total number of cards each referee gives out, among other variables.
# Add per-match totals by combining the home and away columns.
pl <- pl |>
  mutate(
    TG = FTHG + FTAG, # total goals
    YC = HY + AY,     # total yellow cards
    RC = HR + AR      # total red cards
  )
Looking at the average total goals scored under each referee: how often a referee stops the game, or how physical they let the teams play, can affect the scoreline.
# Average total goals (TG) per match, one row per referee.
pl |>
  group_by(Referee) |>
  summarise(avg_total_goals = mean(TG))
## # A tibble: 23 × 2
## Referee avg_total_goals
## <chr> <dbl>
## 1 A Madley 2.88
## 2 A Marriner 3.23
## 3 A Taylor 2.9
## 4 C Kavanagh 2.69
## 5 C Pawson 2.67
## 6 D Bond 2
## 7 D Coote 2.95
## 8 D England 3.39
## 9 G Scott 2.83
## 10 J Brooks 1.95
## # ℹ 13 more rows
Summary of card data by referee
# Per-referee card averages: mean yellow (YC) and red (RC) cards per match.
# Every referee is kept, regardless of how many matches they have.
cards <- pl |>
  group_by(Referee) |>
  summarise(
    avg_yellow_cards = mean(YC),
    avg_red_cards = mean(RC)
  )
cards
## # A tibble: 23 × 3
## Referee avg_yellow_cards avg_red_cards
## <chr> <dbl> <dbl>
## 1 A Madley 3.46 0.125
## 2 A Marriner 3.54 0.154
## 3 A Taylor 3.77 0.1
## 4 C Kavanagh 3 0
## 5 C Pawson 3.76 0.0476
## 6 D Bond 4 0.25
## 7 D Coote 3.67 0.143
## 8 D England 3.56 0.111
## 9 G Scott 2.17 0.333
## 10 J Brooks 4.26 0.0526
## # ℹ 13 more rows
How many games each referee called
# Number of matches in the data for each referee.
count(pl, Referee)
## # A tibble: 23 × 2
## Referee n
## <chr> <int>
## 1 A Madley 24
## 2 A Marriner 13
## 3 A Taylor 30
## 4 C Kavanagh 13
## 5 C Pawson 21
## 6 D Bond 4
## 7 D Coote 21
## 8 D England 18
## 9 G Scott 6
## 10 J Brooks 19
## # ℹ 13 more rows
Most cards given in a match by ref, along with goals
# Per-referee single-match maxima: yellow cards, red cards, and total goals.
# The summaries are left unnamed, so the columns are labelled by the calls
# themselves (`max(YC)`, `max(RC)`, `max(TG)`).
pl |>
  group_by(Referee) |>
  summarise(
    max(YC),
    max(RC),
    max(TG)
  )
## # A tibble: 23 × 4
## Referee `max(YC)` `max(RC)` `max(TG)`
## <chr> <dbl> <dbl> <dbl>
## 1 A Madley 8 2 7
## 2 A Marriner 8 1 5
## 3 A Taylor 7 1 6
## 4 C Kavanagh 9 0 5
## 5 C Pawson 7 1 7
## 6 D Bond 8 1 3
## 7 D Coote 9 2 7
## 8 D England 8 1 8
## 9 G Scott 5 1 5
## 10 J Brooks 8 1 6
## # ℹ 13 more rows
More general statistics grouped by referee
# Per-referee mean of every numeric column.
# A few columns contain a missing value (see the NA's rows in summary(pl)),
# so NAs are dropped; a plain mean() would return NA for those columns in the
# group that holds the affected match.
pl |>
  group_by(Referee) |>
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
## # A tibble: 23 × 102
## Referee FTHG FTAG HTHG HTAG HS AS HST AST HF AF HC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A Madley 1.67 1.21 0.75 0.417 15.8 9.96 5.38 3.75 10.6 12.1 6.29
## 2 A Marriner 1.62 1.62 0.615 0.692 16.6 10.8 5.85 4.08 9.69 9.46 6.46
## 3 A Taylor 1.23 1.67 0.5 0.833 12.8 11.9 4.03 4.8 9.8 10.0 5.37
## 4 C Kavanagh 1.31 1.38 0.769 0.769 13.9 11.5 5.08 3.85 10.3 10.4 6.77
## 5 C Pawson 1.05 1.62 0.714 0.857 12 11.4 3.81 4.10 11.3 12.6 4.19
## 6 D Bond 1 1 0.75 0.5 13.5 13.8 3.75 3.75 10.8 9 5.75
## 7 D Coote 1.76 1.19 1.05 0.714 13.9 12.2 4.52 4.38 11.2 12.2 5.43
## 8 D England 1.94 1.44 0.833 0.778 14.9 9.78 5.44 3.56 10.2 11 6.83
## 9 G Scott 1.33 1.5 0.833 1 10.2 13.8 3.83 4.83 9.67 10.2 4
## 10 J Brooks 1.26 0.684 0.526 0.316 13.2 9.84 4.05 3.16 10.3 11.8 6.74
## # ℹ 13 more rows
## # ℹ 90 more variables: AC <dbl>, HY <dbl>, AY <dbl>, HR <dbl>, AR <dbl>,
## # B365H <dbl>, B365D <dbl>, B365A <dbl>, BWH <dbl>, BWD <dbl>, BWA <dbl>,
## # IWH <dbl>, IWD <dbl>, IWA <dbl>, PSH <dbl>, PSD <dbl>, PSA <dbl>,
## # WHH <dbl>, WHD <dbl>, WHA <dbl>, VCH <dbl>, VCD <dbl>, VCA <dbl>,
## # MaxH <dbl>, MaxD <dbl>, MaxA <dbl>, AvgH <dbl>, AvgD <dbl>, AvgA <dbl>,
## # `B365>2.5` <dbl>, `B365<2.5` <dbl>, `P>2.5` <dbl>, `P<2.5` <dbl>, …
Comparing now to the average
# Mean of every numeric column across all matches, for comparison with the
# per-referee means. NAs are dropped so that columns with a missing value
# (see the NA's rows in summary(pl)) still get a mean instead of NA.
pl |>
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
## # A tibble: 1 × 101
## FTHG FTAG HTHG HTAG HS AS HST AST HF AF HC AC HY
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.63 1.22 0.758 0.563 14.0 11.3 4.91 3.89 10.6 10.9 5.64 4.47 1.67
## # ℹ 88 more variables: AY <dbl>, HR <dbl>, AR <dbl>, B365H <dbl>, B365D <dbl>,
## # B365A <dbl>, BWH <dbl>, BWD <dbl>, BWA <dbl>, IWH <dbl>, IWD <dbl>,
## # IWA <dbl>, PSH <dbl>, PSD <dbl>, PSA <dbl>, WHH <dbl>, WHD <dbl>,
## # WHA <dbl>, VCH <dbl>, VCD <dbl>, VCA <dbl>, MaxH <dbl>, MaxD <dbl>,
## # MaxA <dbl>, AvgH <dbl>, AvgD <dbl>, AvgA <dbl>, `B365>2.5` <dbl>,
## # `B365<2.5` <dbl>, `P>2.5` <dbl>, `P<2.5` <dbl>, `Max>2.5` <dbl>,
## # `Max<2.5` <dbl>, `Avg>2.5` <dbl>, `Avg<2.5` <dbl>, AHh <dbl>, …
# Column-by-column summary of `pl`, including the derived TG, YC and RC.
pl |> summary()
## Div Date Time
## Length:380 Length:380 Min. :12:00:00.000000
## Class :character Class :character 1st Qu.:15:00:00.000000
## Mode :character Mode :character Median :15:00:00.000000
## Mean :16:12:26.052632
## 3rd Qu.:17:30:00.000000
## Max. :20:15:00.000000
##
## HomeTeam AwayTeam FTHG FTAG
## Length:380 Length:380 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:1.000 1st Qu.:0.000
## Mode :character Mode :character Median :1.000 Median :1.000
## Mean :1.634 Mean :1.218
## 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :9.000 Max. :6.000
##
## FTR HTHG HTAG HTR
## Length:380 Min. :0.0000 Min. :0.0000 Length:380
## Class :character 1st Qu.:0.0000 1st Qu.:0.0000 Class :character
## Mode :character Median :1.0000 Median :0.0000 Mode :character
## Mean :0.7579 Mean :0.5632
## 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :5.0000 Max. :3.0000
##
## Referee HS AS HST
## Length:380 Min. : 1.00 Min. : 1.00 Min. : 0.000
## Class :character 1st Qu.:10.00 1st Qu.: 8.00 1st Qu.: 3.000
## Mode :character Median :14.00 Median :11.00 Median : 5.000
## Mean :13.95 Mean :11.31 Mean : 4.908
## 3rd Qu.:17.00 3rd Qu.:15.00 3rd Qu.: 7.000
## Max. :33.00 Max. :30.00 Max. :15.000
##
## AST HF AF HC
## Min. : 0.000 Min. : 3.0 Min. : 3.00 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 8.0 1st Qu.: 8.00 1st Qu.: 3.000
## Median : 4.000 Median :10.0 Median :11.00 Median : 5.000
## Mean : 3.895 Mean :10.6 Mean :10.93 Mean : 5.637
## 3rd Qu.: 5.000 3rd Qu.:13.0 3rd Qu.:13.00 3rd Qu.: 8.000
## Max. :11.000 Max. :23.0 Max. :24.00 Max. :17.000
##
## AC HY AY HR
## Min. : 0.000 Min. :0.000 Min. :0.000 Min. :0.00000
## 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.00000
## Median : 4.000 Median :2.000 Median :2.000 Median :0.00000
## Mean : 4.471 Mean :1.671 Mean :1.916 Mean :0.04737
## 3rd Qu.: 6.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:0.00000
## Max. :19.000 Max. :6.000 Max. :7.000 Max. :2.00000
##
## AR B365H B365D B365A
## Min. :0.00000 Min. : 1.060 Min. : 3.000 Min. : 1.250
## 1st Qu.:0.00000 1st Qu.: 1.650 1st Qu.: 3.500 1st Qu.: 2.150
## Median :0.00000 Median : 2.250 Median : 3.750 Median : 3.200
## Mean :0.02632 Mean : 2.835 Mean : 4.158 Mean : 4.421
## 3rd Qu.:0.00000 3rd Qu.: 3.300 3rd Qu.: 4.330 3rd Qu.: 5.250
## Max. :1.00000 Max. :12.000 Max. :13.000 Max. :29.000
##
## BWH BWD BWA IWH
## Min. : 1.070 Min. : 3.000 Min. : 1.250 Min. : 1.070
## 1st Qu.: 1.650 1st Qu.: 3.400 1st Qu.: 2.150 1st Qu.: 1.650
## Median : 2.250 Median : 3.700 Median : 3.150 Median : 2.300
## Mean : 2.801 Mean : 4.144 Mean : 4.464 Mean : 2.821
## 3rd Qu.: 3.300 3rd Qu.: 4.348 3rd Qu.: 5.000 3rd Qu.: 3.300
## Max. :12.000 Max. :12.500 Max. :41.000 Max. :12.000
##
## IWD IWA PSH PSD
## Min. : 3.050 Min. : 1.250 Min. : 1.070 Min. : 3.110
## 1st Qu.: 3.450 1st Qu.: 2.200 1st Qu.: 1.680 1st Qu.: 3.550
## Median : 3.700 Median : 3.150 Median : 2.330 Median : 3.835
## Mean : 4.146 Mean : 4.352 Mean : 2.924 Mean : 4.290
## 3rd Qu.: 4.400 3rd Qu.: 5.250 3rd Qu.: 3.430 3rd Qu.: 4.490
## Max. :12.000 Max. :28.000 Max. :12.650 Max. :13.820
##
## PSA WHH WHD WHA
## Min. : 1.240 Min. : 1.050 Min. : 3.000 Min. : 1.240
## 1st Qu.: 2.217 1st Qu.: 1.645 1st Qu.: 3.400 1st Qu.: 2.150
## Median : 3.320 Median : 2.250 Median : 3.600 Median : 3.200
## Mean : 4.662 Mean : 2.844 Mean : 4.036 Mean : 4.534
## 3rd Qu.: 5.380 3rd Qu.: 3.300 3rd Qu.: 4.200 3rd Qu.: 5.062
## Max. :41.000 Max. :13.000 Max. :13.000 Max. :34.000
##
## VCH VCD VCA MaxH
## Min. : 1.060 Min. : 2.900 Min. : 1.220 Min. : 1.090
## 1st Qu.: 1.620 1st Qu.: 3.300 1st Qu.: 2.138 1st Qu.: 1.710
## Median : 2.250 Median : 3.600 Median : 3.165 Median : 2.380
## Mean : 2.859 Mean : 4.021 Mean : 4.626 Mean : 3.019
## 3rd Qu.: 3.300 3rd Qu.: 4.200 3rd Qu.: 5.250 3rd Qu.: 3.520
## Max. :13.000 Max. :11.500 Max. :31.000 Max. :13.500
##
## MaxD MaxA AvgH AvgD
## Min. : 3.230 Min. : 1.270 Min. : 1.070 Min. : 3.080
## 1st Qu.: 3.640 1st Qu.: 2.240 1st Qu.: 1.667 1st Qu.: 3.480
## Median : 3.950 Median : 3.325 Median : 2.300 Median : 3.760
## Mean : 4.454 Mean : 4.999 Mean : 2.872 Mean : 4.216
## 3rd Qu.: 4.662 3rd Qu.: 5.550 3rd Qu.: 3.368 3rd Qu.: 4.423
## Max. :16.000 Max. :50.000 Max. :12.480 Max. :12.840
##
## AvgA B365>2.5 B365<2.5 P>2.5
## Min. : 1.250 Min. :1.220 Min. :1.530 Min. :1.310
## 1st Qu.: 2.178 1st Qu.:1.660 1st Qu.:1.870 1st Qu.:1.680
## Median : 3.195 Median :1.860 Median :2.020 Median :1.860
## Mean : 4.567 Mean :1.833 Mean :2.096 Mean :1.858
## 3rd Qu.: 5.192 3rd Qu.:2.020 3rd Qu.:2.200 3rd Qu.:2.045
## Max. :37.570 Max. :2.500 Max. :4.330 Max. :2.550
## NA's :1
## P<2.5 Max>2.5 Max<2.5 Avg>2.5
## Min. :1.560 Min. :1.270 Min. :1.600 Min. :1.240
## 1st Qu.:1.860 1st Qu.:1.710 1st Qu.:1.900 1st Qu.:1.650
## Median :2.050 Median :1.885 Median :2.080 Median :1.820
## Mean :2.121 Mean :1.889 Mean :2.177 Mean :1.823
## 3rd Qu.:2.295 3rd Qu.:2.070 3rd Qu.:2.350 3rd Qu.:1.992
## Max. :3.460 Max. :2.560 Max. :4.330 Max. :2.470
## NA's :1
## Avg<2.5 AHh B365AHH B365AHA
## Min. :1.540 Min. :-3.0000 Min. :1.700 Min. :1.750
## 1st Qu.:1.830 1st Qu.:-0.7500 1st Qu.:1.880 1st Qu.:1.877
## Median :2.010 Median :-0.2500 Median :1.965 Median :1.940
## Mean :2.093 Mean :-0.2908 Mean :1.956 Mean :1.943
## 3rd Qu.:2.270 3rd Qu.: 0.2500 3rd Qu.:2.030 3rd Qu.:2.020
## Max. :4.070 Max. : 1.7500 Max. :2.160 Max. :2.140
##
## PAHH PAHA MaxAHH MaxAHA
## Min. :1.760 Min. :1.760 Min. :1.810 Min. :1.810
## 1st Qu.:1.890 1st Qu.:1.880 1st Qu.:1.920 1st Qu.:1.910
## Median :1.980 Median :1.930 Median :2.000 Median :1.980
## Mean :1.974 Mean :1.947 Mean :1.998 Mean :1.983
## 3rd Qu.:2.050 3rd Qu.:2.020 3rd Qu.:2.070 3rd Qu.:2.060
## Max. :2.190 Max. :2.190 Max. :2.200 Max. :2.190
##
## AvgAHH AvgAHA B365CH B365CD
## Min. :1.760 Min. :1.760 Min. : 1.080 Min. : 3.000
## 1st Qu.:1.870 1st Qu.:1.860 1st Qu.: 1.620 1st Qu.: 3.500
## Median :1.945 Median :1.920 Median : 2.250 Median : 3.750
## Mean :1.944 Mean :1.927 Mean : 2.818 Mean : 4.127
## 3rd Qu.:2.010 3rd Qu.:2.000 3rd Qu.: 3.300 3rd Qu.: 4.330
## Max. :2.130 Max. :2.130 Max. :13.000 Max. :12.000
##
## B365CA BWCH BWCD BWCA
## Min. : 1.220 Min. : 1.090 Min. : 2.950 Min. : 1.260
## 1st Qu.: 2.200 1st Qu.: 1.630 1st Qu.: 3.400 1st Qu.: 2.150
## Median : 3.150 Median : 2.250 Median : 3.700 Median : 3.150
## Mean : 4.312 Mean : 2.812 Mean : 4.081 Mean : 4.293
## 3rd Qu.: 5.250 3rd Qu.: 3.250 3rd Qu.: 4.330 3rd Qu.: 5.250
## Max. :23.000 Max. :11.500 Max. :12.000 Max. :26.000
##
## IWCH IWCD IWCA PSCH
## Min. : 1.080 Min. : 3.000 Min. : 1.250 Min. : 1.080
## 1st Qu.: 1.650 1st Qu.: 3.400 1st Qu.: 2.200 1st Qu.: 1.660
## Median : 2.300 Median : 3.650 Median : 3.150 Median : 2.325
## Mean : 2.829 Mean : 4.079 Mean : 4.262 Mean : 2.930
## 3rd Qu.: 3.300 3rd Qu.: 4.300 3rd Qu.: 5.250 3rd Qu.: 3.400
## Max. :11.000 Max. :12.000 Max. :24.000 Max. :12.000
##
## PSCD PSCA WHCH WHCD
## Min. : 3.050 Min. : 1.260 Min. : 1.060 Min. : 3.000
## 1st Qu.: 3.510 1st Qu.: 2.265 1st Qu.: 1.630 1st Qu.: 3.300
## Median : 3.815 Median : 3.270 Median : 2.250 Median : 3.600
## Mean : 4.269 Mean : 4.585 Mean : 2.871 Mean : 3.976
## 3rd Qu.: 4.420 3rd Qu.: 5.525 3rd Qu.: 3.325 3rd Qu.: 4.200
## Max. :13.500 Max. :36.000 Max. :12.000 Max. :11.000
##
## WHCA VCCH VCCD VCCA
## Min. : 1.240 Min. : 1.060 Min. : 2.880 Min. : 1.220
## 1st Qu.: 2.150 1st Qu.: 1.600 1st Qu.: 3.300 1st Qu.: 2.188
## Median : 3.200 Median : 2.250 Median : 3.600 Median : 3.200
## Mean : 4.499 Mean : 2.871 Mean : 4.054 Mean : 4.592
## 3rd Qu.: 5.250 3rd Qu.: 3.300 3rd Qu.: 4.200 3rd Qu.: 5.500
## Max. :34.000 Max. :13.000 Max. :13.000 Max. :41.000
##
## MaxCH MaxCD MaxCA AvgCH
## Min. : 1.090 Min. : 3.120 Min. : 1.280 Min. : 1.080
## 1st Qu.: 1.728 1st Qu.: 3.600 1st Qu.: 2.295 1st Qu.: 1.655
## Median : 2.420 Median : 3.915 Median : 3.435 Median : 2.315
## Mean : 3.094 Mean : 4.436 Mean : 4.988 Mean : 2.886
## 3rd Qu.: 3.620 3rd Qu.: 4.612 3rd Qu.: 5.812 3rd Qu.: 3.340
## Max. :14.400 Max. :15.000 Max. :41.000 Max. :12.150
##
## AvgCD AvgCA B365C>2.5 B365C<2.5
## Min. : 3.030 Min. : 1.250 Min. :1.250 Min. :1.440
## 1st Qu.: 3.460 1st Qu.: 2.208 1st Qu.:1.620 1st Qu.:1.857
## Median : 3.750 Median : 3.235 Median :1.895 Median :2.000
## Mean : 4.186 Mean : 4.492 Mean :1.842 Mean :2.103
## 3rd Qu.: 4.362 3rd Qu.: 5.340 3rd Qu.:2.040 3rd Qu.:2.300
## Max. :12.660 Max. :32.000 Max. :2.750 Max. :4.000
##
## PC>2.5 PC<2.5 MaxC>2.5 MaxC<2.5
## Min. :1.330 Min. :1.480 Min. :1.270 Min. :1.520
## 1st Qu.:1.680 1st Qu.:1.860 1st Qu.:1.718 1st Qu.:1.920
## Median :1.890 Median :2.010 Median :1.930 Median :2.100
## Mean :1.873 Mean :2.126 Mean :1.921 Mean :2.223
## 3rd Qu.:2.050 3rd Qu.:2.320 3rd Qu.:2.120 3rd Qu.:2.450
## Max. :2.860 Max. :3.450 Max. :2.870 Max. :4.550
## NA's :1 NA's :1
## AvgC>2.5 AvgC<2.5 AHCh B365CAHH
## Min. :1.240 Min. :1.470 Min. :-2.7500 Min. :1.650
## 1st Qu.:1.640 1st Qu.:1.837 1st Qu.:-0.7500 1st Qu.:1.880
## Median :1.850 Median :1.980 Median :-0.2500 Median :1.960
## Mean :1.837 Mean :2.101 Mean :-0.2816 Mean :1.953
## 3rd Qu.:2.002 3rd Qu.:2.290 3rd Qu.: 0.2500 3rd Qu.:2.040
## Max. :2.690 Max. :4.060 Max. : 1.7500 Max. :2.350
##
## B365CAHA PCAHH PCAHA MaxCAHH
## Min. :1.580 Min. :1.710 Min. :1.610 Min. :1.810
## 1st Qu.:1.870 1st Qu.:1.880 1st Qu.:1.880 1st Qu.:1.930
## Median :1.950 Median :1.960 Median :1.950 Median :2.010
## Mean :1.949 Mean :1.971 Mean :1.958 Mean :2.019
## 3rd Qu.:2.030 3rd Qu.:2.050 3rd Qu.:2.040 3rd Qu.:2.100
## Max. :2.200 Max. :2.480 Max. :2.270 Max. :2.520
##
## MaxCAHA AvgCAHH AvgCAHA TG
## Min. :1.650 Min. :1.760 Min. :1.600 Min. :0.000
## 1st Qu.:1.930 1st Qu.:1.860 1st Qu.:1.860 1st Qu.:2.000
## Median :2.010 Median :1.940 Median :1.930 Median :3.000
## Mean :2.011 Mean :1.943 Mean :1.933 Mean :2.853
## 3rd Qu.:2.090 3rd Qu.:2.020 3rd Qu.:2.010 3rd Qu.:4.000
## Max. :2.290 Max. :2.420 Max. :2.120 Max. :9.000
##
## YC RC
## Min. :0.000 Min. :0.00000
## 1st Qu.:2.000 1st Qu.:0.00000
## Median :3.000 Median :0.00000
## Mean :3.587 Mean :0.07368
## 3rd Qu.:5.000 3rd Qu.:0.00000
## Max. :9.000 Max. :2.00000
##
A couple more standard metrics
# Quartiles (with min and max) of total goals per match.
quantile(pl[["TG"]])
## 0% 25% 50% 75% 100%
## 0 2 3 4 9
# Quartiles (with min and max) of total yellow cards per match.
quantile(pl[["YC"]])
## 0% 25% 50% 75% 100%
## 0 2 3 5 9
# Quartiles (with min and max) of the HS column
# (presumably home-team shots; confirm against the data dictionary).
quantile(pl[["HS"]])
## 0% 25% 50% 75% 100%
## 1 10 14 17 33
# Quartiles (with min and max) of the AS column
# (presumably away-team shots; confirm against the data dictionary).
quantile(pl[["AS"]])
## 0% 25% 50% 75% 100%
## 1 8 11 15 30
# Number of different referees appearing in the data.
n_distinct(pl[["Referee"]])
## [1] 23
Do any referees favor home teams?
# Home and away win rates per referee (share of matches with FTR "H" / "A").
# Only referees with at least 10 matches are kept, and rows are sorted from
# the highest to the lowest home win rate.
win_rate_by_ref <- pl |>
  group_by(Referee) |>
  summarise(
    matches = n(),
    home_win_rate = mean(FTR == "H"),
    away_win_rate = mean(FTR == "A")
  ) |>
  filter(matches >= 10) |>
  arrange(desc(home_win_rate))
win_rate_by_ref
## # A tibble: 16 × 4
## Referee matches home_win_rate away_win_rate
## <chr> <int> <dbl> <dbl>
## 1 S Attwell 25 0.72 0.12
## 2 R Jones 26 0.654 0.269
## 3 P Bankes 21 0.619 0.0476
## 4 M Oliver 30 0.6 0.233
## 5 S Hooper 29 0.552 0.172
## 6 M Salisbury 15 0.533 0.333
## 7 P Tierney 30 0.5 0.3
## 8 J Brooks 19 0.474 0.211
## 9 A Madley 24 0.458 0.292
## 10 D England 18 0.444 0.278
## 11 D Coote 21 0.429 0.333
## 12 A Marriner 13 0.385 0.385
## 13 A Taylor 30 0.333 0.433
## 14 C Pawson 21 0.333 0.476
## 15 C Kavanagh 13 0.308 0.538
## 16 J Gillett 17 0.294 0.412
While comparing to the overall average
# Home and away win rates over all matches, as a baseline for the
# per-referee rates.
pl |>
  summarise(
    home_win_rate = mean(FTR == "H"),
    away_win_rate = mean(FTR == "A")
  )
## # A tibble: 1 × 2
## home_win_rate away_win_rate
## <dbl> <dbl>
## 1 0.484 0.287
# Home win rate per referee for referees with at least 10 matches.
# Same matches/home_win_rate columns as win_rate_by_ref, but without the
# away win rate and without sorting.
ref_home <- pl |>
  group_by(Referee) |>
  summarise(
    matches = n(),
    home_win_rate = mean(FTR == "H")
  ) |>
  filter(matches >= 10)
This graph shows that some referees give the home side a better chance of walking away with three points. Attwell, Jones, Bankes, and Oliver are the most significant names here.
# Horizontal bar chart of home win rate per referee.
# Bars are ordered by home win rate with reorder(): a character x axis is
# otherwise drawn alphabetically, which discards the sort order applied to
# win_rate_by_ref by arrange().
ggplot(
  data = win_rate_by_ref,
  aes(x = reorder(Referee, home_win_rate), y = home_win_rate)
) +
  geom_col() +
  coord_flip() +
  # Fix the rate axis to the full 0-1 range so bar lengths read as shares.
  scale_y_continuous(limits = c(0, 1)) +
  labs(
    title = "Home win rate by referee",
    x = "Referee",
    y = "Home win rate"
  )
Which Referees give out the most red cards? Yellow cards?
# Horizontal bar chart of average red cards per match for every referee.
# Bars are ordered by value with reorder() so the highest and lowest
# referees can be read off directly instead of scanning an alphabetical axis.
# NOTE(review): `cards` keeps referees with very few matches, so some
# averages rest on small samples.
ggplot(
  data = cards,
  aes(x = reorder(Referee, avg_red_cards), y = avg_red_cards)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Red Cards per Match by Referee",
    x = "Referee",
    y = "Red Cards per Match"
  )
I will have to explore Madley’s RC number more. Scott and Bond’s numbers are interesting.
# Horizontal bar chart of average yellow cards per match for every referee.
# Bars are ordered by value with reorder() so the highest and lowest
# referees can be read off directly instead of scanning an alphabetical axis.
ggplot(
  data = cards,
  aes(x = reorder(Referee, avg_yellow_cards), y = avg_yellow_cards)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Yellow Cards per Match by referee",
    x = "Referee",
    y = "Yellow Cards per Match"
  )
# Scatter plot of average yellow vs. average red cards per match,
# one point per referee, coloured by referee.
ggplot(
  cards,
  aes(x = avg_yellow_cards, y = avg_red_cards, color = Referee)
) +
  geom_point() +
  labs(
    title = "Red and Yellow cards",
    x = "Yellow cards",
    y = "Red cards"
  ) +
  theme_classic()
# Average yellow cards shown to the home side (HY) and the away side (AY)
# per referee, plus their difference. Rows without a referee are dropped,
# and only referees with at least 10 matches are kept.
yc_by_ref <- pl |>
  filter(!is.na(Referee)) |>
  group_by(Referee) |>
  summarise(
    matches = n(),
    avg_home_yc = mean(HY),
    avg_away_yc = mean(AY),
    # Positive: more yellows for the home side; negative: more for the away side.
    h_vs_a = avg_home_yc - avg_away_yc
  ) |>
  filter(matches >= 10)
yc_by_ref
## # A tibble: 16 × 5
## Referee matches avg_home_yc avg_away_yc h_vs_a
## <chr> <int> <dbl> <dbl> <dbl>
## 1 A Madley 24 1.42 2.04 -0.625
## 2 A Marriner 13 2.08 1.46 0.615
## 3 A Taylor 30 2 1.77 0.233
## 4 C Kavanagh 13 1 2 -1
## 5 C Pawson 21 2.05 1.71 0.333
## 6 D Coote 21 1.71 1.95 -0.238
## 7 D England 18 1.28 2.28 -1
## 8 J Brooks 19 2 2.26 -0.263
## 9 J Gillett 17 1.47 1.18 0.294
## 10 M Oliver 30 1.73 1.17 0.567
## 11 M Salisbury 15 2 2.33 -0.333
## 12 P Bankes 21 1.67 2.67 -1
## 13 P Tierney 30 1.9 1.9 0
## 14 R Jones 26 1.69 2.35 -0.654
## 15 S Attwell 25 1.28 1.96 -0.68
## 16 S Hooper 29 1.38 2.03 -0.655
Do any referees favor the home team when punishing players? Does crowd influence have an effect on referees?
# Horizontal bar chart of the home-minus-away yellow card difference per
# referee, ordered by that difference.
# h_vs_a is avg_home_yc - avg_away_yc, so bars left of zero (negative) mean
# the away side receives more yellows and bars right of zero mean the home
# side receives more. The labels state that direction explicitly.
ggplot(yc_by_ref, aes(x = reorder(Referee, h_vs_a), y = h_vs_a)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Avg yellow cards: Home minus Away",
    subtitle = "Left of zero: more yellows for away side; right: more for home side",
    x = "Referee",
    y = "Avg Home Yellows - Avg Away Yellows (per match)"
  )
Most referees do end up favoring the home team when giving out cards. Some people say that they succumb to the crowd; others argue that away teams are usually on defense more as a tactical choice, which leads to more fouls and therefore more cards.
Histograms of the home and away betting odds distributions
# Histogram of the Bet365 home-win odds (B365H) across all matches,
# using 10 bins.
ggplot(pl, aes(x = B365H)) +
  geom_histogram(bins = 10) +
  labs(
    title = "Histogram of Bet365 Home Win Odds",
    x = "Decimal odds",
    y = "Match count"
  )
# Histogram of the Bet365 away-win odds (B365A) across all matches,
# using 5 bins.
ggplot(pl, aes(x = B365A)) +
  geom_histogram(bins = 5) +
  labs(
    title = "Histogram of Bet365 Away Win Odds",
    x = "Decimal odds",
    y = "Match count"
  )
The histogram bins were hard to figure out. Most of the options made it unreadable, but you can still see the distribution of odds here.
The box plot showing how betting odds line up with results is a clearer look at the betting data. I will have to think of ways to incorporate this part of the dataset.
# Box plot of Bet365 home-win odds (B365H) split by the match result (FTR),
# with one filled box per result.
ggplot(pl, aes(x = FTR, y = B365H, fill = FTR)) +
  geom_boxplot() +
  labs(
    title = "Distribution of home odds by match result",
    x = "Result (Home win, Draw, or Away win)",
    y = "Bet365 home odds"
  )
.