We are going to analyse various data points for the England Championship 19/20 season, both to gain insight into how teams performed on different metrics and to provide a building block for a Championship model to use in the 20/21 season, which starts on 12 September 2020.
*Let's load our first dataset
# Read the raw match file; its first row supplies the column names.
data <- read.csv(file = "champ.csv", header = TRUE)
# Preview the first ten rows.
head(data, n = 10)
## Div Date HomeTeam AwayTeam FTHG FTAG FTR HTHG HTAG HTR
## 1 E1 03/08/2018 Reading Derby 1 2 A 0 0 D
## 2 E1 04/08/2018 Birmingham Norwich 2 2 D 0 0 D
## 3 E1 04/08/2018 Brentford Rotherham 5 1 H 2 0 H
## 4 E1 04/08/2018 Bristol City Nott'm Forest 1 1 D 1 0 H
## 5 E1 04/08/2018 Ipswich Blackburn 2 2 D 1 2 A
## 6 E1 04/08/2018 Millwall Middlesbrough 2 2 D 2 0 H
## 7 E1 04/08/2018 Preston QPR 1 0 H 0 0 D
## 8 E1 04/08/2018 Sheffield United Swansea 1 2 A 0 0 D
## 9 E1 04/08/2018 West Brom Bolton 1 2 A 1 1 D
## 10 E1 04/08/2018 Wigan Sheffield Weds 3 2 H 2 1 H
## Referee HS AS HST AST HF AF HC AC HY AY HR AR B365H B365D B365A BWH
## 1 R Jones 8 11 5 3 14 16 2 4 3 4 0 0 3.60 3.30 2.25 3.50
## 2 P Bankes 16 16 7 7 15 10 4 4 0 3 0 0 2.60 3.30 3.00 2.40
## 3 O Langford 14 9 10 5 15 10 13 3 0 2 0 0 1.50 4.75 7.00 1.48
## 4 D Bond 18 16 6 6 8 17 7 9 0 4 0 0 2.60 3.40 2.87 2.50
## 5 A Davies 9 6 3 2 11 21 3 0 2 3 0 0 2.75 3.25 2.80 2.50
## 6 K Friend 9 12 2 3 13 6 6 6 3 2 0 0 2.75 3.20 2.87 2.70
## 7 D England 15 8 3 2 15 13 6 4 0 2 0 0 1.90 3.60 4.50 1.95
## 8 J Simpson 12 9 2 4 6 6 5 1 0 0 0 0 2.25 3.40 3.50 2.30
## 9 G Eltringham 20 11 3 4 9 14 10 6 1 0 0 0 1.44 4.50 9.00 1.44
## 10 T Robinson 21 8 10 4 12 12 3 3 1 3 0 1 2.20 3.40 3.60 2.20
## BWD BWA IWH IWD IWA PSH PSD PSA WHH WHD WHA VCH VCD VCA Bb1X2
## 1 3.20 2.20 3.40 3.15 2.15 3.61 3.33 2.23 3.50 3.20 2.20 3.60 3.30 2.25 41
## 2 3.25 2.90 2.45 3.05 2.85 2.61 3.20 3.04 2.50 3.20 2.88 2.55 3.30 3.00 40
## 3 4.10 6.50 1.48 4.25 5.90 1.52 4.45 6.92 1.50 4.40 5.80 1.50 4.75 6.50 38
## 4 3.25 2.75 2.45 3.15 2.80 2.57 3.29 3.00 2.50 3.25 2.80 2.55 3.40 3.00 41
## 5 3.25 2.75 2.60 3.05 2.70 2.80 3.17 2.83 2.75 3.00 2.70 2.75 3.25 2.80 40
## 6 3.10 2.65 2.65 2.95 2.70 2.84 3.10 2.85 2.70 3.10 2.70 2.80 3.20 2.80 41
## 7 3.30 3.90 1.85 3.35 4.10 1.93 3.53 4.49 1.85 3.60 4.00 1.95 3.60 4.20 41
## 8 3.20 3.25 2.20 3.10 3.30 2.26 3.30 3.56 2.25 3.30 3.20 2.25 3.40 3.50 41
## 9 4.20 7.50 1.45 4.20 6.80 1.45 4.53 8.52 1.40 4.50 7.50 1.45 4.50 8.00 41
## 10 3.25 3.30 2.15 3.10 3.35 2.23 3.28 3.66 2.20 3.30 3.30 2.25 3.40 3.50 41
## BbMxH BbAvH BbMxD BbAvD BbMxA BbAvA BbOU BbMx.2.5 BbAv.2.5 BbMx.2.5.1
## 1 3.75 3.44 3.36 3.23 2.28 2.21 38 2.25 2.18 1.73
## 2 2.65 2.51 3.30 3.16 3.04 2.90 35 2.35 2.21 1.70
## 3 1.55 1.50 4.75 4.33 7.08 6.32 35 1.70 1.64 2.26
## 4 2.66 2.53 3.40 3.23 3.00 2.83 38 2.13 2.02 1.83
## 5 2.80 2.68 3.25 3.12 2.83 2.74 36 2.28 2.20 1.70
## 6 2.85 2.74 3.20 3.05 2.87 2.73 35 2.46 2.32 1.63
## 7 1.96 1.91 3.60 3.43 4.50 4.09 38 2.13 2.02 1.84
## 8 2.30 2.23 3.40 3.22 3.56 3.36 37 2.29 2.19 1.72
## 9 1.50 1.44 4.53 4.30 9.00 7.54 39 2.00 1.89 1.96
## 10 2.25 2.20 3.40 3.23 3.66 3.41 37 2.25 2.18 1.72
## BbAv.2.5.1 BbAH BbAHh BbMxAHH BbAvAHH BbMxAHA BbAvAHA PSCH PSCD PSCA
## 1 1.67 20 0.25 2.01 1.96 1.95 1.88 3.78 3.29 2.19
## 2 1.64 19 -0.25 2.23 2.15 1.77 1.72 2.86 3.14 2.79
## 3 2.21 22 -1.00 1.89 1.83 2.09 2.01 1.56 4.38 6.30
## 4 1.77 19 -0.25 2.25 2.16 1.77 1.71 2.77 3.33 2.74
## 5 1.65 18 -0.25 2.36 2.28 1.69 1.65 3.23 3.15 2.51
## 6 1.58 19 0.00 1.97 1.92 1.98 1.92 2.75 3.05 2.99
## 7 1.77 18 -0.50 1.97 1.92 1.99 1.93 1.97 3.58 4.18
## 8 1.66 20 -0.25 1.97 1.92 1.98 1.92 1.99 3.48 4.23
## 9 1.89 21 -1.00 1.80 1.75 2.18 2.10 1.43 4.39 9.84
## 10 1.66 20 -0.25 1.95 1.89 2.02 1.96 2.11 3.36 3.93
The data contains many columns we are not interested in, so we keep only the data points of value for this project, using the tidyverse library
library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Keep only the 17 columns used in the rest of the analysis, then drop any
# row that has a missing value in one of them.
EL1 <- data %>%
  select(
    HomeTeam, AwayTeam, FTHG, FTAG, FTR, HTHG, HTAG, HTR,
    HS, AS, HST, AST, HC, AC, PSH, PSD, PSA
  ) %>%
  na.omit()
head(EL1)
## HomeTeam AwayTeam FTHG FTAG FTR HTHG HTAG HTR HS AS HST AST HC AC
## 1 Reading Derby 1 2 A 0 0 D 8 11 5 3 2 4
## 2 Birmingham Norwich 2 2 D 0 0 D 16 16 7 7 4 4
## 3 Brentford Rotherham 5 1 H 2 0 H 14 9 10 5 13 3
## 4 Bristol City Nott'm Forest 1 1 D 1 0 H 18 16 6 6 7 9
## 5 Ipswich Blackburn 2 2 D 1 2 A 9 6 3 2 3 0
## 6 Millwall Middlesbrough 2 2 D 2 0 H 9 12 2 3 6 6
## PSH PSD PSA
## 1 3.61 3.33 2.23
## 2 2.61 3.20 3.04
## 3 1.52 4.45 6.92
## 4 2.57 3.29 3.00
## 5 2.80 3.17 2.83
## 6 2.84 3.10 2.85
*This leaves us with only 17 columns to work with instead of the previous 62. The first thing we’re interested in is how the full-time result varies between home win, away win and draw
# Number of matches per full-time result (A = away win, D = draw, H = home win)
table(EL1[["FTR"]])
##
## A D H
## 149 162 239
# The counts above translate to 43.45% home wins, 27.09% away wins and 29.45% draws
# Now that we know how results are distributed, compare total goals scored at home and away
HomeGoal <- sum(EL1[["FTHG"]])
AwayGoal <- sum(EL1[["FTAG"]])
print(HomeGoal)
## [1] 835
print(AwayGoal)
## [1] 636
# Home teams outscored away teams by 199 goals, meaning home teams in the England Championship scored 31% more goals than away sides
Efficiency of closing odds
However, there is a bookmaker over-round to consider: the implied probabilities of all outcomes won’t add up to 1 but to slightly more. A theoretical outcome probability therefore needs to be calculated by adjusting for the over-round, giving two predictions: the raw prediction taken straight from the odds, and the theoretical prediction adjusted for the over-round *
# Convert the closing odds into probabilities.
#   *Prob    : raw implied probability, 1 / decimal odds
#   OverRund : sum of the three raw probabilities (the bookmaker over-round)
#   *Theo    : raw probability divided by the over-round, so the three sum to 1
Rawprob <- EL1 %>%
  select(FTR, PSH, PSD, PSA) %>%
  mutate(
    HomeProb = 1 / PSH,
    AwayProb = 1 / PSA,
    DrawProb = 1 / PSD,
    OverRund = HomeProb + AwayProb + DrawProb,
    HomeTheo = HomeProb / OverRund,
    AwayTheo = AwayProb / OverRund,
    DrawTheo = DrawProb / OverRund
  )
str(Rawprob)
## 'data.frame': 550 obs. of 11 variables:
## $ FTR : Factor w/ 3 levels "A","D","H": 1 2 3 2 2 2 3 1 1 3 ...
## $ PSH : num 3.61 2.61 1.52 2.57 2.8 2.84 1.93 2.26 1.45 2.23 ...
## $ PSD : num 3.33 3.2 4.45 3.29 3.17 3.1 3.53 3.3 4.53 3.28 ...
## $ PSA : num 2.23 3.04 6.92 3 2.83 2.85 4.49 3.56 8.52 3.66 ...
## $ HomeProb: num 0.277 0.383 0.658 0.389 0.357 ...
## $ AwayProb: num 0.448 0.329 0.145 0.333 0.353 ...
## $ DrawProb: num 0.3 0.312 0.225 0.304 0.315 ...
## $ OverRund: num 1.03 1.02 1.03 1.03 1.03 ...
## $ HomeTheo: num 0.27 0.374 0.641 0.379 0.348 ...
## $ AwayTheo: num 0.437 0.321 0.141 0.325 0.344 ...
## $ DrawTheo: num 0.293 0.305 0.219 0.296 0.307 ...
# For every match, record which raw probability is largest.
# Codes follow the column order selected here: 1 = home, 2 = away, 3 = draw.
RawPred <- Rawprob %>%
  select(HomeProb, AwayProb, DrawProb) %>%
  apply(1, which.max)
table(RawPred)
## RawPred
## 1 2
## 389 161
# This shows that Pinnacle closing odds made the home team the favorite in 389 matches and the away team the favorite in 161
# Now find how many times the Pinnacle closing odds correctly predicted a home win:
# matches that ended in a home win where the home side had the higher raw probability
H_Pred <- Rawprob %>%
  filter(FTR == "H", HomeProb > AwayProb)
str(H_Pred)
## 'data.frame': 192 obs. of 11 variables:
## $ FTR : Factor w/ 3 levels "A","D","H": 3 3 3 3 3 3 3 3 3 3 ...
## $ PSH : num 1.52 1.93 2.23 2.26 2.02 1.69 1.68 2.37 2.45 1.43 ...
## $ PSD : num 4.45 3.53 3.28 3.28 3.39 3.76 3.8 3.39 3.29 4.84 ...
## $ PSA : num 6.92 4.49 3.66 3.6 4.25 5.91 6.04 3.24 3.18 8.34 ...
## $ HomeProb: num 0.658 0.518 0.448 0.442 0.495 ...
## $ AwayProb: num 0.145 0.223 0.273 0.278 0.235 ...
## $ DrawProb: num 0.225 0.283 0.305 0.305 0.295 ...
## $ OverRund: num 1.03 1.02 1.03 1.03 1.03 ...
## $ HomeTheo: num 0.641 0.506 0.437 0.432 0.483 ...
## $ AwayTheo: num 0.141 0.217 0.266 0.271 0.229 ...
## $ DrawTheo: num 0.219 0.277 0.297 0.297 0.288 ...
# Thus Pinnacle closing odds correctly predicted 192 home wins out of 389 home favorites, an accuracy of 49.36%
# Matches that ended in an away win where the away side had the higher raw probability
A_Pred <- Rawprob %>%
  filter(FTR == "A", AwayProb > HomeProb)
str(A_Pred)
## 'data.frame': 64 obs. of 11 variables:
## $ FTR : Factor w/ 3 levels "A","D","H": 1 1 1 1 1 1 1 1 1 1 ...
## $ PSH : num 3.61 2.86 2.93 3.1 4.15 2.83 2.89 2.92 3.27 3.07 ...
## $ PSD : num 3.33 3.34 3.23 3.38 3.36 3.46 3.29 3.2 3.27 3.3 ...
## $ PSA : num 2.23 2.65 2.68 2.46 2.04 2.6 2.63 2.67 2.42 2.52 ...
## $ HomeProb: num 0.277 0.35 0.341 0.323 0.241 ...
## $ AwayProb: num 0.448 0.377 0.373 0.407 0.49 ...
## $ DrawProb: num 0.3 0.299 0.31 0.296 0.298 ...
## $ OverRund: num 1.03 1.03 1.02 1.02 1.03 ...
## $ HomeTheo: num 0.27 0.341 0.333 0.315 0.234 ...
## $ AwayTheo: num 0.437 0.368 0.364 0.397 0.476 ...
## $ DrawTheo: num 0.293 0.292 0.302 0.289 0.289 ...
# Pinnacle closing odds correctly predicted 64 away wins out of 161 away favorites, an accuracy of 39.75%