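We are going to analyse various data points for the England Championship 19/20 season, both to gain insight into how teams performed on different metrics and to lay the building blocks for a Championship model to use in the 20/21 season, which starts on 12 September 2020. (Note: the matches previewed below are dated August 2018, i.e. the 18/19 season, so check that champ.csv is the intended season file.)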

*Let's load our first dataset.*

# Read the raw match-level file; its first row supplies the column names.
data <- read.csv(file = "champ.csv", header = TRUE)

# Preview the first ten matches.
head(data, n = 10)
##    Div       Date         HomeTeam       AwayTeam FTHG FTAG FTR HTHG HTAG HTR
## 1   E1 03/08/2018          Reading          Derby    1    2   A    0    0   D
## 2   E1 04/08/2018       Birmingham        Norwich    2    2   D    0    0   D
## 3   E1 04/08/2018        Brentford      Rotherham    5    1   H    2    0   H
## 4   E1 04/08/2018     Bristol City  Nott'm Forest    1    1   D    1    0   H
## 5   E1 04/08/2018          Ipswich      Blackburn    2    2   D    1    2   A
## 6   E1 04/08/2018         Millwall  Middlesbrough    2    2   D    2    0   H
## 7   E1 04/08/2018          Preston            QPR    1    0   H    0    0   D
## 8   E1 04/08/2018 Sheffield United        Swansea    1    2   A    0    0   D
## 9   E1 04/08/2018        West Brom         Bolton    1    2   A    1    1   D
## 10  E1 04/08/2018            Wigan Sheffield Weds    3    2   H    2    1   H
##         Referee HS AS HST AST HF AF HC AC HY AY HR AR B365H B365D B365A  BWH
## 1       R Jones  8 11   5   3 14 16  2  4  3  4  0  0  3.60  3.30  2.25 3.50
## 2      P Bankes 16 16   7   7 15 10  4  4  0  3  0  0  2.60  3.30  3.00 2.40
## 3    O Langford 14  9  10   5 15 10 13  3  0  2  0  0  1.50  4.75  7.00 1.48
## 4        D Bond 18 16   6   6  8 17  7  9  0  4  0  0  2.60  3.40  2.87 2.50
## 5      A Davies  9  6   3   2 11 21  3  0  2  3  0  0  2.75  3.25  2.80 2.50
## 6      K Friend  9 12   2   3 13  6  6  6  3  2  0  0  2.75  3.20  2.87 2.70
## 7     D England 15  8   3   2 15 13  6  4  0  2  0  0  1.90  3.60  4.50 1.95
## 8     J Simpson 12  9   2   4  6  6  5  1  0  0  0  0  2.25  3.40  3.50 2.30
## 9  G Eltringham 20 11   3   4  9 14 10  6  1  0  0  0  1.44  4.50  9.00 1.44
## 10   T Robinson 21  8  10   4 12 12  3  3  1  3  0  1  2.20  3.40  3.60 2.20
##     BWD  BWA  IWH  IWD  IWA  PSH  PSD  PSA  WHH  WHD  WHA  VCH  VCD  VCA Bb1X2
## 1  3.20 2.20 3.40 3.15 2.15 3.61 3.33 2.23 3.50 3.20 2.20 3.60 3.30 2.25    41
## 2  3.25 2.90 2.45 3.05 2.85 2.61 3.20 3.04 2.50 3.20 2.88 2.55 3.30 3.00    40
## 3  4.10 6.50 1.48 4.25 5.90 1.52 4.45 6.92 1.50 4.40 5.80 1.50 4.75 6.50    38
## 4  3.25 2.75 2.45 3.15 2.80 2.57 3.29 3.00 2.50 3.25 2.80 2.55 3.40 3.00    41
## 5  3.25 2.75 2.60 3.05 2.70 2.80 3.17 2.83 2.75 3.00 2.70 2.75 3.25 2.80    40
## 6  3.10 2.65 2.65 2.95 2.70 2.84 3.10 2.85 2.70 3.10 2.70 2.80 3.20 2.80    41
## 7  3.30 3.90 1.85 3.35 4.10 1.93 3.53 4.49 1.85 3.60 4.00 1.95 3.60 4.20    41
## 8  3.20 3.25 2.20 3.10 3.30 2.26 3.30 3.56 2.25 3.30 3.20 2.25 3.40 3.50    41
## 9  4.20 7.50 1.45 4.20 6.80 1.45 4.53 8.52 1.40 4.50 7.50 1.45 4.50 8.00    41
## 10 3.25 3.30 2.15 3.10 3.35 2.23 3.28 3.66 2.20 3.30 3.30 2.25 3.40 3.50    41
##    BbMxH BbAvH BbMxD BbAvD BbMxA BbAvA BbOU BbMx.2.5 BbAv.2.5 BbMx.2.5.1
## 1   3.75  3.44  3.36  3.23  2.28  2.21   38     2.25     2.18       1.73
## 2   2.65  2.51  3.30  3.16  3.04  2.90   35     2.35     2.21       1.70
## 3   1.55  1.50  4.75  4.33  7.08  6.32   35     1.70     1.64       2.26
## 4   2.66  2.53  3.40  3.23  3.00  2.83   38     2.13     2.02       1.83
## 5   2.80  2.68  3.25  3.12  2.83  2.74   36     2.28     2.20       1.70
## 6   2.85  2.74  3.20  3.05  2.87  2.73   35     2.46     2.32       1.63
## 7   1.96  1.91  3.60  3.43  4.50  4.09   38     2.13     2.02       1.84
## 8   2.30  2.23  3.40  3.22  3.56  3.36   37     2.29     2.19       1.72
## 9   1.50  1.44  4.53  4.30  9.00  7.54   39     2.00     1.89       1.96
## 10  2.25  2.20  3.40  3.23  3.66  3.41   37     2.25     2.18       1.72
##    BbAv.2.5.1 BbAH BbAHh BbMxAHH BbAvAHH BbMxAHA BbAvAHA PSCH PSCD PSCA
## 1        1.67   20  0.25    2.01    1.96    1.95    1.88 3.78 3.29 2.19
## 2        1.64   19 -0.25    2.23    2.15    1.77    1.72 2.86 3.14 2.79
## 3        2.21   22 -1.00    1.89    1.83    2.09    2.01 1.56 4.38 6.30
## 4        1.77   19 -0.25    2.25    2.16    1.77    1.71 2.77 3.33 2.74
## 5        1.65   18 -0.25    2.36    2.28    1.69    1.65 3.23 3.15 2.51
## 6        1.58   19  0.00    1.97    1.92    1.98    1.92 2.75 3.05 2.99
## 7        1.77   18 -0.50    1.97    1.92    1.99    1.93 1.97 3.58 4.18
## 8        1.66   20 -0.25    1.97    1.92    1.98    1.92 1.99 3.48 4.23
## 9        1.89   21 -1.00    1.80    1.75    2.18    2.10 1.43 4.39 9.84
## 10       1.66   20 -0.25    1.95    1.89    2.02    1.96 2.11 3.36 3.93

The data contains many columns that we are not interested in, so we keep only the data points of value for this project, using the tidyverse library.

library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Keep only the team names, full-/half-time scores and results, shots, shots
# on target, corners and the Pinnacle (PS*) odds columns, then drop every
# match that has a missing value in any of the kept columns.
EL1 <- data %>%
  select(
    HomeTeam, AwayTeam,
    FTHG, FTAG, FTR,
    HTHG, HTAG, HTR,
    HS, AS, HST, AST,
    HC, AC,
    PSH, PSD, PSA
  ) %>%
  na.omit()
head(EL1)
##       HomeTeam      AwayTeam FTHG FTAG FTR HTHG HTAG HTR HS AS HST AST HC AC
## 1      Reading         Derby    1    2   A    0    0   D  8 11   5   3  2  4
## 2   Birmingham       Norwich    2    2   D    0    0   D 16 16   7   7  4  4
## 3    Brentford     Rotherham    5    1   H    2    0   H 14  9  10   5 13  3
## 4 Bristol City Nott'm Forest    1    1   D    1    0   H 18 16   6   6  7  9
## 5      Ipswich     Blackburn    2    2   D    1    2   A  9  6   3   2  3  0
## 6     Millwall Middlesbrough    2    2   D    2    0   H  9 12   2   3  6  6
##    PSH  PSD  PSA
## 1 3.61 3.33 2.23
## 2 2.61 3.20 3.04
## 3 1.52 4.45 6.92
## 4 2.57 3.29 3.00
## 5 2.80 3.17 2.83
## 6 2.84 3.10 2.85

This leaves us with only 17 columns to work with instead of the previous 62. The first thing we’re interested in is how the full-time result splits between home wins, away wins and draws.

# Count of full-time results: A = away win, D = draw, H = home win
table(EL1[["FTR"]])
## 
##   A   D   H 
## 149 162 239
# As shown above, this translates to 43.45% home wins, 27.09% away wins and 29.45% draws
# Now that we know how teams perform in terms of results, let's find out how scoring varies between home and away
# Season totals of full-time goals scored by home sides and by away sides.
HomeGoal <- sum(EL1[["FTHG"]])
AwayGoal <- sum(EL1[["FTAG"]])
print(HomeGoal)
## [1] 835
print(AwayGoal)
## [1] 636
# Home teams outscored away teams by 199 goals, meaning home teams in the England Championship scored about 31% more goals than away sides

Efficiency of closing odds

However, there is a bookmaker over-round to consider: the implied probabilities of all outcomes won’t add up to 1 but to slightly more. A theoretical outcome probability therefore needs to be calculated by adjusting for the over-round. This gives two sets of predictions: the raw probability taken straight from the odds (1/odds), and the theoretical probability obtained by dividing each raw probability by the over-round.

# Turn the Pinnacle closing odds into probabilities for every match:
#   *Prob    - raw implied probability, 1 / decimal odds
#   OverRund - sum of the three raw probabilities (the bookmaker over-round)
#   *Theo    - raw probability divided by the over-round, so the three sum to 1
Rawprob <- EL1 %>%
  select(FTR, PSH, PSD, PSA) %>%
  mutate(
    HomeProb = 1 / PSH,
    AwayProb = 1 / PSA,
    DrawProb = 1 / PSD,
    OverRund = HomeProb + AwayProb + DrawProb,
    HomeTheo = HomeProb / OverRund,
    AwayTheo = AwayProb / OverRund,
    DrawTheo = DrawProb / OverRund
  )
str(Rawprob)
## 'data.frame':    550 obs. of  11 variables:
##  $ FTR     : Factor w/ 3 levels "A","D","H": 1 2 3 2 2 2 3 1 1 3 ...
##  $ PSH     : num  3.61 2.61 1.52 2.57 2.8 2.84 1.93 2.26 1.45 2.23 ...
##  $ PSD     : num  3.33 3.2 4.45 3.29 3.17 3.1 3.53 3.3 4.53 3.28 ...
##  $ PSA     : num  2.23 3.04 6.92 3 2.83 2.85 4.49 3.56 8.52 3.66 ...
##  $ HomeProb: num  0.277 0.383 0.658 0.389 0.357 ...
##  $ AwayProb: num  0.448 0.329 0.145 0.333 0.353 ...
##  $ DrawProb: num  0.3 0.312 0.225 0.304 0.315 ...
##  $ OverRund: num  1.03 1.02 1.03 1.03 1.03 ...
##  $ HomeTheo: num  0.27 0.374 0.641 0.379 0.348 ...
##  $ AwayTheo: num  0.437 0.321 0.141 0.325 0.344 ...
##  $ DrawTheo: num  0.293 0.305 0.219 0.296 0.307 ...
# For every match, take the position of the largest raw implied probability
# among (HomeProb, AwayProb, DrawProb): 1 = home, 2 = away, 3 = draw.
RawPred <- apply(
  select(Rawprob, HomeProb, AwayProb, DrawProb),
  1,
  which.max
)
table(RawPred)
## RawPred
##   1   2 
## 389 161
# RawPred codes the favorite by column position (1 = home, 2 = away, 3 = draw): Pinnacle closing odds made the home team favorite in 389 matches and the away team in 161, and never the draw

# Now let's find how many times the Pinnacle closing odds correctly predicted a home win

# Matches where the home side was the closing-odds favorite AND won.
# RawPred defines the favorite as the largest of the three implied
# probabilities, so the draw probability is checked here as well: comparing
# against AwayProb alone would also count a home win in a match where the draw
# carried the highest probability. `>=` is used against DrawProb because
# which.max() resolves an exact tie in favor of the earlier column, and
# HomeProb precedes DrawProb in RawPred. The home-vs-away comparison stays
# strict, as before. table(RawPred) shows the draw is never the favorite in
# this data set, so the count is unchanged here.
H_Pred <- filter(
  Rawprob,
  FTR == "H",
  HomeProb > AwayProb,
  HomeProb >= DrawProb
)
str(H_Pred)
## 'data.frame':    192 obs. of  11 variables:
##  $ FTR     : Factor w/ 3 levels "A","D","H": 3 3 3 3 3 3 3 3 3 3 ...
##  $ PSH     : num  1.52 1.93 2.23 2.26 2.02 1.69 1.68 2.37 2.45 1.43 ...
##  $ PSD     : num  4.45 3.53 3.28 3.28 3.39 3.76 3.8 3.39 3.29 4.84 ...
##  $ PSA     : num  6.92 4.49 3.66 3.6 4.25 5.91 6.04 3.24 3.18 8.34 ...
##  $ HomeProb: num  0.658 0.518 0.448 0.442 0.495 ...
##  $ AwayProb: num  0.145 0.223 0.273 0.278 0.235 ...
##  $ DrawProb: num  0.225 0.283 0.305 0.305 0.295 ...
##  $ OverRund: num  1.03 1.02 1.03 1.03 1.03 ...
##  $ HomeTheo: num  0.641 0.506 0.437 0.432 0.483 ...
##  $ AwayTheo: num  0.141 0.217 0.266 0.271 0.229 ...
##  $ DrawTheo: num  0.219 0.277 0.297 0.297 0.288 ...
# Thus Pinnacle closing odds correctly predicted 192 home wins out of 389 home favorites, a 49.36% accuracy

Away wins predicted by Pinnacle closing odds

# Matches where the away side was the closing-odds favorite AND won.
# RawPred defines the favorite as the largest of the three implied
# probabilities, so the draw probability is checked here as well: comparing
# against HomeProb alone would also count an away win in a match where the
# draw carried the highest probability. `>=` is used against DrawProb because
# which.max() resolves an exact tie in favor of the earlier column, and
# AwayProb precedes DrawProb in RawPred. table(RawPred) shows the draw is
# never the favorite in this data set, so the count is unchanged here.
A_Pred <- filter(
  Rawprob,
  FTR == "A",
  AwayProb > HomeProb,
  AwayProb >= DrawProb
)
str(A_Pred)
## 'data.frame':    64 obs. of  11 variables:
##  $ FTR     : Factor w/ 3 levels "A","D","H": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PSH     : num  3.61 2.86 2.93 3.1 4.15 2.83 2.89 2.92 3.27 3.07 ...
##  $ PSD     : num  3.33 3.34 3.23 3.38 3.36 3.46 3.29 3.2 3.27 3.3 ...
##  $ PSA     : num  2.23 2.65 2.68 2.46 2.04 2.6 2.63 2.67 2.42 2.52 ...
##  $ HomeProb: num  0.277 0.35 0.341 0.323 0.241 ...
##  $ AwayProb: num  0.448 0.377 0.373 0.407 0.49 ...
##  $ DrawProb: num  0.3 0.299 0.31 0.296 0.298 ...
##  $ OverRund: num  1.03 1.03 1.02 1.02 1.03 ...
##  $ HomeTheo: num  0.27 0.341 0.333 0.315 0.234 ...
##  $ AwayTheo: num  0.437 0.368 0.364 0.397 0.476 ...
##  $ DrawTheo: num  0.293 0.292 0.302 0.289 0.289 ...
# Pinnacle closing odds correctly predicted 64 away wins out of 161 away favorites, an accuracy of 39.75%