This is a dataset that contains the result of every Premier League (EPL) match between the 2009/10 and 2018/19 seasons. We want to turn it into a set of data showing each team's average points total over that span versus its average when certain referees are involved.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Read the match results from the CSV hosted in the project's GitHub repository
EPL <- read.csv(
  file = "https://raw.githubusercontent.com/mkollontai/R_Bridge_FinalProject/master/EPL_09_19_Stats.csv"
)
# Print the first six rows to check the import
head(EPL, n = 6)
## Div Date X HomeTeam AwayTeam FTHG FTAG FTR HTHG HTAG HTR
## 1 E0 2009-08-15 NA Aston Villa Wigan 0 2 A 0 1 A
## 2 E0 2009-08-15 NA Blackburn Man City 0 2 A 0 1 A
## 3 E0 2009-08-15 NA Bolton Sunderland 0 1 A 0 1 A
## 4 E0 2009-08-15 NA Chelsea Hull 2 1 H 1 1 D
## 5 E0 2009-08-15 NA Everton Arsenal 1 6 A 0 3 A
## 6 E0 2009-08-15 NA Portsmouth Fulham 0 1 A 0 1 A
## Referee HS AS HST AST HF AF HC AC HY AY HR AR
## 1 M Clattenburg 11 14 5 7 15 14 4 6 2 2 0 0
## 2 M Dean 17 8 9 5 12 9 5 4 2 1 0 0
## 3 A Marriner 11 20 3 13 16 10 4 7 2 1 0 0
## 4 A Wiley 26 7 12 3 13 15 12 4 1 2 0 0
## 5 M Halsey 8 15 5 9 11 13 4 9 0 0 0 0
## 6 M Atkinson 16 9 4 3 11 18 6 4 3 2 0 0
# Keep only the columns needed for the analysis: the two teams involved,
# the full-time result and the referee
EPL2 <- EPL[, c("HomeTeam", "AwayTeam", "FTR", "Referee")]
head(EPL2)
## HomeTeam AwayTeam FTR Referee
## 1 Aston Villa Wigan A M Clattenburg
## 2 Blackburn Man City A M Dean
## 3 Bolton Sunderland A A Marriner
## 4 Chelsea Hull H A Wiley
## 5 Everton Arsenal A M Halsey
## 6 Portsmouth Fulham A M Atkinson
# Give the full-time result column (FTR) the clearer name FinalVictor
colnames(EPL2)[colnames(EPL2) == "FTR"] <- "FinalVictor"
#Define 2 functions that calculate the points earned by each of the two teams involved
#' Points earned by the home team for a full-time result code.
#'
#' Vectorised: works on a single code or on a whole column (character or
#' factor), so it can be called directly or through mapply()/vapply().
#'
#' @param x Full-time result code(s): "H" (home win), "D" (draw) or
#'   "A" (away win).
#' @return Unnamed numeric vector the same length as `x`: 3 for "H", 1 for
#'   "D", 0 for "A", and `NA` for missing or unrecognised codes.
HomePtsFxn <- function(x) {
  # A lookup table replaces the scalar if/else chain: that chain errored on NA
  # input and scored every unrecognised code as a home defeat.
  home_points <- c(H = 3, D = 1, A = 0)
  # match() yields NA for codes outside the table, which propagates as NA points
  unname(home_points[match(as.character(x), names(home_points))])
}
#' Points earned by the away team for a full-time result code.
#'
#' Vectorised: works on a single code or on a whole column (character or
#' factor), so it can be called directly or through mapply()/vapply().
#'
#' @param x Full-time result code(s): "H" (home win), "D" (draw) or
#'   "A" (away win).
#' @return Unnamed numeric vector the same length as `x`: 0 for "H", 1 for
#'   "D", 3 for "A", and `NA` for missing or unrecognised codes.
AwayPtsFxn <- function(x) {
  # A lookup table replaces the scalar if/else chain: that chain errored on NA
  # input and awarded 3 points to the away team for every unrecognised code.
  away_points <- c(H = 0, D = 1, A = 3)
  # match() yields NA for codes outside the table, which propagates as NA points
  unname(away_points[match(as.character(x), names(away_points))])
}
# Add columns for the points earned by the home and the away team in each match.
# vapply() with numeric(1) guarantees one number per match and an unnamed
# numeric column; mapply() can instead return a list (zero rows) or a vector
# named after the result codes.
EPL2$HomePoints <- vapply(
  as.character(EPL2$FinalVictor), HomePtsFxn, numeric(1),
  USE.NAMES = FALSE
)
EPL2$AwayPoints <- vapply(
  as.character(EPL2$FinalVictor), AwayPtsFxn, numeric(1),
  USE.NAMES = FALSE
)
# Create home-game subsets for Arsenal and Liverpool (away games can be more
# unpredictable): all home games, plus the home games under one specific
# referee for each club. Only the opponent, referee and home points are kept.
# Plain `<-` replaces assign(paste("name"), ...): paste() on a single literal
# is a no-op, so the indirection added nothing.
ArsenalHomeData <- EPL2[which(EPL2$HomeTeam == "Arsenal"), ] %>%
  dplyr::select(AwayTeam, Referee, HomePoints)
ArsenalHomeDeanData <- EPL2[
  which((EPL2$HomeTeam == "Arsenal") & (EPL2$Referee == "M Dean")),
] %>%
  dplyr::select(AwayTeam, Referee, HomePoints)
LiverpoolHomeData <- EPL2[which(EPL2$HomeTeam == "Liverpool"), ] %>%
  dplyr::select(AwayTeam, Referee, HomePoints)
LiverpoolHomeMasonData <- EPL2[
  which((EPL2$HomeTeam == "Liverpool") & (EPL2$Referee == "L Mason")),
] %>%
  dplyr::select(AwayTeam, Referee, HomePoints)
# Arsenal's mean home points per game across all referees
mean(ArsenalHomeData[["HomePoints"]])
## [1] 2.231579
#Arsenal's average point haul at home is 2.23/game
# Mean home points and number of home games for Arsenal under each referee
summarise(
  group_by(ArsenalHomeData, Referee),
  mean = mean(HomePoints),
  n = n()
)
## # A tibble: 25 x 3
## Referee mean n
## <fct> <dbl> <int>
## 1 A Marriner 1.95 20
## 2 A Taylor 2.24 17
## 3 A Wiley 3 1
## 4 C Foy 2.44 9
## 5 C Kavanagh 3 3
## 6 C Pawson 3 2
## 7 G Scott 3 4
## 8 H Webb 2.67 6
## 9 J Moss 2.73 11
## 10 K Friend 2.29 7
## # ... with 15 more rows
# Liverpool's mean home points per game across all referees
mean(LiverpoolHomeData[["HomePoints"]])
## [1] 2.078947
#Liverpool's average point haul at home was 2.08
# Mean home points and number of home games for Liverpool under each referee
summarise(
  group_by(LiverpoolHomeData, Referee),
  mean = mean(HomePoints),
  n = n()
)
## # A tibble: 24 x 3
## Referee mean n
## <fct> <dbl> <int>
## 1 A Marriner 2 15
## 2 A Taylor 2.27 15
## 3 A Wiley 1.5 2
## 4 C Kavanagh 3 3
## 5 C Pawson 2.56 9
## 6 G Scott 3 3
## 7 H Webb 1.62 8
## 8 J Moss 1.5 8
## 9 K Friend 2.29 14
## 10 L Mason 1.89 9
## # ... with 14 more rows