### Thanks to Martin Eastwood
df <- read.csv("http://www.football-data.co.uk/mmz4281/1516/E0.csv")
## Format compatible with glms
df <- apply(df, 1, function(row){
data.frame(team=c(row['HomeTeam'], row['AwayTeam']),
opponent=c(row['AwayTeam'], row['HomeTeam']),
goals=c(row['FTHG'], row['FTAG']),
home=c(1, 0))
})
df <- do.call(rbind, df)
head(df)
## team opponent goals home
## HomeTeam Bournemouth Aston Villa 0 1
## AwayTeam Aston Villa Bournemouth 1 0
## HomeTeam1 Chelsea Swansea 2 1
## AwayTeam1 Swansea Chelsea 2 0
## HomeTeam2 Everton Watford 2 1
## AwayTeam2 Watford Everton 2 0
## numeric data not factors
df$goals <- as.numeric(as.character(df$goals))
## fit the GLM
model <- glm(goals ~ home + team + opponent,
family = poisson(link = log), data = df)
## Predictions Chelsea vs Arsenal
ave_home_goals <- predict(model, data.frame(home = 1, team = "Chelsea", opponent = "Arsenal", type = "response"))
ave_away_goals <- predict(model, data.frame(home = 0, team = "Arsenal", opponent = "Chelsea", type = "response"))
## Probabilities per goal
home_goals <- dpois(0:10, ave_home_goals)
away_goals <- dpois(0:10, ave_away_goals)
## Convert probability vectors into score matrix
score.matrix <- home_goals %o% away_goals
## Probabilities for home and away win
draw <- sum(diag(score.matrix))
away <- sum(score.matrix[upper.tri(score.matrix)])
home <- sum(score.matrix[lower.tri(score.matrix)])
cat("Draw probability =", draw, "\nAway probability =", away, "\nHome Probability = ", home)
## Draw probability = 0.5677174
## Away probability = 0.3101683
## Home Probability = 0.1221143