Excercise:

Walk through the code and try to understand it. Answer the following questions:

Plot the home advantage in both models
The model predicts the last 150 games:
- How many points would the model score in a game, where predicting the correct result would score 3 points, the correct winner 1 points.

This notebook is a tutorial on Hierarchical modelling using Stan which is adopted from https://github.com/MaggieLieu/STAN_tutorials/blob/master/Hierarchical/Hierarchical.Rmd. The model is also described in the paper: https://discovery.ucl.ac.uk/id/eprint/16040/1/16040.pdf

Modifications include:

Removing of the pooled model
Removing the fixed prior for home advantage (which has a std of about 1e-4)
- However this seems to require that as in the paper we need to use a sum-to-zero constraint for the defense and offense. This makes sense intuitively, since the teams play against each other and therefore these capabilities are relative to each other. For more information on this see the paper above and the references therein.

lsg = TRUE #Set to FALSE before submitting
require(rstan)

## Loading required package: rstan

## Loading required package: StanHeaders

## Loading required package: ggplot2

## rstan (Version 2.21.2, GitRev: 2e1f913d3ca3)

## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)

set.seed(1) #set seed

Reading the data

First we read in the data

data = read.csv('https://raw.githubusercontent.com/MaggieLieu/STAN_tutorials/master/Hierarchical/premiereleague.csv',col.names = c('Home','score1', 'score2', 'Away'), stringsAsFactors = FALSE)

data

ng = nrow(data)
cat('we have G =', ng, 'games \n')

## we have G = 328 games

nt = length(unique(data$Home))
cat('We have T = ', nt, 'teams \n')

## We have T =  20 teams

Traditional method

We will assume that the goals scored come from a poisson distribution

s1 | theta_g1 ~ Poisson(theta_g1) #game g score by home team s2 | theta_g2 ~ Poisson(theta_g2) #game g score by away team

Assuming a log-linear random effect model log(theta_g1) = home + att_ht + def_at log(theta_g2) = att_at + def_ht

where home is a constant for the advantage for the team hosting the game att and def are the attack and defence abilities of the teams where the indices at,ht correspond to the t=1-20 teams.

priors we willl use for the attack and defence abilities are very wide, essentially the teams’ performances are independent of each other home ~ normal(0,0.0001) att[t] ~ normal(0, 2) def[t] ~ normal(0, 2)

Now convert team names for each match into numbers

teams = unique(data$Home)
ht = unlist(sapply(1:ng, function(g) which(teams == data$Home[g])))
at = unlist(sapply(1:ng, function(g) which(teams == data$Away[g])))

# we will save the last 5 games to predict
#np=5
np=250
ngob = ng-np #number of games to fit
my_data = list(
  nt = nt, 
  ng = ngob,
  ht = ht[1:ngob], 
  at = at[1:ngob], 
  s1 = data$score1[1:ngob],
  s2 = data$score2[1:ngob],
  np = np,
  htnew = ht[(ngob+1):ng],
  atnew = at[(ngob+1):ng]
)

nhfit = stan(file = 'non_hier_model.stan', data = my_data)

Plot the predicted scores of the last 5 matches

nhparams = extract(nhfit)
pred_scores = c(colMeans(nhparams$s1new),colMeans(nhparams$s2new))
true_scores = c(data$score1[(ngob+1):ng],data$score2[(ngob+1):ng] )
plot(true_scores, pred_scores, xlim=c(0,5), ylim=c(0,5), pch=20, ylab='predicted scores', xlab='true scores')
abline(a=0,  b=1, lty='dashed')

pred_errors = c(sapply(1:np, function(x) sd(nhparams$s1new[,x])),sapply(1:np, function(x) sd(nhparams$s1new[,x])))
arrows(true_scores, pred_scores+pred_errors, true_scores, pred_scores-pred_errors, length = 0.05, angle = 90, code = 3, col=rgb(0,0,0,0.3))

mean((pred_scores - true_scores)^2)

## [1] 1.651242

We can also look at the attack/defense of the teams:

attack = colMeans(nhparams$att)
defense = colMeans(nhparams$def)

plot(attack,defense,xlim=c(-0.4,1.1), main='Non-Hierachical Model')
abline(h=0)
abline(v=0)
text(attack,defense, labels=teams, cex=0.7, pos=4)

plot(density(nhparams$home))

sd(nhparams$home)

## [1] 0.09839375

Hierarchical model

In a hierarchical model, the parameters of interest, in our case the attack and defense ability are drawn from the population distribution. att[t] ~ normal(mu_att, tau_att) def[t] ~ normal(mu_def, tau_def)

Instead we define priors on the population, known as the hyperpriors. mu_att = normal(0,0.0001) tau_att = gamma(0.1,0.1) mu_def = normal(0, 0.0001) tau_def = gamma(0.1,0.1)

hfit = stan(file = 'hier_model.stan', data = my_data)

## Warning: There were 224 divergent transitions after warmup. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## to find out why this is a problem and how to eliminate them.

## Warning: Examine the pairs() plot to diagnose sampling problems

## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess

## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess

pairs(hfit, pars=c('mu_att', 'tau_att', 'mu_def', 'tau_def'))

hparams = extract(hfit)
pred_scores = c(colMeans(hparams$s1new),colMeans(hparams$s2new))
pred_errors = c(sapply(1:np, function(x) sd(hparams$s1new[,x])),sapply(1:np, function(x) sd(hparams$s1new[,x])))
true_scores = c(data$score1[(ngob+1):ng],data$score2[(ngob+1):ng] )
plot(true_scores, pred_scores, xlim=c(0,5), ylim=c(0,5), pch=20, ylab='predicted scores', xlab='true scores')
abline(a=0,  b=1, lty='dashed')
arrows(true_scores, pred_scores+pred_errors, true_scores, pred_scores-pred_errors, length = 0.05, angle = 90, code = 3, rgb(0,0,0,0.3))

sqrt(mean((pred_scores - true_scores)^2))

## [1] 1.143771

attack = colMeans(hparams$att)
attacksd = sapply(1:nt, function(x) sd(hparams$att[,x]))
defense = colMeans(hparams$def)
defensesd = sapply(1:nt, function(x) sd(hparams$def[,x]))

plot(attack,defense, xlim=c(-0.4,1), ylim=c(-0.45,0.3), pch=20)
arrows(attack-attacksd, defense, attack+attacksd, defense, code=3, angle = 90, length = 0.04, col=rgb(0,0,0,0.2))
arrows(attack, defense-defensesd, attack, defense+defensesd, code=3, angle = 90, length = 0.04,col=rgb(0,0,0,0.2))
#abline(h=0)
#abline(v=0)
text(attack,defense, labels=teams, cex=0.7, adj=c(-0.05,-0.8) )

mean(attack)

## [1] -2.083023e-18

mean(defense)

## [1] -4.004772e-19

plot(density(hparams$home))

sd(hparams$home)

## [1] 0.09236712

mean(hparams$home)

## [1] 0.3756149

Hierarchical modelling tutorial

Excercise:

Reading the data

Traditional method

Plot the predicted scores of the last 5 matches

Hierarchical model