Excercise:

Walk through the code and try to understand it. Answer the following questions:

This notebook is a tutorial on Hierarchical modelling using Stan which is adopted from https://github.com/MaggieLieu/STAN_tutorials/blob/master/Hierarchical/Hierarchical.Rmd. The model is also described in the paper: https://discovery.ucl.ac.uk/id/eprint/16040/1/16040.pdf

Modifications include:

lsg = TRUE #Set to FALSE before submitting
require(rstan)
## Loading required package: rstan
## Loading required package: StanHeaders
## Loading required package: ggplot2
## rstan (Version 2.21.2, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
set.seed(1) #set seed 

Reading the data

First we read in the data

data = read.csv('https://raw.githubusercontent.com/MaggieLieu/STAN_tutorials/master/Hierarchical/premiereleague.csv',col.names = c('Home','score1', 'score2', 'Away'), stringsAsFactors = FALSE)
data
ng = nrow(data)
cat('we have G =', ng, 'games \n')
## we have G = 328 games
nt = length(unique(data$Home))
cat('We have T = ', nt, 'teams \n')
## We have T =  20 teams

Traditional method

We will assume that the goals scored come from a poisson distribution

s1 | theta_g1 ~ Poisson(theta_g1) #game g score by home team s2 | theta_g2 ~ Poisson(theta_g2) #game g score by away team

Assuming a log-linear random effect model log(theta_g1) = home + att_ht + def_at log(theta_g2) = att_at + def_ht

where home is a constant for the advantage for the team hosting the game att and def are the attack and defence abilities of the teams where the indices at,ht correspond to the t=1-20 teams.

priors we willl use for the attack and defence abilities are very wide, essentially the teams’ performances are independent of each other home ~ normal(0,0.0001) att[t] ~ normal(0, 2) def[t] ~ normal(0, 2)

Now convert team names for each match into numbers

teams = unique(data$Home)
ht = unlist(sapply(1:ng, function(g) which(teams == data$Home[g])))
at = unlist(sapply(1:ng, function(g) which(teams == data$Away[g])))
# we will save the last 5 games to predict
#np=5
np=250
ngob = ng-np #number of games to fit
my_data = list(
  nt = nt, 
  ng = ngob,
  ht = ht[1:ngob], 
  at = at[1:ngob], 
  s1 = data$score1[1:ngob],
  s2 = data$score2[1:ngob],
  np = np,
  htnew = ht[(ngob+1):ng],
  atnew = at[(ngob+1):ng]
)
nhfit = stan(file = 'non_hier_model.stan', data = my_data)

Plot the predicted scores of the last 5 matches

nhparams = extract(nhfit)
pred_scores = c(colMeans(nhparams$s1new),colMeans(nhparams$s2new))
true_scores = c(data$score1[(ngob+1):ng],data$score2[(ngob+1):ng] )
plot(true_scores, pred_scores, xlim=c(0,5), ylim=c(0,5), pch=20, ylab='predicted scores', xlab='true scores')
abline(a=0,  b=1, lty='dashed')

pred_errors = c(sapply(1:np, function(x) sd(nhparams$s1new[,x])),sapply(1:np, function(x) sd(nhparams$s1new[,x])))
arrows(true_scores, pred_scores+pred_errors, true_scores, pred_scores-pred_errors, length = 0.05, angle = 90, code = 3, col=rgb(0,0,0,0.3))

mean((pred_scores - true_scores)^2)
## [1] 1.651242

We can also look at the attack/defense of the teams:

attack = colMeans(nhparams$att)
defense = colMeans(nhparams$def)

plot(attack,defense,xlim=c(-0.4,1.1), main='Non-Hierachical Model')
abline(h=0)
abline(v=0)
text(attack,defense, labels=teams, cex=0.7, pos=4)

plot(density(nhparams$home))

sd(nhparams$home)
## [1] 0.09839375

Hierarchical model

In a hierarchical model, the parameters of interest, in our case the attack and defense ability are drawn from the population distribution. att[t] ~ normal(mu_att, tau_att) def[t] ~ normal(mu_def, tau_def)

Instead we define priors on the population, known as the hyperpriors. mu_att = normal(0,0.0001) tau_att = gamma(0.1,0.1) mu_def = normal(0, 0.0001) tau_def = gamma(0.1,0.1)

hfit = stan(file = 'hier_model.stan', data = my_data)
## Warning: There were 224 divergent transitions after warmup. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## to find out why this is a problem and how to eliminate them.
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
pairs(hfit, pars=c('mu_att', 'tau_att', 'mu_def', 'tau_def'))

hparams = extract(hfit)
pred_scores = c(colMeans(hparams$s1new),colMeans(hparams$s2new))
pred_errors = c(sapply(1:np, function(x) sd(hparams$s1new[,x])),sapply(1:np, function(x) sd(hparams$s1new[,x])))
true_scores = c(data$score1[(ngob+1):ng],data$score2[(ngob+1):ng] )
plot(true_scores, pred_scores, xlim=c(0,5), ylim=c(0,5), pch=20, ylab='predicted scores', xlab='true scores')
abline(a=0,  b=1, lty='dashed')
arrows(true_scores, pred_scores+pred_errors, true_scores, pred_scores-pred_errors, length = 0.05, angle = 90, code = 3, rgb(0,0,0,0.3))

sqrt(mean((pred_scores - true_scores)^2))
## [1] 1.143771
attack = colMeans(hparams$att)
attacksd = sapply(1:nt, function(x) sd(hparams$att[,x]))
defense = colMeans(hparams$def)
defensesd = sapply(1:nt, function(x) sd(hparams$def[,x]))

plot(attack,defense, xlim=c(-0.4,1), ylim=c(-0.45,0.3), pch=20)
arrows(attack-attacksd, defense, attack+attacksd, defense, code=3, angle = 90, length = 0.04, col=rgb(0,0,0,0.2))
arrows(attack, defense-defensesd, attack, defense+defensesd, code=3, angle = 90, length = 0.04,col=rgb(0,0,0,0.2))
#abline(h=0)
#abline(v=0)
text(attack,defense, labels=teams, cex=0.7, adj=c(-0.05,-0.8) )

mean(attack)
## [1] -2.083023e-18
mean(defense)
## [1] -4.004772e-19
plot(density(hparams$home))

sd(hparams$home)
## [1] 0.09236712
mean(hparams$home)
## [1] 0.3756149