Walk through the code and try to understand it. Answer the following questions:
Plot the home advantage in both models
The model predicts the last 150 games:
This notebook is a tutorial on Hierarchical modelling using Stan which is adopted from https://github.com/MaggieLieu/STAN_tutorials/blob/master/Hierarchical/Hierarchical.Rmd. The model is also described in the paper: https://discovery.ucl.ac.uk/id/eprint/16040/1/16040.pdf
Modifications include:
Removing of the pooled model
Removing the fixed prior for home advantage (which has a std of about 1e-4)
lsg = TRUE #Set to FALSE before submitting
require(rstan)
## Loading required package: rstan
## Loading required package: StanHeaders
## Loading required package: ggplot2
## rstan (Version 2.21.2, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
set.seed(1) #set seed
First we read in the data
data = read.csv('https://raw.githubusercontent.com/MaggieLieu/STAN_tutorials/master/Hierarchical/premiereleague.csv',col.names = c('Home','score1', 'score2', 'Away'), stringsAsFactors = FALSE)
data
ng = nrow(data)
cat('we have G =', ng, 'games \n')
## we have G = 328 games
nt = length(unique(data$Home))
cat('We have T = ', nt, 'teams \n')
## We have T = 20 teams
We will assume that the goals scored come from a poisson distribution
s1 | theta_g1 ~ Poisson(theta_g1) #game g score by home team s2 | theta_g2 ~ Poisson(theta_g2) #game g score by away team
Assuming a log-linear random effect model log(theta_g1) = home + att_ht + def_at log(theta_g2) = att_at + def_ht
where home is a constant for the advantage for the team hosting the game att and def are the attack and defence abilities of the teams where the indices at,ht correspond to the t=1-20 teams.
priors we willl use for the attack and defence abilities are very wide, essentially the teams’ performances are independent of each other home ~ normal(0,0.0001) att[t] ~ normal(0, 2) def[t] ~ normal(0, 2)
Now convert team names for each match into numbers
teams = unique(data$Home)
ht = unlist(sapply(1:ng, function(g) which(teams == data$Home[g])))
at = unlist(sapply(1:ng, function(g) which(teams == data$Away[g])))
# we will save the last 5 games to predict
#np=5
np=250
ngob = ng-np #number of games to fit
my_data = list(
nt = nt,
ng = ngob,
ht = ht[1:ngob],
at = at[1:ngob],
s1 = data$score1[1:ngob],
s2 = data$score2[1:ngob],
np = np,
htnew = ht[(ngob+1):ng],
atnew = at[(ngob+1):ng]
)
nhfit = stan(file = 'non_hier_model.stan', data = my_data)
nhparams = extract(nhfit)
pred_scores = c(colMeans(nhparams$s1new),colMeans(nhparams$s2new))
true_scores = c(data$score1[(ngob+1):ng],data$score2[(ngob+1):ng] )
plot(true_scores, pred_scores, xlim=c(0,5), ylim=c(0,5), pch=20, ylab='predicted scores', xlab='true scores')
abline(a=0, b=1, lty='dashed')
pred_errors = c(sapply(1:np, function(x) sd(nhparams$s1new[,x])),sapply(1:np, function(x) sd(nhparams$s1new[,x])))
arrows(true_scores, pred_scores+pred_errors, true_scores, pred_scores-pred_errors, length = 0.05, angle = 90, code = 3, col=rgb(0,0,0,0.3))
mean((pred_scores - true_scores)^2)
## [1] 1.651242
We can also look at the attack/defense of the teams:
attack = colMeans(nhparams$att)
defense = colMeans(nhparams$def)
plot(attack,defense,xlim=c(-0.4,1.1), main='Non-Hierachical Model')
abline(h=0)
abline(v=0)
text(attack,defense, labels=teams, cex=0.7, pos=4)
plot(density(nhparams$home))
sd(nhparams$home)
## [1] 0.09839375
In a hierarchical model, the parameters of interest, in our case the attack and defense ability are drawn from the population distribution. att[t] ~ normal(mu_att, tau_att) def[t] ~ normal(mu_def, tau_def)
Instead we define priors on the population, known as the hyperpriors. mu_att = normal(0,0.0001) tau_att = gamma(0.1,0.1) mu_def = normal(0, 0.0001) tau_def = gamma(0.1,0.1)
hfit = stan(file = 'hier_model.stan', data = my_data)
## Warning: There were 224 divergent transitions after warmup. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## to find out why this is a problem and how to eliminate them.
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
pairs(hfit, pars=c('mu_att', 'tau_att', 'mu_def', 'tau_def'))
hparams = extract(hfit)
pred_scores = c(colMeans(hparams$s1new),colMeans(hparams$s2new))
pred_errors = c(sapply(1:np, function(x) sd(hparams$s1new[,x])),sapply(1:np, function(x) sd(hparams$s1new[,x])))
true_scores = c(data$score1[(ngob+1):ng],data$score2[(ngob+1):ng] )
plot(true_scores, pred_scores, xlim=c(0,5), ylim=c(0,5), pch=20, ylab='predicted scores', xlab='true scores')
abline(a=0, b=1, lty='dashed')
arrows(true_scores, pred_scores+pred_errors, true_scores, pred_scores-pred_errors, length = 0.05, angle = 90, code = 3, rgb(0,0,0,0.3))
sqrt(mean((pred_scores - true_scores)^2))
## [1] 1.143771
attack = colMeans(hparams$att)
attacksd = sapply(1:nt, function(x) sd(hparams$att[,x]))
defense = colMeans(hparams$def)
defensesd = sapply(1:nt, function(x) sd(hparams$def[,x]))
plot(attack,defense, xlim=c(-0.4,1), ylim=c(-0.45,0.3), pch=20)
arrows(attack-attacksd, defense, attack+attacksd, defense, code=3, angle = 90, length = 0.04, col=rgb(0,0,0,0.2))
arrows(attack, defense-defensesd, attack, defense+defensesd, code=3, angle = 90, length = 0.04,col=rgb(0,0,0,0.2))
#abline(h=0)
#abline(v=0)
text(attack,defense, labels=teams, cex=0.7, adj=c(-0.05,-0.8) )
mean(attack)
## [1] -2.083023e-18
mean(defense)
## [1] -4.004772e-19
plot(density(hparams$home))
sd(hparams$home)
## [1] 0.09236712
mean(hparams$home)
## [1] 0.3756149