In summary, the model is as follows:
Let \(y_{g1}\) and \(y_{g2}\) denote the number of goals scored by the home and by the visiting team in the \(g\)-th game. The elements of the vector of observed counts \(\mathbf{y} = (y_{g1}, y_{g2})\) are modelled as independent Poisson:
\[y_{gj}|\lambda_{gj} \sim Poisson(\lambda_{gj})\]
conditionally on the parameters \(\mathbf{\lambda} = (\lambda_{g1},\lambda_{g2})\), which represent the scoring intensity, i.e. the expected goals, in the \(g\)-th game for the team playing at home (\(j = 1\)) and away (\(j = 2\)). We also assume that every team has latent parameters representing its attack and defense abilities, so the scoring intensity \(\mathbf{\lambda}\) can be expressed as a combination of these parameters plus possibly other effects. As is natural for the Poisson distribution, the logarithm is used as the link function over \(\mathbf{\lambda}\) to define the generalized linear model. This gives:
\[\log(\lambda_{g1}) = \gamma + \eta\,(1-I(nv)) + \alpha_{h(g)} + \beta_{a(g)}\] \[\log(\lambda_{g2}) = \gamma - \eta\,(1-I(nv)) + \alpha_{a(g)} + \beta_{h(g)}\]
where \(nv\) stands for *neutral venue* and \(I(nv)\) is an indicator function telling whether the game is played at a neutral venue: \(I(nv) = 1\) for a neutral-venue game and \(I(nv) = 0\) otherwise. The nested indexes \(h(g)\) and \(a(g)\) identify the teams playing at home and away in the \(g\)-th game.
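To make the notation concrete, here is a minimal sketch (with made-up parameter values, not estimates from any data) of how one game's intensities and score would be generated under this model:
# Made-up abilities for a hypothetical home/away pair
gamma <- 0.2; eta <- 0.15       # baseline log-intensity and home advantage
alpha_h <- 0.4; beta_h <- -0.2  # home team's attack and defense
alpha_a <- -0.1; beta_a <- 0.3  # away team's attack and defense
I_nv <- 0                       # ordinary (non-neutral) venue
lambda_home <- exp(gamma + eta * (1 - I_nv) + alpha_h + beta_a)
lambda_away <- exp(gamma - eta * (1 - I_nv) + alpha_a + beta_h)
c(lambda_home, lambda_away)     # scoring intensities
set.seed(1)
rpois(1, lambda_home)           # simulated home goals
rpois(1, lambda_away)           # simulated away goals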
Sum-to-zero (STZ) constraints: The following constraint is imposed over the set of all teams:
\[\sum_{i = 1}^{T} \alpha_i = 0 \; \;\; \;\; \;\; \;\; \;\; \;\; \; \sum_{i = 1}^{T} \beta_i = 0\]
This is equivalent to taking a team of average ability as the baseline; the interpretation of the parameters given above relies on this STZ constraint. In the models that follow, the constraint is incorporated by noting that:
\[\alpha_1 = -\sum_{i = 2}^{T} \alpha_i \qquad\qquad \beta_1 = -\sum_{i = 2}^{T} \beta_i\]
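As a toy illustration with made-up numbers: with \(T = 4\) teams only \(T - 1\) attack parameters are free, and the first one is recovered from the constraint.
alpha <- c(NA, 0.3, -0.1, -0.4)  # alpha_2 .. alpha_4 are free parameters (made-up values)
alpha[1] <- -sum(alpha[-1])      # alpha_1 is implied by the STZ constraint
sum(alpha)                       # 0 by construction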
The first thing to do is to get the data:
# Packages needed for the database connection and wrangling
library(tidyverse)
library(DBI)
library(RSQLite)
file_root <- "~/MEGA/smart_odds/project"
db_path <- "data/football_Italy_db.sqlite3"
# 3. Cleaning and Wrangling
## Connecting with db
df_full_path <- str_c(file_root, db_path, sep = "/")
football_db <- dbConnect(drv = SQLite(), dbname= df_full_path)
df <- tbl(football_db, "df_model") %>% collect()
df_team_id <- tbl(football_db, "look-up-teams") %>% collect()
RSQLite::dbDisconnect(football_db)
Let's split the data into fitting and test sets.
dfFit <- df %>% filter(Season < 2016)
dfTest <- df %>% filter(Season >= 2016)
Preparing the data set for fitting the model:
# Create a new dataframe for fitting data
df_mle <- bind_rows(
data_frame(goals = dfFit$FTHG,
a = dfFit$HomeTeam_id,
d = dfFit$AwayTeam_id,
eta = 1),
data_frame(goals = dfFit$FTAG,
a = dfFit$AwayTeam_id,
d = dfFit$HomeTeam_id,
eta = -1))
df_mle$a <- factor(df_mle$a, levels = unique(df_mle$a))
df_mle$d <- factor(df_mle$d, levels = unique(df_mle$a))
nteams <- length(unique(df_mle$a)) # teams 30, 31, 32 are
df_mle
## # A tibble: 3,798 x 4
## goals a d eta
## <int> <fct> <fct> <dbl>
## 1 2 1 15 1
## 2 1 2 16 1
## 3 0 3 18 1
## 4 2 4 20 1
## 5 2 5 14 1
## 6 2 6 13 1
## 7 4 7 17 1
## 8 0 8 19 1
## 9 4 9 12 1
## 10 1 10 11 1
## # ... with 3,788 more rows
Fitting the model:
library(broom)
# Set the sum-to-zero constraints via sum contrasts
contrasts(df_mle$a) <- contr.sum(nteams, contrasts=TRUE)
contrasts(df_mle$d) <- contr.sum(nteams, contrasts=TRUE)
mle_model <- glm(goals ~ 1 + eta + a + d, family = poisson(link = log), data = df_mle)
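As a quick sanity check (a sketch, not part of the original analysis), the intercepts have a direct interpretation: \(\exp(\gamma + \eta)\) is the expected number of goals an average team scores at home against an average opponent, and \(\exp(\gamma - \eta)\) is the away counterpart; with the estimates reported further below (\(\gamma \approx 0.22\), \(\eta \approx 0.14\)) that is roughly 1.4 home goals and 1.1 away goals.
# Quick check of the intercepts' interpretation (values depend on the fit)
exp(coef(mle_model)["(Intercept)"] + coef(mle_model)["eta"])  # expected home goals, average vs average
exp(coef(mle_model)["(Intercept)"] - coef(mle_model)["eta"])  # expected away goals, average vs average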
With these sum contrasts, glm does not estimate a coefficient for the last level (team 29); it is recovered afterwards from the sum-to-zero constraint:
tidyr_MLE <- function(mle_model, df_team_id, name_model){
# Make a tidy dataframe
df_mle_tidy <- tidy(mle_model, exponentiate = FALSE)
# Let's change names
df_mle_tidy[1, "term"] <- "gamma"
# By default glm drops the last level (team 29); recover it from the STZ constraint.
# Anchor the regex at the start of the term so "gamma" and "eta" are not picked up.
c_a29 <- df_mle_tidy %>%
filter(str_detect(term, "^a[0-9]")) %>%
pull(estimate)
c_d29 <- df_mle_tidy %>%
filter(str_detect(term, "^d[0-9]")) %>%
pull(estimate)
df_MLE_fitted <- bind_rows(
df_mle_tidy,
data_frame(term = "a29", estimate = -sum(c_a29)),
data_frame(term = "d29", estimate = -sum(c_d29)))
# Select the columns of interest
df_MLE_fitted <- df_MLE_fitted %>% select(term, estimate)
# Strengths
# Extract strengths
df_MLE_strength <- df_MLE_fitted %>%
filter(str_detect(term, ".*[0-9].*"))
# Wrangling with some columns
df_MLE_strength <- df_MLE_strength %>%
separate(term, into = c("param","team_id"), sep = "(?<=[a-z])(?=[0-9])") %>%
mutate(team_id = as.numeric(team_id)) %>%
select(team_id, param, value = estimate)
# Join with team names
df_MLE_strength_tidy <- df_MLE_strength %>%
inner_join(df_team_id, by = "team_id") %>%
select(team, param, value) %>%
group_by(team) %>%
nest()
# Intercepts
df_MLE_intercept_tidy <- df_MLE_fitted %>%
filter(!str_detect(term, ".*[0-9].*")) %>%
rename(param = term, value = estimate) %>%
group_by(param) %>%
nest()
# Put all in data frame
df_MLE_model_tidy <- data_frame(model = name_model, strengths = list(df_MLE_strength_tidy), intercepts = list(df_MLE_intercept_tidy))
return(df_MLE_model_tidy)
}
df_MLE_model_tidy <- tidyr_MLE(mle_model, df_team_id, name_model = "mle")
df_MLE_model_tidy
## # A tibble: 1 x 3
## model strengths intercepts
## <chr> <list> <list>
## 1 mle <tibble [29 x 2]> <tibble [2 x 2]>
Another way to fit the model is through the Bayesian framework. Instead of maximizing the likelihood, we compute the posterior probability of every set of parameters given the data. As we will see, defining the model in a Bayesian way is more straightforward and intuitive than MLE. Note that JAGS parameterizes the normal distribution by its precision, so dnorm(0, 0.00001) below corresponds to a standard deviation of about 316, i.e. an essentially flat prior.
bayesian_model <- "model{
for (i in 1:ngames){
# Likelihood
goals_home[i] ~ dpois(lambda_home[i])
goals_away[i] ~ dpois(lambda_away[i])
# linear predictor
log(lambda_home[i]) <- gamma + eta + a[ ht[i] ] + d[ at[i] ]
log(lambda_away[i]) <- gamma - eta + a[ at[i] ] + d[ ht[i] ]
}
# Non-informative priors
## Prior over intercepts
gamma ~ dnorm(0, 0.00001)
eta ~ dnorm(0, 0.00001)
## Prior over teams strength.
for (i in 2:nteams){
a[i] ~ dnorm(0, 0.00001)
d[i] ~ dnorm(0, 0.00001)
}
# Model constraints
## STZ constraint
a[1] <- -sum(a[2:nteams])
d[1] <- -sum(d[2:nteams])
}"
ngames <- nrow(dfFit)
nteams <- length(unique(dfFit$HomeTeam))
## Setting up the bayesian model
# Define data needed for setting up the model
data_bayes_model <- list(
goals_home = dfFit$FTHG,
goals_away = dfFit$FTAG,
ht = dfFit$HomeTeam_id,
at = dfFit$AwayTeam_id,
ngames = ngames,
nteams = nteams
)
# Using mle for setting inits
generate_chains_inits <- function(nchains = 3){
# Use the MLE estimates to start the chains
a_init <- df_MLE_model_tidy$strengths[[1]] %>%
unnest(data) %>%
filter(param == "a") %>%
pull(value)
a_init[1] <- NA
d_init <- df_MLE_model_tidy$strengths[[1]] %>%
unnest(data) %>%
filter(param == "d") %>%
pull(value)
d_init[1] <- NA
eta_init <- df_MLE_model_tidy$intercepts[[1]] %>%
unnest(data) %>%
filter(param == "eta") %>%
pull(value)
gamma_init <- df_MLE_model_tidy$intercepts[[1]] %>%
unnest(data) %>%
filter(param == "gamma") %>%
pull(value)
init_chain <- function(x){
# Sample each chain with a different seed
inits_chains_i <- list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = x,
"eta" = eta_init,
"gamma" = gamma_init,
"a" = a_init,
"d" = d_init
)
return(inits_chains_i)
}
inits_chains <- map(1:nchains, init_chain)
# names(inits_chains) <- paste0("init", 1:nchains)
return(inits_chains)
}
# generate_chains_inits(3)
# model_inits = list(
# gamma = 0.2,
# eta = 0.2,
# a = c(NA, rep(0, 28)),
# d = c(NA, rep(0, 28))
# )
# Define model
nchains <- 3
param <- c("eta", "gamma", "a", "d")
# generate_chains_inits(nchains)
# save(file="/tmp/jags.Rdata", list="jags")
# load("/tmp/jags.Rdata")
# jags$recompile()
# mod <- jags.model(textConnection(bayesian_model), data = data_bayes_model, n.chain = nchains, inits = generate_chains_inits(nchains))
# update(mod, 500)
# nsamples <- 5000
# samples <- coda.samples(mod, variable.names = param, n.iter = nsamples)
Saving samples:
# Save samples
save_samples <- function(samples, name_file){
file_root <- "~/MEGA/smart_odds/project/"
sample_path <- "scripts/final_project/models/samples/"
full_path <- str_c(file_root, sample_path, sep = "")
saveRDS(samples, file = str_c(full_path, name_file, ".RData", sep = ""))
}
# Load samples
load_samples <- function(name_file){
file_root <- "~/MEGA/smart_odds/project/"
sample_path <- "scripts/final_project/models/samples/"
full_path <- str_c(file_root, sample_path, sep = "")
samples <- readRDS(file = str_c(full_path, name_file,".RData", sep = ""))
return(samples)
}
# name_file = "samples"
# save_samples(samples, "samples")
samples <- load_samples("samples")
# summary(samples)
The Naive SE column shows the standard error of the Monte Carlo estimate of each posterior mean (roughly the pooled posterior SD divided by the square root of the total number of draws across the chains); it is small enough for our purposes, and the Time-series SE additionally accounts for autocorrelation within the chains.
library(coda)
library(ggmcmc)
head(summary(samples)[[1]])
## Mean SD Naive SE Time-series SE
## a[1] 0.28583006 0.05870074 0.0004792895 0.0005061592
## a[2] -0.40091390 0.12744884 0.0010406154 0.0038613748
## a[3] -0.03204455 0.08729715 0.0007127783 0.0019488248
## a[4] -0.24883885 0.07519309 0.0006139490 0.0015548589
## a[5] 0.25951148 0.05888918 0.0004808282 0.0010337590
## a[6] 0.04672540 0.06495076 0.0005303207 0.0011571603
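As a quick check (a sketch assuming samples is the coda mcmc.list loaded above), the Naive SE is just the pooled posterior standard deviation divided by the square root of the total number of draws:
draws_a1 <- unlist(lapply(samples, function(chain) chain[, "a[1]"]))  # pool the 3 chains
sd(draws_a1) / sqrt(length(draws_a1))  # ~0.00048, matching the Naive SE of a[1] above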
Let's see what the sampled Markov chains look like. Since there are too many parameters to show them all, we only plot the intercepts and the first attack/defense strengths. The chains look well mixed.
df_MCMC_model_tidy <- ggmcmc::ggs(samples)
# df_MCMC_model_tidy %>% filter(Parameter == "d[11]") %>% ggmcmc::ggs_traceplot()
# Trace plot
df_MCMC_model_tidy %>%
filter(Parameter %in% c("gamma", "eta", "a[1]", "d[1]")) %>% ggmcmc::ggs_traceplot()
# Autocorrelation plot
df_MCMC_model_tidy %>%
filter(Parameter %in% c("gamma", "eta", "a[1]", "d[1]")) %>% ggmcmc::ggs_autocorrelation()
# Density plot
df_MCMC_model_tidy %>%
filter(Parameter %in% c("gamma", "eta", "a[1]", "d[1]")) %>% ggmcmc::ggs_density()
Let's tidy up the output a little, joining it with the team names:
tidyr_MCMC <- function(samplesMCMC, df_team_id, model_name){
# Tidyr markov chain
df_MCMC_tidy <- ggs(samplesMCMC)
# Strengths
# Extract strengths
df_MCMC_strength <- df_MCMC_tidy %>%
filter(str_detect(Parameter, "\\["))
# Wrangling with some columns
df_MCMC_strength <- df_MCMC_strength %>%
separate(Parameter, into = c("param","team_id"), sep = "\\[|\\]", remove = FALSE, extra = "drop") %>%
mutate(team_id = as.numeric(team_id)) %>%
select(team_id, param, value)
# Join with team names
df_MCMC_strength_tidy <- df_MCMC_strength %>%
inner_join(df_team_id, by = "team_id") %>%
select(team, param, value) %>%
group_by(team) %>% nest()
# Intercepts
df_MCMC_intercepts <- df_MCMC_tidy %>%
filter(!str_detect(Parameter, "\\[")) %>%
rename(param = Parameter)
df_MCMC_intercept_tidy <- df_MCMC_intercepts %>%
select(param, value) %>%
group_by(param) %>%
nest()
df_MCMC_model_tidy <- data_frame(model = model_name, strengths = list(df_MCMC_strength_tidy), intercepts = list(df_MCMC_intercept_tidy))
return(df_MCMC_model_tidy)
}
df_MCMC_model_tidy <- tidyr_MCMC(samples, df_team_id, "bay")
# df_MCMC_model_tidy$strengths[[1]] %>%
# filter(team == "Cagliari") %>%
# unnest(data) %>% group_by(param) %>%
# ggplot(aes(x = value, fill = param)) +
# geom_density(aes(y =..density..), position = "dodge")
#
#
# df_MCMC_model_tidy$strengths[[1]] %>%
# filter(team == "Cagliari") %>%
# unnest(data) %>% filter(param == "d") %>%
# mutate(index = 1:nrow(.)) %>%
# ggplot(aes(x = index, y = value)) + geom_line()
#
# df_team_id
Now we can save both models in the same dataframe.
df_both_models <- bind_rows(df_MCMC_model_tidy, df_MLE_model_tidy)
Let's compare both models.
# Extract the MAP (mode of the kernel density estimate) of a strength parameter
extract_MAP_strength <- function(df, strength) {
x <- df %>% filter(param == strength) %>%
pull(value)
d <- density(x)
d$x[which.max(d$y)]
}
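Here the MAP is taken as the mode of a kernel density estimate of the posterior draws; on a skewed sample this can differ noticeably from the mean, as a toy example (made-up data) shows:
set.seed(42)
x <- rgamma(5000, shape = 2, rate = 4)  # a skewed toy "posterior"
d <- density(x)
c(mode = d$x[which.max(d$y)], mean = mean(x))  # the density mode (~0.25) sits well below the mean (~0.5)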
# df_both_models[1, "strengths"]$strengths[[1]][1,]$data[[1]]
df_MAP_strengths <- df_both_models[which(df_both_models$model == "bay"), "strengths"]$strengths[[1]] %>%
mutate(a_MAP = map_dbl(data, extract_MAP_strength, "a")) %>%
mutate(d_MAP = map_dbl(data, extract_MAP_strength, "d")) %>%
select(team, a_MAP, d_MAP)
# Put together both models
df_MLE_strengths <- df_both_models[which(df_both_models$model == "mle"), "strengths"]$strengths[[1]] %>%
unnest(data) %>%
spread(param, value) %>%
rename(a_MLE = a, d_MLE = d)
df_MAP_MLE_strength <- df_MAP_strengths %>%
inner_join(df_MLE_strengths, by = "team") %>%
select(team, a_MAP, a_MLE, d_MAP, d_MLE)
# # Another expression
# df_MAP_MLE_strength_b <- bind_rows(df_MAP_strengths %>% gather(key, estimation, -team) %>%
# separate(key, sep = "_", into = c("strength", "model"))
# ,
# df_MLE_strengths %>% gather(key, estimation, -team) %>%
# separate(key, sep = "_", into = c("strength", "model")))
####### INTERCEPTS ##########
extract_MAP_intercepts <- function(df) {
x <- df %>%
pull(value)
d <- density(x)
d$x[which.max(d$y)]
}
df_MAP_intercepts <- df_both_models[which(df_both_models$model == "bay"),]$intercepts[[1]] %>%
mutate(intercepts_MAP = map_dbl(data, extract_MAP_intercepts)) %>% select(-data)
# Put together both models
df_MLE_intercepts <- df_both_models[which(df_both_models$model == "mle"),]$intercepts[[1]] %>%
unnest(data) %>%
rename(intercepts_MLE = value)
df_MAP_MLE_intercepts <- df_MAP_intercepts %>%
inner_join(df_MLE_intercepts, by = "param")
df_MAP_MLE_intercepts %>% mutate_if(.predicate = is.numeric, .funs = round, 2) %>% print(n = Inf)
## # A tibble: 2 x 3
## param intercepts_MAP intercepts_MLE
## <chr> <dbl> <dbl>
## 1 eta 0.14 0.14
## 2 gamma 0.21 0.22
df_MAP_MLE_strength %>% mutate_if(.predicate = is.numeric, .funs = round, 2) %>% print(n = Inf)
## # A tibble: 29 x 5
## team a_MAP a_MLE d_MAP d_MLE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Milan 0.290 0.28 -0.2 -0.2
## 2 Cesena -0.38 -0.4 0.23 0.22
## 3 Catania -0.03 -0.03 0.04 0.04
## 4 Chievo -0.25 -0.25 -0.12 -0.12
## 5 Fiorentina 0.26 0.26 -0.16 -0.17
## 6 Genoa 0.04 0.04 0.01 0.01
## 7 Juventus 0.46 0.45 -0.83 -0.84
## 8 Lecce -0.1 -0.12 0.08 0.08
## 9 Palermo -0.01 -0.02 0.13 0.11
## 10 Roma 0.39 0.39 -0.22 -0.22
## 11 Cagliari -0.1 -0.1 0.06 0.04
## 12 Inter 0.22 0.22 -0.11 -0.1
## 13 Atalanta -0.11 -0.12 -0.04 -0.05
## 14 Bologna -0.19 -0.21 -0.06 -0.07
## 15 Lazio 0.22 0.22 -0.11 -0.11
## 16 Napoli 0.48 0.47 -0.21 -0.22
## 17 Parma 0.06 0.05 0.05 0.04
## 18 Siena -0.11 -0.11 -0.02 -0.02
## 19 Udinese 0.04 0.04 -0.04 -0.04
## 20 Novara -0.25 -0.24 0.21 0.22
## 21 Pescara -0.49 -0.48 0.47 0.45
## 22 Torino 0.14 0.12 -0.05 -0.04
## 23 Sampdoria 0.05 0.03 0.03 0.02
## 24 Verona 0.09 0.08 0.21 0.2
## 25 Livorno -0.13 -0.13 0.35 0.35
## 26 Sassuolo 0.05 0.04 0.05 0.05
## 27 Empoli -0.03 -0.05 -0.04 -0.06
## 28 Frosinone -0.24 -0.23 0.34 0.35
## 29 Carpi -0.19 -0.56 0.08 0.06
As we can see, both models closely match each other; this is expected since we selected non-informative priors for the parameters. The remaining differences can be explained by several factors (Monte Carlo error, the density-based MAP point estimate, and how the constrained team-29 coefficients are recovered).
The next step is computing the deviance on the test set. We will compare two approaches: the first uses the MLE/MAP point estimates, and the second is fully Bayesian, meaning we use all posterior samples to estimate the probability of the new data.
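Concretely, writing \(\lambda_{gj}^{(s)}\) for the \(s\)-th posterior sample of the intensity of score \(j\) in test game \(g\) (with \(S = 1\) for the MLE model), the quantity computed further below in compute_deviance is
\[D = -2 \sum_{g} \sum_{j=1}^{2} \log\left(\frac{1}{S}\sum_{s=1}^{S} Poisson\left(y_{gj} \mid \lambda_{gj}^{(s)}\right)\right)\]
so the Bayesian model is scored by averaging the Poisson likelihood over the posterior samples before taking the log (notation introduced here for clarity).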
As we will see in the plot below, some teams appearing in the 2016 and 2017 seasons have no data at all in the fitting period: Crotone, Spal and Benevento.
df_participation <- df %>%
group_by(HomeTeam) %>%
summarise(participation = length(unique(Season))) %>%
arrange(participation) %>%
mutate(ord = 1:length(participation)) %>%
rename(team = HomeTeam)
ggplot(data = inner_join(df, df_participation, by = c("HomeTeam"="team")),
aes(x = factor(Season), y = fct_reorder(HomeTeam, ord))) +
geom_point() +
labs(x = "Seasons", y = "Teams") +
ggtitle("Team participation across seasons") +
theme(plot.title = element_text(hjust = 0.5))
We will approximate the abilities of these teams using, as a reference, teams that were promoted to the league in past seasons:
plot_strength <- function(df_MCMC_model_tidy, df_participation, teams_no_data, teams_promoted){
df_order <- df_MCMC_model_tidy$strengths[[1]] %>%
unnest(data) %>%
group_by(team, param) %>%
summarise(avg = mean(value)) %>%
ungroup() %>% spread(param, avg) %>%
arrange(desc(a)) %>% mutate(order_a = 1:n()) %>%
arrange(d) %>% mutate(order_d = 1:n())
gg_a <- df_MCMC_model_tidy$strengths[[1]] %>%
unnest(data) %>%
mutate(promoted_color = if_else(team %in% teams_promoted, "reference",
if_else(team %in% teams_no_data, "no_data", "others"))) %>%
group_by(team, param) %>%
summarise(avg = mean(value),
HDPI_75_l = rethinking::HPDI(value, prob = 0.75)[[1]], HDPI_75_r = rethinking::HPDI(value, prob = 0.75)[[2]],
HDPI_95_l = rethinking::HPDI(value, prob = 0.95)[[1]], HDPI_95_r = rethinking::HPDI(value, prob = 0.95)[[2]],
promoted_color = first(promoted_color)) %>%
ungroup() %>% filter(param == "a") %>%
inner_join(df_order, by = "team") %>%
inner_join(df_participation, by = "team") %>%
ggplot(aes(x = avg, y = fct_reorder(team, desc(order_a)))) +
geom_point(size = 3, shape = 21, aes(fill = promoted_color)) +
geom_errorbarh(aes(xmin = HDPI_75_l,xmax = HDPI_75_r)) +
geom_errorbarh(aes(xmin = HDPI_95_l,xmax = HDPI_95_r), col = "black", alpha = 0.2) +
geom_text(aes(label = participation), hjust = 11) +
ggtitle("Team's attack strength") +
labs(x = "attack strength", y = "teams")
gg_d <- df_MCMC_model_tidy$strengths[[1]] %>%
unnest(data) %>%
mutate(promoted_color = if_else(team %in% teams_promoted, "reference",
if_else(team %in% teams_no_data, "no_data", "others"))) %>%
group_by(team, param) %>%
summarise(avg = mean(value),
HDPI_75_l = rethinking::HPDI(value, prob = 0.75)[[1]], HDPI_75_r = rethinking::HPDI(value, prob = 0.75)[[2]],
HDPI_95_l = rethinking::HPDI(value, prob = 0.95)[[1]], HDPI_95_r = rethinking::HPDI(value, prob = 0.95)[[2]],
promoted_color = first(promoted_color)) %>%
ungroup() %>%
filter(param == "d") %>%
inner_join(df_order, by = "team") %>%
inner_join(df_participation, by = "team") %>%
ggplot(aes(x = avg, y = fct_reorder(team, desc(order_d)))) +
geom_point(size = 3, shape = 21, aes(fill = promoted_color)) +
geom_errorbarh(aes(xmin = HDPI_75_l,xmax = HDPI_75_r)) +
geom_errorbarh(aes(xmin = HDPI_95_l,xmax = HDPI_95_r), col = "black", alpha = 0.2) +
geom_text(aes(label = participation), hjust = 11) +
ggtitle("Team's defense weakness") +
labs(x = "defense weakness", y = "teams")
gridExtra::grid.arrange( grobs = list(gg_a, gg_d))
}
teams_no_data <- c("Crotone", "Spal", "Benevento")
teams_promoted <- c("Pescara", "Sampdoria", "Torino",
"Livorno", "Sassuolo", "Verona",
"Cesena", "Empoli", "Palermo",
"Bologna", "Carpi", "Frosinone")
plot_strength(df_both_models[which(df_both_models$model == "bay"),], df_participation, teams_no_data, teams_promoted)
Livorno, Frosinone, Pescara and Cesena seem similar to the teams we want to simulate: they participated in at most two seasons of the fitting period and have been promoted to the league at some point. I will simulate the no-data teams as weighted mixtures of these reference teams. Since I think newly promoted teams are more similar to references that spent only one season in the league, Spal and Benevento get similarity weights of (0.3, 0.3, 0.2, 0.2) over (Livorno, Frosinone, Pescara, Cesena); Crotone, which avoided relegation in 2016, instead gets more weight on the references that stayed two seasons: (0.2, 0.2, 0.3, 0.3).
generate_fake_teams <- function(df_MCMC_model_tidy, teams_no_data, teams_reference, lOfProbs){
gen_weighted_teams <- function(teams_no_data, teams_reference, lOfProbs){
# put teams_nodata, teams_reference and lOfProbs in a data_frame
# Generate df with information about teams no data props and reference teams-
lOfDf_teams_reference <- map2(teams_no_data, lOfProbs, ~data_frame(teams_no_data = .x, probs = .y, team = teams_reference))
return(lOfDf_teams_reference)
}
generate_a_fake_team <- function(team_no_data, df_teams_reference){
# extract in variables
teams_reference <- df_teams_reference$team
probs <- df_teams_reference$probs/sum(df_teams_reference$probs)
# Filter the whole df with the reference teams
df_strength_reference <- df_strength_tidy %>%
filter(team %in% teams_reference)
# number of samples to take for each parameter
nsamples <- nrow(df_strength_reference$data[[1]])/2
# set.seed(length(team_no_data))
sample_params <- function(i){
# this function extract just one sample from attack and defence
# sample a team from which extract ability, given probs
ref_team_sampled <- sample(teams_reference, size = 1, prob = probs)
# Sample an attack strength
a <- df_strength_reference %>%
filter(team == ref_team_sampled) %>%
unnest(data) %>%
filter(param == "a") %>%
pull(value) %>%
sample(1)
# sample a defence weakness
d <- df_strength_reference %>%
filter(team == ref_team_sampled) %>%
unnest(data) %>%
filter(param == "d") %>%
pull(value) %>%
sample(1)
return(data_frame(a, d))
}
# take nsamples
fake_team_generated <- furrr::future_map_dfr(1:nsamples, sample_params)
# transform
df_fake_team_ready <- fake_team_generated %>%
gather(param, value) %>%
mutate(team = team_no_data) %>%
group_by(team) %>%
nest()
return(df_fake_team_ready)
}
#### CALL FUNCTIONS #####
lOfDf_teams_reference <- gen_weighted_teams(teams_no_data, teams_reference, lOfProbs)
df_strength_tidy <- df_MCMC_model_tidy$strengths[[1]]
df_teams_gen <- furrr::future_map2_dfr(.x = teams_no_data,
.y = lOfDf_teams_reference,
generate_a_fake_team,
.progress = TRUE)
return(df_teams_gen)
}
# Declare teams no data
teams_no_data <- c("Benevento", "Spal", "Crotone")
# Declare teams as reference
teams_reference <- c("Livorno", "Frosinone", "Pescara", "Cesena")
lOfProbs <- list(probs_reference_Benevento = c(0.3, 0.3, 0.2, 0.2),
probs_reference_Spal = c(0.3, 0.3, 0.2, 0.2),
probs_reference_Crotone = c(0.2, 0.2, 0.3, 0.3))
# fake_teams <- generate_fake_teams(df_MCMC_model_tidy, teams_no_data, teams_reference, lOfProbs)
# save_samples(fake_teams, name_file="fake_teams")
fake_teams <- load_samples("fake_teams")
fake_teams_MAP <- fake_teams %>%
mutate(a = map_dbl(data, extract_MAP_strength, "a")) %>%
mutate(d = map_dbl(data, extract_MAP_strength, "d")) %>%
select(-data) %>%
gather(param, value, -team) %>%
group_by(team) %>%
nest()
# Include these teams in the full data frame
# Bayesian (MCMC) model
df_both_models[which(df_both_models$model == "bay"), ]$strengths[[1]] <- bind_rows(df_both_models[which(df_both_models$model == "bay"), ]$strengths[[1]], fake_teams)
# MLE model
df_both_models[which(df_both_models$model == "mle"), ]$strengths[[1]] <- bind_rows(df_both_models[which(df_both_models$model == "mle"), ]$strengths[[1]], fake_teams_MAP)
# Let's see how the simulated teams' abilities look
look_at_team_params <- function(df_MCMC_model_tidy, look_at_team){
df_MCMC_model_tidy$strengths[[1]] %>%
filter(team == look_at_team) %>%
unnest(data) %>%
ggplot(aes(x = value, fill = param)) +
geom_density(alpha = 0.6) +
ggtitle(paste(look_at_team," abilities")) +
xlab("parameter estimate") +
theme(plot.title = element_text(hjust = 0.5))
}
look_at_team_params(df_both_models[which(df_both_models$model == "bay"), ], look_at_team ="Crotone")
look_at_team_params(df_both_models[which(df_both_models$model == "bay"), ], look_at_team ="Benevento")
look_at_team_params(df_both_models[which(df_both_models$model == "bay"), ], look_at_team ="Spal")
plot_strength(df_both_models[which(df_both_models$model == "bay"), ], df_participation, teams_no_data, teams_promoted)
The only problem with this method is that the sum-to-zero constraints no longer hold once the simulated teams are added.
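A quick way to see this (a sketch assuming the nested structure built above) is to check that the posterior-mean attack abilities no longer sum to zero once the three simulated teams are appended:
df_both_models %>%
filter(model == "bay") %>%
pull(strengths) %>%
.[[1]] %>%
unnest(data) %>%
filter(param == "a") %>%
group_by(team) %>%
summarise(mean_a = mean(value)) %>%
summarise(total = sum(mean_a))  # ~0 for the original 29 teams, non-zero after adding the fake ones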
Let's compute \(\lambda_H\) and \(\lambda_A\) for each game in the test set:
compute_samples_lambda_forEachGame <- function(i, df_model, df_games){
# i indexes a game in df_games
# df_games: data frame with all (test) games
# df_model: one row of df_both_models, e.g. filter(df_both_models, model == "mle")
# Extract intercepts
intercepts <- df_model %>%
select(intercepts) %>%
unnest()
## Eta
eta <- intercepts %>%
filter(param == "eta") %>%
unnest(data) %>%
pull(value)
## Gamma
gamma <- intercepts %>%
filter(param == "gamma") %>%
unnest(data) %>%
pull(value)
# Strengths
teams_strength <- df_model %>%
select(strengths) %>%
unnest()
# Extract Home and Away teams in game i
HomeTeam <- pull(df_games[i, "HomeTeam"])
AwayTeam <- pull(df_games[i, "AwayTeam"])
# Extract abilities for both teams
# HOME
attack_home <- teams_strength %>%
filter(team == HomeTeam) %>%
unnest(data) %>%
filter(param == "a") %>%
pull(value)
defense_home <- teams_strength %>%
filter(team == HomeTeam) %>%
unnest(data) %>%
filter(param == "d") %>%
pull(value)
# AWAY
attack_away <- teams_strength %>%
filter(team == AwayTeam) %>%
unnest(data) %>%
filter(param == "a") %>%
pull(value)
defense_away <- teams_strength %>%
filter(team == AwayTeam) %>%
unnest(data) %>%
filter(param == "d") %>%
pull(value)
# Calculate log lambda (the home-advantage term enters with opposite signs)
log_lambda_home <- gamma + eta + attack_home + defense_away
log_lambda_away <- gamma - eta + attack_away + defense_home
# Exp. to lambda
lambda_home <- exp(log_lambda_home)
lambda_away <- exp(log_lambda_away)
df_lambdas <- data_frame(lambda_home = list(lambda_home), lambda_away = list(lambda_away))
return(df_lambdas)
}
dfTest <- dfTest %>% select(id_game, HomeTeam, AwayTeam, HomeTeam_id, AwayTeam_id, FTHG, FTAG, Season)
match <- compute_samples_lambda_forEachGame(i = 4, filter(df_both_models, model == "mle"), dfTest)
# compute_samples_lambda_forEachGame <- function(i, df_both_models, df_games, model_name = "bay")
dfTry <- dfTest[1:nrow(dfTest), ]
df_LambdaSamples_MCMC <- furrr::future_map_dfr(1:nrow(dfTry), compute_samples_lambda_forEachGame, filter(df_both_models, model == "bay"), dfTry)
df_LambdaSamples_MLE <- furrr::future_map_dfr(.x = 1:nrow(dfTry),.f = compute_samples_lambda_forEachGame, filter(df_both_models, model == "mle"), dfTry)
# Augmenting the df
dfTest_MCMC_augmented <- bind_cols(dfTest, df_LambdaSamples_MCMC)
dfTest_MLE_augmented <- bind_cols(dfTest, df_LambdaSamples_MLE)
compute_deviance <- function(i, df_games_augmented){
# Calculate the probability of the observed goals under each set of model parameters
lambda_home <- df_games_augmented[i,"lambda_home"] %>%
unnest() %>%
pull()
lambda_away <- df_games_augmented[i,"lambda_away"] %>%
unnest() %>%
pull()
home_goals <- pull(df_games_augmented[i,"FTHG"])
away_goals <- pull(df_games_augmented[i,"FTAG"])
# Deviance: -2 * log of the likelihood averaged over the lambda samples
# (for the MLE model there is a single lambda per game, so this is just -2 * log-likelihood)
lik_home <- dpois(x = home_goals, lambda = lambda_home, log = FALSE)
lik_away <- dpois(x = away_goals, lambda = lambda_away, log = FALSE)
deviance_home <- -2*log(mean(lik_home))
deviance_away <- -2*log(mean(lik_away))
df_deviance <- data_frame(deviance = deviance_home + deviance_away)
return(df_deviance)
}
deviance_MCMC_each_game <- furrr::future_map_dfr(1:nrow(dfTest), compute_deviance, dfTest_MCMC_augmented)
deviance_MLE_each_game <- furrr::future_map_dfr(1:nrow(dfTest), compute_deviance, dfTest_MLE_augmented)
models_deviance <- bind_rows(
deviance_MLE_each_game %>%
mutate(type = "mle"),
deviance_MCMC_each_game %>%
mutate(type = "mcmc"))
models_deviance %>%
ggplot(aes(x = deviance, fill = type, colour = type)) +
geom_density(adjust = 4, alpha = 0.5)
models_deviance %>%
group_by(type) %>%
summarise(deviance = sum(deviance)) %>%
mutate(ddev = round(deviance - min(deviance),3)) %>%
mutate(bayes_factor = round(exp(-0.5*ddev),3)) %>%
mutate(probs_model = round(bayes_factor/sum(bayes_factor),3))
## # A tibble: 2 x 5
## type deviance ddev bayes_factor probs_model
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 mcmc 4731. 0 1 1
## 2 mle 4765. 34.4 0 0
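The last two columns follow the usual information-criterion weighting: with \(\Delta_m\) the difference between model \(m\)'s total deviance and the smallest deviance, the code above computes
\[BF_m = \exp\left(-\tfrac{1}{2}\Delta_m\right), \qquad P(m) = \frac{\exp\left(-\tfrac{1}{2}\Delta_m\right)}{\sum_{m'} \exp\left(-\tfrac{1}{2}\Delta_{m'}\right)},\]
so with \(\Delta \approx 34\) for the MLE model its weight is essentially zero.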
Let's fetch the 1X2 odds:
file_root <- "~/MEGA/smart_odds/project"
db_path <- "data/football_Italy_db.sqlite3"
# 3. Cleaning and Wrangling
## Connecting with db
df_full_path <- str_c(file_root, db_path, sep = "/")
football_db <- dbConnect(drv = SQLite(), dbname= df_full_path)
RSQLite::dbListTables(football_db)
## [1] "df_model" "game_info" "game_market_values"
## [4] "game_numbers" "look-up-teams" "sqlite_stat1"
## [7] "sqlite_stat4"
df_odds <- tbl(football_db, "game_market_values") %>%
select(id_game:BSA) %>%
collect()
df_whowin <- tbl(football_db, "game_numbers") %>%
select(id_game, FTR) %>%
collect()
RSQLite::dbDisconnect(football_db)
Averaging over all bookmakers:
get_market_probabilities <- function(what_games){
file_root <- "~/MEGA/smart_odds/project"
db_path <- "data/football_Italy_db.sqlite3"
df_full_path <- str_c(file_root, db_path, sep = "/")
football_db <- dbConnect(drv = SQLite(), dbname= df_full_path)
RSQLite::dbListTables(football_db)
df_odds <- tbl(football_db, "game_market_values") %>%
select(id_game:BSA) %>%
collect()
RSQLite::dbDisconnect(football_db)
# Averaging across all market
df_market_odds <- bind_cols(df_odds %>% select(ends_with("H")) %>%
transmute(H = rowMeans(., na.rm = TRUE)),
df_odds %>% select(ends_with("D")) %>%
transmute(D = rowMeans(., na.rm = TRUE)),
df_odds %>% select(ends_with("A")) %>%
transmute(A = rowMeans(., na.rm = TRUE)))
#There are some rows without data
# df_odds[which(!complete.cases(df_market_odds)) ,]
df_market_probs <- df_market_odds %>% mutate_all(.funs = ~1/.x) %>%
bind_cols(df_odds %>% select(id_game)) %>%
drop_na() %>%
mutate(s = H + D + A) %>%
mutate(H = H/s, D = D/s, A = A/s) %>%
select(id_game, H, D, A) %>%
mutate(type = "market")
df_market_probs <- df_market_probs %>%
filter(id_game %in% what_games)
return(df_market_probs)
}
df_market_probs <- get_market_probabilities(what_games = unique(dfTest_MLE_augmented$id_game))
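A toy example (made-up odds, not taken from the database) of the overround removal performed inside get_market_probabilities:
odds <- c(H = 2.10, D = 3.30, A = 3.60)  # hypothetical decimal odds
raw <- 1 / odds             # implied probabilities; they sum to ~1.06 because of the bookmaker margin
round(raw / sum(raw), 3)    # normalised 1X2 probabilities, roughly 0.45 / 0.29 / 0.26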
Simulate each game many times from the two models (one simulated scoreline per available \(\lambda\) sample, so both models use the same number of simulations) to compute the 1X2 probabilities:
# Simulate new games
## Calculate the number of new simulations
simulate_new_games <- function(df_model_lambdas, nsim){
df_sim_games <- df_model_lambdas %>%
transmute(id_game,
sim_home = map(lambda_home, ~rpois(nsim, .x)),
sim_away = map(lambda_away, ~rpois(nsim, .x)))
return(df_sim_games)
}
estimate_1x2_probs <- function(df_sim_games, type){
df_1x2_probs_games <- df_sim_games %>%
select(id_game, sim_home, sim_away) %>%
unnest(sim_home, sim_away) %>%
mutate(whowin = if_else(sim_home > sim_away, "H",
if_else(sim_home < sim_away, "A","D"))) %>%
group_by(id_game, whowin) %>%
summarise(n = n()) %>%
mutate(sim_probs = n/sum(n)) %>%
select(-n) %>%
spread(whowin, sim_probs) %>%
select(id_game, everything()) %>%
mutate(type = type)
return(df_1x2_probs_games)
}
set.seed(1994)
nsim <- dfTest_MCMC_augmented[1, "lambda_home"] %>%
unnest() %>%
nrow()
df_sim_games_MCMC <- simulate_new_games(dfTest_MCMC_augmented %>% select(id_game, lambda_home, lambda_away), nsim)
df_sim_games_MLE <- simulate_new_games(dfTest_MLE_augmented %>% select(id_game, lambda_home, lambda_away), nsim)
df_MCMC_sim_probs <- estimate_1x2_probs(df_sim_games_MCMC, type = "mcmc")
df_MLE_sim_probs <- estimate_1x2_probs(df_sim_games_MLE, type = "mle")
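As a side note (not used in the original analysis), under the independent-Poisson assumption the 1X2 probabilities for a single pair of intensities can also be computed exactly by summing the joint pmf over a truncated grid of scores, which is a handy check on the simulation:
exact_1x2 <- function(lambda_home, lambda_away, max_goals = 15){
grid <- expand.grid(h = 0:max_goals, a = 0:max_goals)
p <- dpois(grid$h, lambda_home) * dpois(grid$a, lambda_away)
c(H = sum(p[grid$h > grid$a]),
D = sum(p[grid$h == grid$a]),
A = sum(p[grid$h < grid$a]))
}
exact_1x2(1.6, 1.1)  # made-up intensities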
# Load whowin table
football_db <- dbConnect(drv = SQLite(), dbname= df_full_path)
RSQLite::dbListTables(football_db)
## [1] "df_model" "game_info" "game_market_values"
## [4] "game_numbers" "look-up-teams" "sqlite_stat1"
## [7] "sqlite_stat4"
df_whowin <- tbl(football_db, "game_numbers") %>%
select(id_game, FTR) %>%
collect()
RSQLite::dbDisconnect(football_db)
# bind dataframes
df_models_probs_1x2 <- bind_rows(df_market_probs, df_MLE_sim_probs, df_MCMC_sim_probs)
# join column
df_models_probs_1x2 <- df_models_probs_1x2 %>%
inner_join(df_whowin)
## Joining, by = "id_game"
# Compute the prob
df_models_probs_1x2 <- df_models_probs_1x2 %>%
mutate(likelihood = if_else(FTR == "H", H,
if_else(FTR == "A", A, D))) %>%
mutate(deviance = -2*log(likelihood))
df_comparison_models <- df_models_probs_1x2 %>% group_by(type) %>%
summarise(deviance = sum(deviance)) %>%
mutate(ddev = round(deviance - min(deviance),3)) %>%
mutate(bayes_factor = round(exp(-0.5*ddev),3)) %>%
mutate(probs_model = round(bayes_factor/sum(bayes_factor),3))
print(df_comparison_models)
## # A tibble: 3 x 5
## type deviance ddev bayes_factor probs_model
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 market 1364. 0 1 1
## 2 mcmc 1645. 282. 0 0
## 3 mle 1653. 289. 0 0
Assuming the efficient-market hypothesis, the average prices across all bookmakers should reflect the true probability distribution. So let's compute the distance between our distributions and this supposed true distribution: the Kullback-Leibler divergence between each model and the market probabilities.
\[D_{KL}(p \,\|\, q) = \sum_i p_i \log\frac{p_i}{q_i}\] where \(i\) ranges over the support of the random variable, \(p\) is the true probability and \(q\) is the model probability.
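For example, with made-up 1X2 distributions a small divergence means the model probabilities are close to the market's:
p <- c(H = 0.50, D = 0.27, A = 0.23)  # "true" (market) probabilities
q <- c(H = 0.45, D = 0.30, A = 0.25)  # model probabilities
sum(p * log(p / q))                   # per-game KL divergence, ~0.005 here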
compute_KL_divergence <- function(df_models_probs, str_q_model, str_p_model ){
df_models_probs <- df_models_probs %>%
select(id_game, H, D, A, type)
df_q_model <- df_models_probs %>%
filter(type == str_q_model) %>%
select(H, D, A)
df_p_model <- df_models_probs %>%
filter(type == str_p_model) %>%
select(H, D, A)
rowwise_crossEntropy <- function(i, df_q_model, df_p_model){
# get a vector with the distribution p
p <- df_p_model %>%
slice(i) %>%
unlist(use.names = FALSE)
# get a vector with the distribution q
q <- df_q_model %>%
slice(i) %>%
unlist(use.names = FALSE)
log_p <- log(p + 1e-24) # Stability
log_q <- log(q + 1e-24) # Stability
# Despite the helper's name, this is the per-game KL divergence KL(p || q)
div_kl_i <- sum(p*(log_p - log_q))
return(data_frame(div_kl_i))
}
df_KL <- future_map_dfr(1:nrow(df_p_model), rowwise_crossEntropy, df_q_model, df_p_model)
df_KL <- df_KL %>% mutate(type = str_q_model)
return(df_KL)
}
df_KL_market_mcmc <- compute_KL_divergence(df_models_probs_1x2, str_q_model = "mcmc", str_p_model = "market")
df_KL_market_mle <- compute_KL_divergence(df_models_probs_1x2, str_q_model = "mle", str_p_model = "market")
# df_KL_market_mcmc %>% ggplot(aes(entropy_i)) + geom_density()
df_KL_models <- bind_rows(df_KL_market_mcmc, df_KL_market_mle)
df_KL_models %>% group_by(type) %>%
summarise(divergence = sum(div_kl_i), avg_divergence = mean(div_kl_i))
## # A tibble: 2 x 3
## type divergence avg_divergence
## <chr> <dbl> <dbl>
## 1 mcmc 102. 0.135
## 2 mle 105. 0.138