In this document, we are going to explore the European Football dataset available in SQLite on Kaggle. First, let’s have a look at the available data:
con <- dbConnect(SQLite(), dbname="database.sqlite" )
# dbListTables(con)
player <- tbl_df(dbGetQuery(con,"SELECT * FROM player"))
player_stats <- tbl_df(dbGetQuery(con,"SELECT * FROM player_attributes"))
country <- tbl_df(dbGetQuery(con,"SELECT * FROM Country"))
league <- tbl_df(dbGetQuery(con,"SELECT * FROM League"))
match <- tbl_df(dbGetQuery(con,"SELECT * FROM Match"))
team <- tbl_df(dbGetQuery(con,"SELECT * FROM Team"))
team_a <- tbl_df(dbGetQuery(con,"SELECT * FROM Team_Attributes"))
We have
match <- match %>% rename(H=home_team_api_id, A=away_team_api_id,
GH=home_team_goal, GA=away_team_goal)
team <- team %>% mutate(H=team_api_id, A=team_api_id)
match$H <- left_join(match, team, by="H") %>% select(team_long_name) %>% pull()
match$A <- left_join(match, team, by="A") %>% select(team_long_name) %>% pull()
First, let’s check if we find any evidence for home advantage. Indeed, 46% of the matches were Home wins.
library(knitr)
# EDA --------------
# Possible distributions: Poisson, Gamma, Rayleigh
round(prop.table(table(ifelse(match$GH > match$GA, "Home", ifelse(match$GH<match$GA, "Away", "Draw")))),2)
##
## Away Draw Home
## 0.29 0.25 0.46
Also, the Home team scored more goals on average.
X <- match %>% dplyr::select(GH,GA) %>% gather()
kable(X %>% group_by(key) %>% summarise(Average.Goals=mean(value),
Variance=var(value)))
| key | Average.Goals | Variance |
|---|---|---|
| GA | 1.160938 | 1.304416 |
| GH | 1.544594 | 1.682619 |
The following plot shows that the Poisson distribution fits the number of goals scored very well.
n <- nrow(X)
X <- rbind(cbind(type="Goals",X),
cbind(type="Poisson",
key = rep(c("GH","GA"),each=n/2),
value = rpois(n,rep(c(1.545,1.161), each=n/2))))
X1 <- X %>% dplyr::filter(value != 10)
ggplot(X1, aes(x=value, fill=type)) +
geom_bar(color="Black", alpha=0.4, position="identity") +
facet_grid(key~.)
# fitdistr(match$GH, "Poisson")
# fitdistr(match$GA, "Poisson")
The following plot shows the most common results.
In 69.2 % of the cases, no team scored more than two goals.
results <- match %>% transmute(Result = paste0(GH,"-",GA)) %>% pull()
results <- factor(results, levels= names(sort(table(results), decreasing = T)))
ggplot(data.frame(results), aes(x=results)) +
geom_bar() +
theme(axis.text.x = element_text(angle=90))
The teams don’t have ratings, the players do. Here, we ignore the time dimension and only consider the highest rating a player achieved during his career.
First, let’s find out the all time top10 players:
left_join(
player_stats %>% group_by(player_api_id) %>% summarise(rating=max(overall_rating)) %>% arrange(-rating),
player %>% select(player_api_id, player_name),
by="player_api_id") %>% head(10) %>% kable
| player_api_id | rating | player_name |
|---|---|---|
| 30981 | 94 | Lionel Messi |
| 30717 | 93 | Gianluigi Buffon |
| 30829 | 93 | Wayne Rooney |
| 30893 | 93 | Cristiano Ronaldo |
| 39854 | 92 | Xavi Hernandez |
| 39989 | 92 | Gregory Coupet |
| 30626 | 91 | Thierry Henry |
| 30627 | 91 | John Terry |
| 30657 | 91 | Iker Casillas |
| 30723 | 91 | Alessandro Nesta |
Now, let’s create a team rating based on the average rating of its starting line-up. Here are the top-10 teams:
# Team Ratings
X2 <- match[complete.cases(match[,56:77]),]
# names(X2[,56:77])
plyrs <- X2[,c(7,56:77)] %>% gather(key, player_api_id, -match_api_id)
rting <- player_stats %>% group_by(player_api_id) %>% summarise(rating = max(overall_rating))
X3 <- left_join(plyrs,rting,by="player_api_id")
X3 <- na.omit(X3)
Hind <- as.vector(sapply(X3$key, function(x) grepl("home",x)))
Aind <- as.vector(sapply(X3$key, function(x) grepl("away",x)))
X4 <- left_join(X3[Hind,] %>% group_by(match_api_id) %>% summarise(Hrating=mean(rating)),
X3[Aind,] %>% group_by(match_api_id) %>% summarise(Arating=mean(rating)),
by="match_api_id")
X5 <- right_join(match,X4, by="match_api_id")
X5$rel <- X5$Hrating - X5$Arating
X5$Gdiff <- (X5$GH - X5$GA)
topteams <- X5 %>%
select(H, Hrating) %>%
group_by(H) %>%
summarise(avg= mean(Hrating)) %>%
arrange(-avg) %>%
head(n=10)
kable(topteams)
| H | avg |
|---|---|
| FC Barcelona | 86.51416 |
| Real Madrid CF | 86.38113 |
| Chelsea | 85.43818 |
| FC Bayern Munich | 85.23626 |
| Manchester United | 84.14161 |
| Juventus | 84.13511 |
| Manchester City | 83.78052 |
| Atlético Madrid | 83.26635 |
| Inter | 83.02601 |
| Milan | 82.99392 |
plotd <- X5 %>% filter(H %in% topteams$H) %>%
group_by(.dots=c("H","season")) %>%
summarise(avg=mean(Hrating))
ggplot(plotd,aes(x=season,y=avg, group=H, col=H)) + geom_point(size=4,alpha=0.9) +
geom_line(size=2, alpha=0.6) +
scale_color_brewer(type = "div",palette = 4,direction = 1) +
theme_hc()
The following histogram shows that most of the team ratings are between 64 and 85 (95% CI).
hist(X5$Hrating)
# hist(X5$rel)
# hist(X5$GH)
# hist(X5$Gdiff)
The following plot shows that Goal difference positively depends on the relative difference in team ratings. E.g.: If the home team has a higher rating than the away team, it is more likely to have a positive goal-difference (-> Win)
qplot(data=X5, x=rel, y=jitter(Gdiff), xlab = "Home-Rating minus Away-Rating",ylab="Goal difference")
# ggplot(X5, aes(x=factor(Gdiff), y=rel)) + geom_boxplot()
# ggplot(X5, aes(x=cut(rel,10), y=Gdiff)) + geom_boxplot()
Let’s predict which team wins according to the relative rating of each team (no draws):
X6 <- X5 %>% dplyr::select(GH,GA,rel)
outcome <- ifelse(X6[,1] > X6[,2], "H", ifelse(X6[,1] < X6[,2], "A", "D"))
# naive <- ifelse(X6[,3] > 0, "H", ifelse(X6[,3] < -1, "A","D"))
naive <- ifelse(X6[,3] > 1, "H", "A")
table(outcome,naive)
## naive
## outcome A H
## A 4826 1340
## D 3351 2047
## H 4136 5674
Accuracy = 49.1 %. Note we don’t get Draws in this case.
Now, we try to model the data with a poisson regression. We have one regression for Home goals, another for Away goals:
# Modelling
summary(mod1 <- glm(GH ~ rel, data=X5))
##
## Call:
## glm(formula = GH ~ rel, data = X5)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7691 -0.8781 -0.1602 0.7118 8.0848
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.552141 0.008347 185.94 <2e-16 ***
## rel 0.093134 0.001692 55.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.489106)
##
## Null deviance: 36337 on 21373 degrees of freedom
## Residual deviance: 31825 on 21372 degrees of freedom
## AIC: 69171
##
## Number of Fisher Scoring iterations: 2
summary(mod2 <- glm(GA ~ rel, data=X5))
##
## Call:
## glm(formula = GA ~ rel, data = X5)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2247 -0.8434 -0.1643 0.6477 6.8902
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.179068 0.007474 157.76 <2e-16 ***
## rel -0.072614 0.001515 -47.94 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.19371)
##
## Null deviance: 28255 on 21373 degrees of freedom
## Residual deviance: 25512 on 21372 degrees of freedom
## AIC: 64445
##
## Number of Fisher Scoring iterations: 2
X6 <- data.frame(GH=X5$GH, GA=X5$GA, GHp=round(mod1$fit), GAp=round(mod2$fit))
# Exactly the proportion, but corrected for Home advantage
# X6 <- cbind(GH=X5$GH, GA=X5$GA, GHp= (mod1$fit)-0.375, GAp= (mod2$fit))
outcome <- ifelse(X6[,1] > X6[,2], "H", ifelse(X6[,1] < X6[,2], "A", "D"))
predict <- ifelse(X6[,3] > X6[,4], "H", ifelse(X6[,3] < X6[,4], "A", "D"))
table(outcome,predict)
## predict
## outcome A D H
## A 2204 1956 2006
## D 985 1576 2837
## H 740 2097 6973
Accuracy = 50.3 %
Note that this is exactly the same as the first prediction but corrected for home advantage.
Let’s only consider the matches where the difference between ratings is at least 5 (31.3 % of all matches)
noeq <- abs(X5$rel)>5
table(outcome[noeq],predict[noeq])
##
## A H
## A 1922 293
## D 792 569
## H 594 2528
Accuracy = 66.4 %
As expected, the results get more accurate if we only consider matches witch a big difference in team ratings.
Now let’s compare simulations. We are going to create a simulation based on random [H,D,A] outcomes and a simulation where goals follow a poisson r.v. with lambda relative to team ratings.
The Poisson simulation performs slightly better. However, the results are disappointing.
outcome <- ifelse(X6[,1] > X6[,2], "H", ifelse(X6[,1] < X6[,2], "A", "D"))
table(outcome)
## outcome
## A D H
## 6166 5398 9810
n <- length(outcome)
rep <- 1000
sensit <- 0.7
sim <- matrix(ncol=rep,sample(x = c("A","D","H"),
prob = prop.table(table(outcome)),
size = n*rep,
repl = T))
Gd <- rpois(n*rep,lambda = max(0 , 1.54 + sensit*X5$rel)) -
rpois(n*rep,lambda = max(0 , 1.16 - sensit*X5$rel))
sim2 <- matrix(ncol = rep,byrow = F,
ifelse(Gd > 0, "H", ifelse(Gd<0, "A","D")))
sim <- data.frame(Random = apply(sim,2, function(x) sum(outcome == x)) / n,
Poisson= apply(sim2,2, function(x) sum(outcome == x)) / n) %>%
gather()
ggplot(sim,aes(x=value, fill=key)) + geom_histogram(bins = 50, position = "identity") + scale_x_continuous(labels=percent)
Finally, we want to find out if left foot players are generally better players. No - they are not!
players <-
player_stats %>%
group_by(player_api_id) %>%
top_n(n = 1, wt = date) %>%
as.data.frame()
# names(players)
players$preferred_foot <- factor(players$preferred_foot)
players <- na.omit(players)
prop.table(table(players$preferred_foot))
##
## left right
## 0.2445725 0.7554275
ggplot(players, aes(x=preferred_foot, y=overall_rating)) + geom_jitter()
To see an interactive simulation of the Europa League, go to http://stvrd.shinyapps.io/uefa