1 Introduction

In this document, we are going to explore the European Football dataset available in SQLite on Kaggle. First, let’s have a look at the available data:

con <- dbConnect(SQLite(), dbname="database.sqlite" )
# dbListTables(con)

player       <- tbl_df(dbGetQuery(con,"SELECT * FROM player"))
player_stats <- tbl_df(dbGetQuery(con,"SELECT * FROM player_attributes"))
country      <- tbl_df(dbGetQuery(con,"SELECT * FROM Country"))
league       <- tbl_df(dbGetQuery(con,"SELECT * FROM League"))
match        <- tbl_df(dbGetQuery(con,"SELECT * FROM Match"))
team         <- tbl_df(dbGetQuery(con,"SELECT * FROM Team"))
team_a       <- tbl_df(dbGetQuery(con,"SELECT * FROM Team_Attributes"))

We have

  • 11 leagues
  • 299 teams
  • 11060 players
  • 25979 matches
match <- match %>% rename(H=home_team_api_id, A=away_team_api_id,
                          GH=home_team_goal,  GA=away_team_goal)
team <- team %>% mutate(H=team_api_id, A=team_api_id)

match$H <- left_join(match, team, by="H") %>% select(team_long_name) %>% pull()
match$A <- left_join(match, team, by="A") %>% select(team_long_name) %>% pull()

2 Home vs. Away

First, let’s check if we find any evidence for home advantage. Indeed, 46% of the matches were Home wins.

library(knitr)
# EDA --------------
# Possible distributions: Poisson, Gamma, Rayleigh

round(prop.table(table(ifelse(match$GH > match$GA, "Home", ifelse(match$GH<match$GA, "Away", "Draw")))),2)
## 
## Away Draw Home 
## 0.29 0.25 0.46

Also, the Home team scored more goals on average.

X <- match %>% dplyr::select(GH,GA) %>% gather()
kable(X %>% group_by(key) %>% summarise(Average.Goals=mean(value),
                                  Variance=var(value)))
key Average.Goals Variance
GA 1.160938 1.304416
GH 1.544594 1.682619

The following plot shows that the Poisson distribution fits the number of goals scored very well.

n <- nrow(X)
X <- rbind(cbind(type="Goals",X), 
           cbind(type="Poisson",
                 key = rep(c("GH","GA"),each=n/2),
                 value = rpois(n,rep(c(1.545,1.161), each=n/2))))


X1 <- X %>% dplyr::filter(value != 10)
ggplot(X1, aes(x=value, fill=type)) + 
   geom_bar(color="Black", alpha=0.4, position="identity") +
   facet_grid(key~.)

# fitdistr(match$GH, "Poisson")
# fitdistr(match$GA, "Poisson")

The following plot shows the most common results.

In 69.2 % of the cases, no team scored more than two goals.

results <- match %>% transmute(Result = paste0(GH,"-",GA)) %>% pull()
results <- factor(results, levels= names(sort(table(results), decreasing = T)))

ggplot(data.frame(results), aes(x=results)) + 
   geom_bar() +
   theme(axis.text.x = element_text(angle=90))

3 Ratings

The teams don’t have ratings, the players do. Here, we ignore the time dimension and only consider the highest rating a player achieved during his career.

First, let’s find out the all time top10 players:

left_join(
   player_stats %>% group_by(player_api_id) %>% summarise(rating=max(overall_rating)) %>% arrange(-rating),
   player %>% select(player_api_id, player_name),
   by="player_api_id") %>% head(10) %>% kable
player_api_id rating player_name
30981 94 Lionel Messi
30717 93 Gianluigi Buffon
30829 93 Wayne Rooney
30893 93 Cristiano Ronaldo
39854 92 Xavi Hernandez
39989 92 Gregory Coupet
30626 91 Thierry Henry
30627 91 John Terry
30657 91 Iker Casillas
30723 91 Alessandro Nesta

Now, let’s create a team rating based on the average rating of its starting line-up. Here are the top-10 teams:

# Team Ratings
X2 <- match[complete.cases(match[,56:77]),]
# names(X2[,56:77])
plyrs <- X2[,c(7,56:77)] %>% gather(key, player_api_id, -match_api_id)
rting <- player_stats %>% group_by(player_api_id) %>% summarise(rating = max(overall_rating))
X3 <- left_join(plyrs,rting,by="player_api_id")
X3 <- na.omit(X3)

Hind <- as.vector(sapply(X3$key, function(x) grepl("home",x)))
Aind <- as.vector(sapply(X3$key, function(x) grepl("away",x)))

X4 <- left_join(X3[Hind,] %>% group_by(match_api_id) %>% summarise(Hrating=mean(rating)),
                X3[Aind,] %>% group_by(match_api_id) %>% summarise(Arating=mean(rating)),
                by="match_api_id")

X5 <- right_join(match,X4, by="match_api_id") 

X5$rel <- X5$Hrating - X5$Arating
X5$Gdiff <- (X5$GH - X5$GA)

topteams <- X5 %>% 
   select(H, Hrating) %>% 
   group_by(H) %>%
   summarise(avg= mean(Hrating)) %>%
   arrange(-avg) %>%
   head(n=10) 

kable(topteams)
H avg
FC Barcelona 86.51416
Real Madrid CF 86.38113
Chelsea 85.43818
FC Bayern Munich 85.23626
Manchester United 84.14161
Juventus 84.13511
Manchester City 83.78052
Atlético Madrid 83.26635
Inter 83.02601
Milan 82.99392
plotd <- X5 %>% filter(H %in% topteams$H) %>% 
   group_by(.dots=c("H","season")) %>%
   summarise(avg=mean(Hrating))


ggplot(plotd,aes(x=season,y=avg, group=H, col=H)) + geom_point(size=4,alpha=0.9) +
   geom_line(size=2, alpha=0.6) +
   scale_color_brewer(type = "div",palette = 4,direction = 1) +
   theme_hc()

The following histogram shows that most of the team ratings are between 64 and 85 (95% CI).

hist(X5$Hrating)

# hist(X5$rel)
# hist(X5$GH)
# hist(X5$Gdiff)

The following plot shows that Goal difference positively depends on the relative difference in team ratings. E.g.: If the home team has a higher rating than the away team, it is more likely to have a positive goal-difference (-> Win)

qplot(data=X5, x=rel, y=jitter(Gdiff), xlab = "Home-Rating minus Away-Rating",ylab="Goal difference")

# ggplot(X5, aes(x=factor(Gdiff), y=rel)) + geom_boxplot() 
# ggplot(X5, aes(x=cut(rel,10), y=Gdiff)) + geom_boxplot() 

Let’s predict which team wins according to the relative rating of each team (no draws):

X6 <- X5 %>% dplyr::select(GH,GA,rel)

outcome <- ifelse(X6[,1] > X6[,2], "H", ifelse(X6[,1] < X6[,2], "A", "D"))
# naive   <- ifelse(X6[,3] > 0, "H", ifelse(X6[,3] < -1, "A","D"))
naive   <- ifelse(X6[,3] > 1, "H", "A")
table(outcome,naive)
##        naive
## outcome    A    H
##       A 4826 1340
##       D 3351 2047
##       H 4136 5674

Accuracy = 49.1 %. Note we don’t get Draws in this case.

4 Modelling

Now, we try to model the data with a poisson regression. We have one regression for Home goals, another for Away goals:

# Modelling
summary(mod1 <- glm(GH ~ rel, data=X5))
## 
## Call:
## glm(formula = GH ~ rel, data = X5)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7691  -0.8781  -0.1602   0.7118   8.0848  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.552141   0.008347  185.94   <2e-16 ***
## rel         0.093134   0.001692   55.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.489106)
## 
##     Null deviance: 36337  on 21373  degrees of freedom
## Residual deviance: 31825  on 21372  degrees of freedom
## AIC: 69171
## 
## Number of Fisher Scoring iterations: 2
summary(mod2 <- glm(GA ~ rel, data=X5))
## 
## Call:
## glm(formula = GA ~ rel, data = X5)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2247  -0.8434  -0.1643   0.6477   6.8902  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.179068   0.007474  157.76   <2e-16 ***
## rel         -0.072614   0.001515  -47.94   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.19371)
## 
##     Null deviance: 28255  on 21373  degrees of freedom
## Residual deviance: 25512  on 21372  degrees of freedom
## AIC: 64445
## 
## Number of Fisher Scoring iterations: 2
X6 <- data.frame(GH=X5$GH, GA=X5$GA, GHp=round(mod1$fit), GAp=round(mod2$fit))
# Exactly the proportion, but corrected for Home advantage
# X6 <- cbind(GH=X5$GH, GA=X5$GA, GHp=     (mod1$fit)-0.375, GAp=     (mod2$fit))

outcome <- ifelse(X6[,1] > X6[,2], "H", ifelse(X6[,1] < X6[,2], "A", "D"))
predict <- ifelse(X6[,3] > X6[,4], "H", ifelse(X6[,3] < X6[,4], "A", "D"))

table(outcome,predict)
##        predict
## outcome    A    D    H
##       A 2204 1956 2006
##       D  985 1576 2837
##       H  740 2097 6973

Accuracy = 50.3 %

Note that this is exactly the same as the first prediction but corrected for home advantage.

Let’s only consider the matches where the difference between ratings is at least 5 (31.3 % of all matches)

noeq <- abs(X5$rel)>5
table(outcome[noeq],predict[noeq])
##    
##        A    H
##   A 1922  293
##   D  792  569
##   H  594 2528

Accuracy = 66.4 %

As expected, the results get more accurate if we only consider matches witch a big difference in team ratings.

5 Simulations

Now let’s compare simulations. We are going to create a simulation based on random [H,D,A] outcomes and a simulation where goals follow a poisson r.v. with lambda relative to team ratings.

The Poisson simulation performs slightly better. However, the results are disappointing.

outcome <- ifelse(X6[,1] > X6[,2], "H", ifelse(X6[,1] < X6[,2], "A", "D"))
table(outcome)
## outcome
##    A    D    H 
## 6166 5398 9810
n <- length(outcome)
rep <- 1000
sensit <- 0.7

sim <- matrix(ncol=rep,sample(x    = c("A","D","H"),
                      prob = prop.table(table(outcome)),
                      size = n*rep, 
                      repl = T))


Gd <- rpois(n*rep,lambda = max(0 , 1.54 + sensit*X5$rel)) - 
      rpois(n*rep,lambda = max(0 , 1.16 - sensit*X5$rel))
sim2 <- matrix(ncol = rep,byrow = F, 
              ifelse(Gd > 0, "H", ifelse(Gd<0, "A","D")))

sim <- data.frame(Random = apply(sim,2, function(x) sum(outcome == x)) / n,
           Poisson= apply(sim2,2, function(x) sum(outcome == x)) / n) %>%
   gather()



ggplot(sim,aes(x=value, fill=key)) + geom_histogram(bins = 50, position = "identity") + scale_x_continuous(labels=percent) 

Finally, we want to find out if left foot players are generally better players. No - they are not!

players <- 
  player_stats %>% 
  group_by(player_api_id) %>% 
  top_n(n = 1, wt = date) %>%
  as.data.frame()

# names(players)
players$preferred_foot <- factor(players$preferred_foot)
players <- na.omit(players)

prop.table(table(players$preferred_foot))
## 
##      left     right 
## 0.2445725 0.7554275
ggplot(players, aes(x=preferred_foot, y=overall_rating)) + geom_jitter()

6 Going further

To see an interactive simulation of the Europa League, go to http://stvrd.shinyapps.io/uefa