1 Introduction

In this document, we are going to explore the European Football dataset available in SQLite on Kaggle. First, let’s have a look at the available data:

con <- dbConnect(SQLite(), dbname="database.sqlite" )
# dbListTables(con)

player       <- tbl_df(dbGetQuery(con,"SELECT * FROM player"))
player_stats <- tbl_df(dbGetQuery(con,"SELECT * FROM player_attributes"))
country      <- tbl_df(dbGetQuery(con,"SELECT * FROM Country"))
league       <- tbl_df(dbGetQuery(con,"SELECT * FROM League"))
match        <- tbl_df(dbGetQuery(con,"SELECT * FROM Match"))
team         <- tbl_df(dbGetQuery(con,"SELECT * FROM Team"))
team_a       <- tbl_df(dbGetQuery(con,"SELECT * FROM Team_Attributes"))

We have

11 leagues
299 teams
11060 players
25979 matches

match <- match %>% rename(H=home_team_api_id, A=away_team_api_id,
                          GH=home_team_goal,  GA=away_team_goal)
team <- team %>% mutate(H=team_api_id, A=team_api_id)

match$H <- left_join(match, team, by="H") %>% select(team_long_name) %>% pull()
match$A <- left_join(match, team, by="A") %>% select(team_long_name) %>% pull()

2 Home vs. Away

First, let’s check if we find any evidence for home advantage. Indeed, 46% of the matches were Home wins.

library(knitr)
# EDA --------------
# Possible distributions: Poisson, Gamma, Rayleigh

round(prop.table(table(ifelse(match$GH > match$GA, "Home", ifelse(match$GH<match$GA, "Away", "Draw")))),2)

## 
## Away Draw Home 
## 0.29 0.25 0.46

Also, the Home team scored more goals on average.

X <- match %>% dplyr::select(GH,GA) %>% gather()
kable(X %>% group_by(key) %>% summarise(Average.Goals=mean(value),
                                  Variance=var(value)))

key	Average.Goals	Variance
GA	1.160938	1.304416
GH	1.544594	1.682619

The following plot shows that the Poisson distribution fits the number of goals scored very well.

n <- nrow(X)
X <- rbind(cbind(type="Goals",X), 
           cbind(type="Poisson",
                 key = rep(c("GH","GA"),each=n/2),
                 value = rpois(n,rep(c(1.545,1.161), each=n/2))))


X1 <- X %>% dplyr::filter(value != 10)
ggplot(X1, aes(x=value, fill=type)) + 
   geom_bar(color="Black", alpha=0.4, position="identity") +
   facet_grid(key~.)

# fitdistr(match$GH, "Poisson")
# fitdistr(match$GA, "Poisson")

The following plot shows the most common results.

In 69.2 % of the cases, no team scored more than two goals.

results <- match %>% transmute(Result = paste0(GH,"-",GA)) %>% pull()
results <- factor(results, levels= names(sort(table(results), decreasing = T)))

ggplot(data.frame(results), aes(x=results)) + 
   geom_bar() +
   theme(axis.text.x = element_text(angle=90))

3 Ratings

The teams don’t have ratings, the players do. Here, we ignore the time dimension and only consider the highest rating a player achieved during his career.

First, let’s find out the all time top10 players:

left_join(
   player_stats %>% group_by(player_api_id) %>% summarise(rating=max(overall_rating)) %>% arrange(-rating),
   player %>% select(player_api_id, player_name),
   by="player_api_id") %>% head(10) %>% kable

player_api_id	rating	player_name
30981	94	Lionel Messi
30717	93	Gianluigi Buffon
30829	93	Wayne Rooney
30893	93	Cristiano Ronaldo
39854	92	Xavi Hernandez
39989	92	Gregory Coupet
30626	91	Thierry Henry
30627	91	John Terry
30657	91	Iker Casillas
30723	91	Alessandro Nesta

Now, let’s create a team rating based on the average rating of its starting line-up. Here are the top-10 teams:

# Team Ratings
X2 <- match[complete.cases(match[,56:77]),]
# names(X2[,56:77])
plyrs <- X2[,c(7,56:77)] %>% gather(key, player_api_id, -match_api_id)
rting <- player_stats %>% group_by(player_api_id) %>% summarise(rating = max(overall_rating))
X3 <- left_join(plyrs,rting,by="player_api_id")
X3 <- na.omit(X3)

Hind <- as.vector(sapply(X3$key, function(x) grepl("home",x)))
Aind <- as.vector(sapply(X3$key, function(x) grepl("away",x)))

X4 <- left_join(X3[Hind,] %>% group_by(match_api_id) %>% summarise(Hrating=mean(rating)),
                X3[Aind,] %>% group_by(match_api_id) %>% summarise(Arating=mean(rating)),
                by="match_api_id")

X5 <- right_join(match,X4, by="match_api_id") 

X5$rel <- X5$Hrating - X5$Arating
X5$Gdiff <- (X5$GH - X5$GA)

topteams <- X5 %>% 
   select(H, Hrating) %>% 
   group_by(H) %>%
   summarise(avg= mean(Hrating)) %>%
   arrange(-avg) %>%
   head(n=10) 

kable(topteams)

H	avg
FC Barcelona	86.51416
Real Madrid CF	86.38113
Chelsea	85.43818
FC Bayern Munich	85.23626
Manchester United	84.14161
Juventus	84.13511
Manchester City	83.78052
Atlético Madrid	83.26635
Inter	83.02601
Milan	82.99392

plotd <- X5 %>% filter(H %in% topteams$H) %>% 
   group_by(.dots=c("H","season")) %>%
   summarise(avg=mean(Hrating))


ggplot(plotd,aes(x=season,y=avg, group=H, col=H)) + geom_point(size=4,alpha=0.9) +
   geom_line(size=2, alpha=0.6) +
   scale_color_brewer(type = "div",palette = 4,direction = 1) +
   theme_hc()

The following histogram shows that most of the team ratings are between 64 and 85 (95% CI).

hist(X5$Hrating)

# hist(X5$rel)
# hist(X5$GH)
# hist(X5$Gdiff)

The following plot shows that Goal difference positively depends on the relative difference in team ratings. E.g.: If the home team has a higher rating than the away team, it is more likely to have a positive goal-difference (-> Win)

qplot(data=X5, x=rel, y=jitter(Gdiff), xlab = "Home-Rating minus Away-Rating",ylab="Goal difference")

# ggplot(X5, aes(x=factor(Gdiff), y=rel)) + geom_boxplot() 
# ggplot(X5, aes(x=cut(rel,10), y=Gdiff)) + geom_boxplot()

Let’s predict which team wins according to the relative rating of each team (no draws):

X6 <- X5 %>% dplyr::select(GH,GA,rel)

outcome <- ifelse(X6[,1] > X6[,2], "H", ifelse(X6[,1] < X6[,2], "A", "D"))
# naive   <- ifelse(X6[,3] > 0, "H", ifelse(X6[,3] < -1, "A","D"))
naive   <- ifelse(X6[,3] > 1, "H", "A")
table(outcome,naive)

##        naive
## outcome    A    H
##       A 4826 1340
##       D 3351 2047
##       H 4136 5674

Accuracy = 49.1 %. Note we don’t get Draws in this case.

4 Modelling

Now, we try to model the data with a poisson regression. We have one regression for Home goals, another for Away goals:

# Modelling
summary(mod1 <- glm(GH ~ rel, data=X5))

## 
## Call:
## glm(formula = GH ~ rel, data = X5)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7691  -0.8781  -0.1602   0.7118   8.0848  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.552141   0.008347  185.94   <2e-16 ***
## rel         0.093134   0.001692   55.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.489106)
## 
##     Null deviance: 36337  on 21373  degrees of freedom
## Residual deviance: 31825  on 21372  degrees of freedom
## AIC: 69171
## 
## Number of Fisher Scoring iterations: 2

summary(mod2 <- glm(GA ~ rel, data=X5))

## 
## Call:
## glm(formula = GA ~ rel, data = X5)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2247  -0.8434  -0.1643   0.6477   6.8902  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.179068   0.007474  157.76   <2e-16 ***
## rel         -0.072614   0.001515  -47.94   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.19371)
## 
##     Null deviance: 28255  on 21373  degrees of freedom
## Residual deviance: 25512  on 21372  degrees of freedom
## AIC: 64445
## 
## Number of Fisher Scoring iterations: 2

X6 <- data.frame(GH=X5$GH, GA=X5$GA, GHp=round(mod1$fit), GAp=round(mod2$fit))
# Exactly the proportion, but corrected for Home advantage
# X6 <- cbind(GH=X5$GH, GA=X5$GA, GHp=     (mod1$fit)-0.375, GAp=     (mod2$fit))

outcome <- ifelse(X6[,1] > X6[,2], "H", ifelse(X6[,1] < X6[,2], "A", "D"))
predict <- ifelse(X6[,3] > X6[,4], "H", ifelse(X6[,3] < X6[,4], "A", "D"))

table(outcome,predict)

##        predict
## outcome    A    D    H
##       A 2204 1956 2006
##       D  985 1576 2837
##       H  740 2097 6973

Accuracy = 50.3 %

Note that this is exactly the same as the first prediction but corrected for home advantage.

Let’s only consider the matches where the difference between ratings is at least 5 (31.3 % of all matches)

noeq <- abs(X5$rel)>5
table(outcome[noeq],predict[noeq])

##    
##        A    H
##   A 1922  293
##   D  792  569
##   H  594 2528

Accuracy = 66.4 %

As expected, the results get more accurate if we only consider matches witch a big difference in team ratings.

5 Simulations

Now let’s compare simulations. We are going to create a simulation based on random [H,D,A] outcomes and a simulation where goals follow a poisson r.v. with lambda relative to team ratings.

The Poisson simulation performs slightly better. However, the results are disappointing.

outcome <- ifelse(X6[,1] > X6[,2], "H", ifelse(X6[,1] < X6[,2], "A", "D"))
table(outcome)

## outcome
##    A    D    H 
## 6166 5398 9810

n <- length(outcome)
rep <- 1000
sensit <- 0.7

sim <- matrix(ncol=rep,sample(x    = c("A","D","H"),
                      prob = prop.table(table(outcome)),
                      size = n*rep, 
                      repl = T))


Gd <- rpois(n*rep,lambda = max(0 , 1.54 + sensit*X5$rel)) - 
      rpois(n*rep,lambda = max(0 , 1.16 - sensit*X5$rel))
sim2 <- matrix(ncol = rep,byrow = F, 
              ifelse(Gd > 0, "H", ifelse(Gd<0, "A","D")))

sim <- data.frame(Random = apply(sim,2, function(x) sum(outcome == x)) / n,
           Poisson= apply(sim2,2, function(x) sum(outcome == x)) / n) %>%
   gather()



ggplot(sim,aes(x=value, fill=key)) + geom_histogram(bins = 50, position = "identity") + scale_x_continuous(labels=percent)

Finally, we want to find out if left foot players are generally better players. No - they are not!

players <- 
  player_stats %>% 
  group_by(player_api_id) %>% 
  top_n(n = 1, wt = date) %>%
  as.data.frame()

# names(players)
players$preferred_foot <- factor(players$preferred_foot)
players <- na.omit(players)

prop.table(table(players$preferred_foot))

## 
##      left     right 
## 0.2445725 0.7554275

ggplot(players, aes(x=preferred_foot, y=overall_rating)) + geom_jitter()

6 Going further

To see an interactive simulation of the Europa League, go to http://stvrd.shinyapps.io/uefa

Football

Steve Riedo