Introduction

Soccer championship Euro 2016

My weekend project, to make myself familiar with the R package ggplot2: Analysis of an Euro 2016 Betting Game. Predictions were made, just for fun, by a small group of players from my workplace in Germany. There were ~300 players predicting ~50 games.

In this context, ‘Player’ means ‘Participant in the Betting Game’, not ‘Professional Soccer Player’.

The dataset is not public, because there are full names of real people inside. Maybe I’ll anonymize it at a later date.

Data preparation

library(MASS) # smooth method rlm
library(tidyr)
library(dplyr)
library(lubridate)
library(purrr)  # functional programming
library(stringr)
library(readr)
library(ggplot2)
library(ggrepel) # better labelling of datapoints
#library(gridExtra) # plots side-by-side
library(knitr) #kable

Current working directory:

getwd()
## [1] "/home/knut/code/git/_my/R_stuff/euro2016/gfzock"

Read in the data files with the final bets:

em16bets1 <- readLines("EM2016_compiled_final.csv")

Detect lines marking games:

em16games_ind <- str_detect(pattern = "^#",string = em16bets1)

Read in games-data from lists:

em16games <- tbl_df(data=em16bets1[em16games_ind]) %>% 
        separate(value, into=str_trim(c("mark",
                                "game_nr",
                                "team_a","team_b","game_date")),
                 sep= ";") %>%
        mutate(game_nr = as.integer(game_nr),
                game_date = ymd_hms(game_date), 
               mark=NULL)    %>%
        mutate(stage = ifelse(game_nr > 36 , "knockout", "group"))   

glimpse(em16games)
## Observations: 51
## Variables: 5
## $ game_nr   <int> 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, ...
## $ team_a    <chr> "POR", "GER", "POR", "FRA", "GER", "WAL", "POL", "EN...
## $ team_b    <chr> "FRA", "FRA", "WAL", "ISL", "ITA", "BEL", "POR", "IS...
## $ game_date <time> 2016-07-10 21:01:01, 2016-07-07 21:01:01, 2016-07-0...
## $ stage     <chr> "knockout", "knockout", "knockout", "knockout", "kno...

Read in bets:

em16_empty <- str_detect(pattern = "^$",string = em16bets1)
# remove empty lines
em16bets2 <- em16bets1[!em16_empty]
em16_split <- which(str_detect(pattern = "^#",string = em16bets2))

# find lengths of prospective groups
grnames=em16bets2[em16_split]
lengths <- em16_split[2:length(em16_split)] - em16_split[1:(length(em16_split)-1)]
.length=c()
.length[grnames] = c(lengths, length(em16bets2) - em16_split[length(em16_split)]+1 ) 

# Take the group names to the split list: 
# by changing rep() call to replicate the names, 
# then remove duplicated first element
em16bets3 <- map(split(em16bets2, rep.int(names(.length), .length)), function(x){x[-1]})

# example
#head(em16bets3[["#;51;POR;FRA;2016-07-10 21:01:01"]], 2)


# Remove long names, replace with unique numbers of games
names(em16bets3) <- str_match(names(em16bets3), "^#;(\\d+)")[,2]

Convert bets into list of data frames with meaningful column names

char2star <- function(x){
       paste0(
               str_sub(x,1,1),
               str_replace_all(str_sub(x,2,100), ".", "*")
               )
}

to_df <- function(x){
        tbl_df(data=x) %>% 
        separate(value, into=str_trim(c("player_last_name",
                                "player_first_name",
                                "team_a_goals","team_b_goals","is_joker")),
                 sep= ";")    %>%
        mutate(team_a_goals = as.integer(team_a_goals),
                team_b_goals = as.integer(team_b_goals) ,
               is_joker = ifelse(is_joker == "x", TRUE, FALSE))
}
em16bets4 <- map(em16bets3, to_df)

#warnings()
#head(em16bets4, 1)

em16bets5 <-lapply(names(em16bets4), function(x){
        em16bets4[[x]]["game_nr"] <- as.integer(x)
        em16bets4[[x]]
}) 
#head(em16bets5, 1) # has 1 more variable
em16bets6 <- do.call(bind_rows, em16bets5)  

#rm(em16bets1, em16bets3, em16bets4) # no longer necessary


em16bets6 <- em16bets6 %>%
        mutate(tendency= ifelse(team_a_goals == team_b_goals, "draw", ifelse(team_a_goals > team_b_goals, "team_a", "team_b")))

#glimpse(em16bets6) #

Extract Unique Players

players <- em16bets6 %>% 
        select(player_last_name, player_first_name) %>%
        mutate(player_last_name=str_trim(player_last_name), player_first_name=str_trim(player_first_name)) %>%
        unique() %>%
        arrange(player_last_name, player_first_name)

There are 286 players.

Read in: list of players who won prize money

players_winners <- read_csv("gfzock_prize-money-winners.csv") %>%
        separate(col=Player, into = c("player_last_name", "player_first_name"), sep=",", remove = TRUE) 

colnames(players_winners) <- tolower(make.names(colnames(players_winners)))

players_winners$player_first_name <- map_chr(players_winners$player_first_name, str_trim)

players_winners$player_last_name <- map_chr(players_winners$player_last_name, str_trim)


players_winners <- players_winners %>%
        rename( fraction_pct = portion...., amount_euro=prize..euro.)

There are 36 players who got prize money.

Join them with players table:

(We made sure by data cleanup that all winning entries can be mapped to the players table.)

players_winners$is_winner <- TRUE

players <- players %>%
        left_join(players_winners,  by=c( "player_last_name", "player_first_name")) %>%
        mutate(is_winner=! is.na(is_winner)) 

Create joint table of players and their bets

games_players <- em16bets6 %>%
        inner_join(em16games, by=c("game_nr"))

Read in final games scores

Read in final games scores as they were played.

em16games_final  <- read_tsv("real_games_scores_full.txt", col_names = FALSE)
names(em16games_final) <- c("team_a_long", "team_b_long", "result")
em16games_final <- em16games_final %>%
        separate(col=result, into = c("team_a_goals", "team_b_goals"), sep = ":") %>%
        mutate(team_a_goals= as.numeric(team_a_goals),team_b_goals= as.numeric(team_b_goals)) %>%
        mutate(team_a_name= toupper(str_sub(team_a_long, 1,3)), 
               team_b_name = toupper(str_sub( team_b_long, 1,3))) %>%
        select(team_a_long, team_a_name, team_a_goals, team_b_goals,  team_b_name, team_b_long)

# add draws
em16games_final <- em16games_final %>% mutate(is_draw = ifelse(team_a_goals == team_b_goals, TRUE, FALSE))

good.tmp    <- unique(c(em16games$team_a, em16games$team_b))
updthis.tmp <-    unique(c(em16games_final$team_a_name, em16games_final$team_b_name)) 
(updthis <- setdiff(updthis.tmp, good.tmp))
## [1] "SWI" "ICE" "NOR" "SLO" "ROM" "SPA" "AUS" "IRE"
(good <- setdiff(good.tmp, updthis.tmp))
## [1] "SUI" "ISL" "NIR" "SVK" "ROU" "ESP" "AUT" "IRL"
update_em16games_final <- function(x,y){
        em16games_final[em16games_final$team_a_name == x, "team_a_name"] <<- y
        em16games_final[em16games_final$team_b_name == x, "team_b_name"] <<- y
} 
# update in-place
walk2(updthis, good, update_em16games_final)


# add game_nr and date
em16games_final <- em16games_final %>%
        inner_join(em16games, by=c("team_a_name"="team_a", "team_b_name"="team_b")) %>%
        arrange(desc(game_nr))

# add tendency
em16games_final <- em16games_final %>% 
        mutate(tendency= ifelse(team_a_goals == team_b_goals, "draw", ifelse(team_a_goals > team_b_goals, "team_a", "team_b")))

num_draws <- em16games_final %>%
        filter(is_draw == TRUE) %>%
        count() %>%
        as.integer()

        
glimpse(em16games_final)
## Observations: 51
## Variables: 11
## $ team_a_long  <chr> "Portugal", "Germany", "Portugal", "France", "Ger...
## $ team_a_name  <chr> "POR", "GER", "POR", "FRA", "GER", "WAL", "POL", ...
## $ team_a_goals <dbl> 1, 0, 2, 5, 2, 3, 1, 1, 2, 0, 3, 2, 0, 1, 1, 0, 0...
## $ team_b_goals <dbl> 0, 2, 0, 2, 1, 1, 2, 2, 0, 4, 0, 1, 1, 0, 2, 1, 1...
## $ team_b_name  <chr> "FRA", "FRA", "WAL", "ISL", "ITA", "BEL", "POR", ...
## $ team_b_long  <chr> "France", "France", "Wales", "Iceland", "Italy", ...
## $ is_draw      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, ...
## $ game_nr      <int> 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 3...
## $ game_date    <time> 2016-07-10 21:01:01, 2016-07-07 21:01:01, 2016-0...
## $ stage        <chr> "knockout", "knockout", "knockout", "knockout", "...
## $ tendency     <chr> "team_a", "team_b", "team_a", "team_a", "team_a",...

Analysis of games played, bets made

Player participation rate

The next plot shows that the betting activity of participants varied quite a bit during the tournament. Fewer people placed bets at the end of the tournament. But also at the beginning many people forgot to enter scores. Or there were data prepocessing errors made by me. Or the data dump was incomplete. (I had no time to investigate).

player_participation <- em16bets6 %>% 
        select(game_nr) %>%
        group_by(game_nr) %>%
        count() %>%
        data.frame() %>%
        inner_join(em16games, by=c("game_nr"))

ggplot(player_participation, aes(game_nr, n, color=stage)) +
        geom_point(size=1, shape=21) +
        geom_line(alpha=0.5) +
        geom_smooth(method="rlm", se=FALSE) +
        ylab("# of bets issued per game") +
        ggtitle("Betting activity during the Euro 2016")

It also shows that many people did not bother to place a bet for the last game, the final. Maybe only motivated, knowledgeable players lasted through the end?

Is there a bias of position (“Home team” vs “Away Team”)?

ggplot(games_players, aes(team_a_goals, color="Team A")) +
               geom_density(color="blue") +
        geom_density(data=games_players, aes(team_b_goals, color="Team B"), color="red") +
        ggtitle("Scores of 'home' team (blue) vs 'away' team (red)")

Yes. The ‘home’ team seems to be preferred by the participants. They think the home teams will, on average, score more goals than the ‘away’ team.

By the way, during the Euro 2016, all teams were actually ‘Away’ teams, except for the French team, of course. I’ll just stick to the naming convention.

Visualisation of aggregates scores

Prepare dataset for creating 1d-histograms of games.

#colnames(games_players)
games_players_plottable <- games_players %>%
        select(game_nr, team_a, team_a_goals, team_b_goals, team_b ) %>%
        gather(key, value, -game_nr, -team_a, -team_b) %>%
        unite(col = "game_name",team_a, team_b, sep =  "-")

ggplot(games_players_plottable, aes( value, fill=key, color=key,show.legend = FALSE)) +
        geom_bar(alpha = 1 / 3, position = "nudge",show.legend = FALSE) +
        scale_color_manual(values=c( "blue", "red")) +
        scale_fill_manual(values=c( "blue", "red")) +
        xlab("Number of goals scored (predicted)") +
        ylab("Count/Frequency") +
        ggtitle("'Home' team is blue, 'Away' team is red") +
        #geom_vline(aes(xintercept=mean(value, na.rm=TRUE), width=0.5), show.legend = FALSE) +
        facet_wrap(~game_name, ncol=4) 

Prepare dataset for creating 2d-histograms of games.

games_players_plot2d <- games_players %>%
        select(game_nr, team_a, team_a_goals, team_b_goals, team_b ) %>%
        unite(col = "game_name",team_a, team_b, sep =  "-", remove = FALSE)

# temporary dataframe needed for plaing labels inside the plots
df_team <- data.frame(
        # coordinates for upper left , and lower right corner
        y = rep(0, 51),
        x = rep(6, 51),
        #team_a = em16games[order(em16games$game_nr),"team_a"],
        #team_b = em16games[order(em16games$game_nr),"team_b"],
        game_nr = sort(em16games$game_nr)
) 

games_players_plot2d <- games_players_plot2d %>%
        inner_join(df_team, by="game_nr")

A better visualisation

Create array of xy-plots. Circles show predicted number of goals, for the respective opponents of each game.

ggplot(games_players_plot2d, aes(team_a_goals, team_b_goals, show.legend = FALSE)) +
        geom_count(alpha = 0.3, show.legend = FALSE) + 
#        geom_density2d(alpha = 0.3, show.legend = FALSE) + 
        geom_abline(slope = 1, colour = "red", size = 0.5, alpha = 0.2,show.legend = FALSE) +
        geom_text(aes(x, y, label = team_a, size=130), vjust = "inward", check_overlap = TRUE, show.legend = FALSE) +
        geom_text(aes(y, x, label = team_b, size=130), hjust = "inward", check_overlap = TRUE, show.legend = FALSE) +
        ggtitle("Euro 2016: Number of goals scored - all predictions") +
        facet_wrap(~game_nr, ncol = 3)

These plots above show more clearly the distribution, in particular for the knockout roud, where more penalty shootouts have been predicted.

‘Joker’ Games

Which games were preferred for ‘Jokers’?

Participants can pick a few ‘Joker games’, doubling the betting scores.

jokers_top <- games_players %>%
  dplyr::filter(is_joker == TRUE) %>%
  unite(col = "game_name",team_a, team_b, sep =  "-")  %>%
  select(game_name, is_joker) %>%
  group_by(game_name) %>%
  count(is_joker) 
ggplot(jokers_top, aes(reorder(game_name, -n),n)) +
        geom_bar(stat="identity", color="grey", fill="skyblue") +
        geom_text(aes(label =  n), vjust = "inward", size=2) +
        theme(axis.text.y = element_text(size=10)) + 
        xlab("Euro 2016 Game") +
        ylab("Count/Frequency (Picked as Joker n times)") +
        coord_flip() +
        ggtitle("Most popular soccer matches for 'joker' bets")

Of course, Germany’s games (from group stage as well as K.O. stage) were preferred by the players. Additionally there is some bias towards teams participating in the quarter- and semi-finals. Choices are limited for these games, all participants have to pick from these. Therefore, Portugal’s games stand out.

However, games from early in the tournament (FRA-ROU, FRA-ALB) were also picked relatively often.

Surprisingly, games of the former Euro 2012 champion, Spain, were not particularly popular as ‘joker’ games. Maybe the community didn’t expect them to win clearly, as Spain’s group was tough.

Winning Players of the betting game

Which players most often bet for draws?

players_prefer_draw <- games_players %>%
        filter(team_a_goals == team_b_goals) %>%
        select(player_last_name, player_first_name, game_nr) %>%
        group_by(player_last_name, player_first_name) %>%
        count(sort=TRUE) 

n_draw_median <- median(players_prefer_draw$n)

There were 11 draws in the Euro 2016, out of 36 matches in the group phase. There were 51 matches.

The median value for “number of matches predicted as draw” is n=6. Few people (n=40) have predicted at least 11 draws, though:

Is ‘predicting a lot of draws’ a winning strategy?

players_prefer_draw <- players_prefer_draw %>%
        left_join(players, by=c("player_last_name", "player_first_name")) %>%
        select(player_last_name, player_first_name, n_draws_pred=n, money_rank=rank) %>%
        mutate(last_name=char2star(player_last_name),
               first_name=char2star(player_first_name)) %>% 
        filter(! is.na(money_rank)) 

players_prefer_draw_winners <- players_prefer_draw %>% 
#        select(5,6,3,4) %>% 
        filter(n_draws_pred > n_draw_median) 

Yes. Of the 36 prize money winners, n=26 (72%) have predicted more than the median number of draws.

kable(players_prefer_draw_winners %>% data.frame() %>% select(5,6,3,4))
last_name first_name n_draws_pred money_rank
E* M***** 15 14
M******** J**** 14 19
N***** D** 14 2
N***** V************** 14 3
M***** F****** 13 30
N***** V******** 13 21
K***** R****** 12 10
M*** C******** 11 30
B******* T***** 10 17
K***** F**** 10 17
R**** T****** 10 15
S*********** A***** 10 30
U***** K****** 10 30
B********* F**** 9 1
F***** A**** 9 4
G* H**** 9 28
H**** D*** 9 30
K**** M******* 9 23
P*** S******* 9 11
R**** D***** 9 30
S****** L**** 9 7
W******** I*** 9 22
K******* F**** 7 23
L******* D**** 7 5
M**** K****** 7 11
S********** B******* 7 5

Analysis of games

Which games where predicted successfully?

em16bets7 <- em16bets6 %>% 
        inner_join(em16games_final, by=c("team_a_goals", "team_b_goals", "game_nr")) 

corr_pred_by <- em16bets7  %>%
        mutate(score = paste0("(",team_a_goals, ":", team_b_goals, ")")) %>% 
        select(game_nr, team_a_name, team_b_name, score, is_draw) %>%
        group_by(game_nr, team_a_name, team_b_name, score, is_draw) %>%
        summarise(corr_pred_by=n()) %>%
        arrange(desc(corr_pred_by)) %>%
        mutate(is_draw = ifelse(is_draw == TRUE, "draw", "")) 

There were 1618 games were predicted correctly by these many unique players. Success rate: 1618/13474 = 12%

Which game scores did nobody predict correctly?

surprise_results_ids <- setdiff(em16games$game_nr,corr_pred_by$game_nr)
surprise_results <- em16games_final[em16games_final$game_nr %in% surprise_results_ids,]  %>%
        mutate(score = paste0("(",team_a_goals, ":", team_b_goals, ")")) %>% 
        select(game_nr, team_a_name, team_b_name, score, is_draw) %>%
        mutate(is_draw = ifelse(is_draw == TRUE, "draw", "")) %>%
        mutate(corr_pred_by=0) 
                       
kable(surprise_results)
game_nr team_a_name team_b_name score is_draw corr_pred_by
48 FRA ISL (5:2) 0
42 HUN BEL (0:4) 0
33 HUN POR (3:3) draw 0
28 RUS WAL (0:3) 0
corr_pred_by <- bind_rows(corr_pred_by, surprise_results)

What about getting the tendency right?

em16bets8 <- em16bets6 %>%
        select(-team_a_goals, -team_b_goals) %>%
        inner_join(em16games_final, by=c("tendency", "game_nr"))

tendency_pred_by <- em16bets8  %>%
        mutate(score = paste0("(",team_a_goals, ":", team_b_goals, ")")) %>% 
        select(game_nr, team_a_name, team_b_name, score, is_draw) %>%
        group_by(game_nr, team_a_name, team_b_name, score, is_draw) %>%
        summarise(tend_pred_by=n()) %>%
        arrange(desc(tend_pred_by)) %>%
        mutate(is_draw = ifelse(is_draw == TRUE, "draw", ""))

corr_pred_by <- tendency_pred_by %>%
        inner_join(corr_pred_by, by=c("game_nr", "team_a_name", "team_b_name", "score", "is_draw"))
kable(corr_pred_by) # %>% filter(tend_pred_by %in% 80:200))
game_nr team_a_name team_b_name score is_draw tend_pred_by corr_pred_by
15 FRA ALB (2:0) 275 118
1 FRA ROU (2:1) 265 49
7 GER UKR (2:0) 264 88
29 NIR GER (0:1) 262 29
21 ESP TUR (3:0) 249 18
8 ESP CZE (1:0) 243 27
41 GER SVK (3:0) 241 16
6 POL NIR (1:0) 234 59
40 FRA IRL (2:1) 222 55
2 ALB SUI (0:1) 220 75
38 WAL NIR (1:0) 213 84
16 ENG WAL (2:1) 205 83
22 BEL IRL (3:0) 204 4
30 UKR POL (0:1) 202 68
19 ITA SWE (1:0) 197 87
47 GER ITA (2:1) 192 81
35 SWE BEL (0:1) 190 45
37 SUI POL (1:2) 178 56
42 HUN BEL (0:4) 174 0
48 FRA ISL (5:2) 156 0
5 TUR CRO (0:1) 146 41
28 RUS WAL (0:3) 120 0
45 POL POR (1:2) 120 57
43 ITA ESP (2:0) 99 6
49 POR WAL (2:0) 99 10
23 ISL HUN (1:1) draw 96 69
10 BEL ITA (0:2) 91 14
51 POR FRA (1:0) 88 17
14 ROU SUI (1:1) draw 82 73
44 ENG ISL (1:2) 82 30
25 SUI FRA (0:0) draw 81 15
34 ISL AUT (2:1) 76 18
20 CZE CRO (2:2) draw 75 7
3 WAL SVK (2:1) 69 30
9 IRL SWE (1:1) draw 62 41
50 GER FRA (0:2) 59 5
39 CRO POR (0:1) 58 14
13 RUS SVK (1:2) 56 28
46 WAL BEL (3:1) 52 1
32 CZE TUR (0:2) 50 10
27 SVK ENG (0:0) draw 46 9
18 GER POL (0:0) draw 44 1
4 ENG RUS (1:1) draw 41 29
33 HUN POR (3:3) draw 41 0
24 POR AUT (0:0) draw 37 3
26 ROU ALB (0:1) 34 17
17 UKR NIR (0:2) 33 2
11 AUT HUN (0:2) 30 3
12 POR ISL (1:1) draw 23 17
31 CRO ESP (2:1) 10 7
36 ITA IRL (0:1) 6 2

Plot all of them together

Note that in the first plot below, the prediction frequencies in columns ‘tend_pred_by’ and ‘corr_pred_by’ have been ranked twice, independently of each other. In the second plot below, the prediction freqencies from rows on the table above plot on vertical lines in the plot.

corr_pred_by.2 <- corr_pred_by %>% 
        arrange(desc(corr_pred_by)) 

corr_pred_by.2$pos <- 1:nrow(corr_pred_by.2)

surprise_results_tend <- corr_pred_by.2 %>% filter(tend_pred_by - corr_pred_by <=20)

ggplot(corr_pred_by, aes(seq_along(game_nr),tend_pred_by)) +
        geom_point(color="gold") +
        geom_point(data=corr_pred_by.2, aes(seq_along(game_nr),corr_pred_by), color="darkgreen") +
        xlab("Games ranked by prediction accuracy") +
        ylab("# of correct bets") +
        ggtitle("Yellow: Tendency right\nGreen: Score right")

#devtools::install_github("slowkow/ggrepel")

ggplot(corr_pred_by.2, aes(seq_along(game_nr),corr_pred_by)) +
        geom_point(color="darkgreen") +
        geom_point(data=corr_pred_by.2, aes(seq_along(game_nr),tend_pred_by), color="gold") +
        geom_point(data=surprise_results_tend, aes(pos,tend_pred_by), color="black", size=5,shape=21) +
        xlab("Games ranked by prediction accuracy") +
        ylab("# of correct bets") +
        ggtitle("Yellow: Tendency right\nGreen: Score right\nBlack encircled: Surprising games") +
        geom_text_repel(data=corr_pred_by.2,
                        aes(seq_along(game_nr),tend_pred_by, label=str_c(team_a_name, team_b_name, sep =  ":")),  size=2  )

Closing words about surprising games

In the plot above, a few games are marked with a black circle. This table shows these marked games that were surprising: Few participants got the tendency right. Necessarily, this implies that they had to get the absolute score almost right.

kable(surprise_results_tend)
game_nr team_a_name team_b_name score is_draw tend_pred_by corr_pred_by pos
14 ROU SUI (1:1) draw 82 73 8
4 ENG RUS (1:1) draw 41 29 22
26 ROU ALB (0:1) 34 17 28
12 POR ISL (1:1) draw 23 17 29
31 CRO ESP (2:1) 10 7 38
36 ITA IRL (0:1) 6 2 45