Load

Libraries

library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor

Load data and set theme

### Read the match results from ROI/results.csv in the user's home directory.
### The path is spelled out (with "~" expanded) instead of calling setwd(), so
### the session's working directory is left untouched.
### The date column is parsed as a Date using ISO year-month-day format.
results <- read_csv(
  path.expand("~/ROI/results.csv"),
  col_types = cols(date = col_date(format = "%Y-%m-%d"))
)

### Use ggplot2's light theme for every plot that follows
theme_set(theme_light())

Process

Process data

### Add three derived columns to every game:
###   home_result - "win" / "lose" / "draw" from the home team's point of view
###   by          - goal difference (home goals minus away goals)
###   year        - calendar year of the game, parsed from the date column
processed <- results %>%
  mutate(
    home_result = case_when(
      home_score > away_score ~ "win",
      home_score < away_score ~ "lose",
      home_score == away_score ~ "draw"
    ),
    by = home_score - away_score,
    year = as.numeric(format(date, "%Y"))
  )

Create list of all teams

### One-column tibble (`team`) holding every team that appears as either a
### home or an away side. Home teams are scanned first, then away teams, and
### each team is kept at its first appearance.
### NOTE: the object name shadows base::list; it is kept because later code
### reads `list$team`. Calls such as list(...) still resolve to the function.
list <- tibble(
  team = unique(c(processed$home_team, processed$away_team))
)

Assign names to subsetted teams based on list of all teams. First attempt at loop

### For every team, create a variable named after the team that holds all of
### the games in which that team played, home or away
for (team in list$team) {
  assign(
    x = team,
    value = processed %>%
      filter(home_team == team | away_team == team)
  )
}

Analysis

Number of games per year

There has been a general increase in the number of football games played year on year. The 1990s saw the biggest upturn in games per year. Recently the trend has leveled off.

### Line chart of the number of games played in each year
processed %>%
  ### one row per year, n = number of games in that year
  count(year) %>%
  ggplot(aes(x = year, y = n)) +
  geom_line(colour = "#F8766D", size = 1) +
  labs(x = "Year", y = "Number of games played") +
  ### label the x axis every ten years
  scale_x_continuous(breaks = seq(1870, 2020, 10))

Impact of world cups and world wars on games played

World Wars 1 and 2 each saw a dip in the number of games played per year. World Cup years result in fewer games being played.

### Numeric vector of World Cup years: the three pre-war tournaments, then
### every fourth year from 1950. No tournament was held in 1942 or 1946.
### The sequence runs to 2018 so the most recent tournament inside the plotted
### range (the axes extend to 2020) is included.
wc_years <- c(1930, 1934, 1938, seq(1950, 2018, 4))


### Games per year, flagged by whether the year was a World Cup year.
### Columns: year, n (number of games), is_wc (TRUE when year is in wc_years).
### count(year) returns an ungrouped table, so the flag is computed in one
### vectorised pass and wc_graph is not left grouped by year.
wc_graph <- processed %>%
  count(year) %>%
  mutate(is_wc = year %in% wc_years)
  
### Games per year, with World Cup years marked and the world wars outlined
wc_graph %>%
  ggplot(aes(x = year, y = n)) +
  ### yearly totals, as in the previous graph
  geom_line(colour = "#F8766D", size = 1) +
  ### points only for the rows flagged as World Cup years; this layer needs
  ### its own data argument because it draws a subset of the table
  geom_point(data = filter(wc_graph, is_wc), size = 3, colour = "#7CAE00") +
  ### vertical lines at the start and end years of both world wars
  geom_vline(
    xintercept = c(1914, 1918, 1939, 1945),
    colour = "#619CFF",
    size = 1
  ) +
  labs(x = "Year", y = "Number of games played") +
  scale_x_continuous(breaks = seq(1870, 2020, 10))

Debut teams

The 1920s saw the debut of a large number of teams. The 1950s and 1960s also saw a lot of new teams. There has continued to be a steady number of new teams in recent years as well.

### Every distinct (team, year) pair in which a team played at home
firsthomegame <- processed %>%
  select(team = home_team, year) %>%
  distinct()

### Every distinct (team, year) pair in which a team played away
firstawaygame <- processed %>%
  select(team = away_team, year) %>%
  distinct()

### Stack both tables, sort by year and keep the earliest row for each team,
### which gives each team's debut year
firstgame <- bind_rows(firstawaygame, firsthomegame) %>%
  arrange(year) %>%
  distinct(team, .keep_all = TRUE)


### Bar chart of how many teams played their first game in each year
firstgame %>%
  ### one row per year, n = number of teams debuting in that year
  count(year) %>%
  ggplot(aes(x = year, y = n)) +
  geom_col(fill = "#F8766D", colour = "black") +
  labs(x = "Year", y = "Number of debut teams per year") +
  scale_y_continuous(breaks = seq(0, 10, 1)) +
  scale_x_continuous(breaks = seq(1870, 2020, 10))

Let's look at the debuts of select teams in international football

England and Scotland were the first two teams. The US was also an early adopter. Newer teams include the Czech Republic and South Sudan.

### Debut years for a hand-picked set of teams. Each team is listed once
### (the original vector repeated "Germany", which had no effect on %in%).
### team becomes a factor ordered by debut year, descending.
firstgameselect <- firstgame %>%
  filter(
    team %in% c(
      "England", "Brazil", "Germany", "Republic of Ireland", "Hungary",
      "Scotland", "Wales", "Croatia", "Argentina", "South Sudan",
      "France", "Italy", "Uruguay", "United States", "Czech Republic"
    )
  ) %>%
  mutate(team = fct_reorder(team, year, .desc = TRUE))

### Dot plot of debut year per selected team, drawn with teams on the
### vertical axis
firstgameselect %>%
  ggplot(aes(x = team, y = year, colour = team)) +
  ### colour only separates the teams visually, so the legend is hidden
  geom_point(show.legend = FALSE) +
  scale_y_continuous(breaks = seq(1870, 2020, 10)) +
  labs(x = "Team", y = "Debut") +
  coord_flip()

Win rates of select teams

Brazil have been consistently good. Hungary were great in the 50s.

Filter for England

### All games involving England, with team_result giving the outcome from
### England's point of view ("win" / "lose" / "draw")
England <- processed %>%
  filter(home_team == "England" | away_team == "England") %>%
  mutate(
    team_result = case_when(
      ### a draw is a draw for both sides
      home_result == "draw" ~ "draw",
      ### at home the team's result is the home result
      home_team == "England" ~ home_result,
      ### away from home the result is the mirror image of the home result
      away_team == "England" & home_result == "win" ~ "lose",
      away_team == "England" & home_result == "lose" ~ "win"
    )
  )

Filter for Ireland

### All games involving the Republic of Ireland, with team_result giving the
### outcome from Ireland's point of view ("win" / "lose" / "draw")
`Republic of Ireland` <- processed %>%
  filter(
    home_team == "Republic of Ireland" | away_team == "Republic of Ireland"
  ) %>%
  mutate(
    team_result = case_when(
      ### a draw is a draw for both sides
      home_result == "draw" ~ "draw",
      ### at home the team's result is the home result
      home_team == "Republic of Ireland" ~ home_result,
      ### away from home the result is the mirror image of the home result
      away_team == "Republic of Ireland" & home_result == "win" ~ "lose",
      away_team == "Republic of Ireland" & home_result == "lose" ~ "win"
    )
  )

Filter for Brazil

### All games involving Brazil, with team_result giving the outcome from
### Brazil's point of view ("win" / "lose" / "draw")
Brazil <- processed %>%
  filter(home_team == "Brazil" | away_team == "Brazil") %>%
  mutate(
    team_result = case_when(
      ### a draw is a draw for both sides
      home_result == "draw" ~ "draw",
      ### at home the team's result is the home result
      home_team == "Brazil" ~ home_result,
      ### away from home the result is the mirror image of the home result
      away_team == "Brazil" & home_result == "win" ~ "lose",
      away_team == "Brazil" & home_result == "lose" ~ "win"
    )
  )

Filter for Hungary

### All games involving Hungary, with team_result giving the outcome from
### Hungary's point of view ("win" / "lose" / "draw")
Hungary <- processed %>%
  filter(home_team == "Hungary" | away_team == "Hungary") %>%
  mutate(
    team_result = case_when(
      ### a draw is a draw for both sides
      home_result == "draw" ~ "draw",
      ### at home the team's result is the home result
      home_team == "Hungary" ~ home_result,
      ### away from home the result is the mirror image of the home result
      away_team == "Hungary" & home_result == "win" ~ "lose",
      away_team == "Hungary" & home_result == "lose" ~ "win"
    )
  )

Graph select teams

### Stacked bar chart of the Republic of Ireland's yearly share of wins,
### draws and losses
`Republic of IrelandTimeline` <- `Republic of Ireland` %>%
  ### one row per year/result combination, n = number of games (up to three
  ### rows per year: win / lose / draw)
  count(year, team_result) %>%
  ### share of that year's games: n divided by the year's total
  group_by(year) %>%
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(year, percent, fill = team_result)) +
  geom_col() +
  ### colours are named per result so each category keeps its colour even
  ### when one of the three results never occurs in the data
  scale_fill_manual(
    values = c(draw = "grey", lose = "tomato", win = "yellowgreen")
  ) +
  ### same x range on every timeline so the stacked plots line up
  xlim(1870, 2020) +
  labs(y = "% of games", title = "Ireland") +
  ### `percent` here is the scales label formatter, not the data column
  scale_y_continuous(labels = percent)
  
### Stacked bar chart of Brazil's yearly share of wins, draws and losses
BrazilTimeline <- Brazil %>%
  ### one row per year/result combination, n = number of games
  count(year, team_result) %>%
  ### share of that year's games: n divided by the year's total
  group_by(year) %>%
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(year, percent, fill = team_result)) +
  geom_col() +
  ### colours are named per result so each category keeps its colour even
  ### when one of the three results never occurs in the data
  scale_fill_manual(
    values = c(draw = "grey", lose = "tomato", win = "yellowgreen")
  ) +
  ### same x range on every timeline so the stacked plots line up
  xlim(1870, 2020) +
  labs(y = "% of games", title = "Brazil") +
  ### `percent` here is the scales label formatter, not the data column
  scale_y_continuous(labels = percent)

### Stacked bar chart of England's yearly share of wins, draws and losses.
### The object name (including its stray "l") is kept because the final
### plot_grid() call refers to it.
EnglandlTimeline <- England %>%
  ### one row per year/result combination, n = number of games
  count(year, team_result) %>%
  ### share of that year's games: n divided by the year's total
  group_by(year) %>%
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(year, percent, fill = team_result)) +
  geom_col() +
  ### colours are named per result so each category keeps its colour even
  ### when one of the three results never occurs in the data
  scale_fill_manual(
    values = c(draw = "grey", lose = "tomato", win = "yellowgreen")
  ) +
  ### same x range on every timeline so the stacked plots line up
  xlim(1870, 2020) +
  labs(y = "% of games", title = "England") +
  ### `percent` here is the scales label formatter, not the data column
  scale_y_continuous(labels = percent)

### Stacked bar chart of Hungary's yearly share of wins, draws and losses.
### The object name (including its stray "l") is kept because the final
### plot_grid() call refers to it.
HungarylTimeline <- Hungary %>%
  ### one row per year/result combination, n = number of games
  count(year, team_result) %>%
  ### share of that year's games: n divided by the year's total
  group_by(year) %>%
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(year, percent, fill = team_result)) +
  geom_col() +
  ### colours are named per result so each category keeps its colour even
  ### when one of the three results never occurs in the data
  scale_fill_manual(
    values = c(draw = "grey", lose = "tomato", win = "yellowgreen")
  ) +
  ### same x range on every timeline so the stacked plots line up
  xlim(1870, 2020) +
  labs(y = "% of games", title = "Hungary") +
  ### `percent` here is the scales label formatter, not the data column
  scale_y_continuous(labels = percent)


### Stack the four team timelines vertically in a single column
plot_grid(
  `Republic of IrelandTimeline`,
  BrazilTimeline,
  EnglandlTimeline,
  HungarylTimeline,
  ncol = 1
)