Libraries
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
##
## stamp
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
Load data and set theme
### Work from the home directory so the relative data path below resolves.
setwd("~/")
### Use the light ggplot2 theme for every plot drawn after this point.
theme_set(theme_light())
### Read the match results, parsing the "date" column as a YYYY-MM-DD date.
results <- read_csv(
  file = "ROI/results.csv",
  col_types = cols(
    date = col_date(format = "%Y-%m-%d")
  )
)
Process data
### Add three derived columns to the raw results:
###   home_result - "win", "lose" or "draw" from the home team's point of view
###   by          - goal difference (home score minus away score)
###   year        - numeric year parsed from the "date" column
processed <- mutate(
  results,
  home_result = case_when(
    home_score > away_score ~ "win",
    home_score < away_score ~ "lose",
    home_score == away_score ~ "draw"
  ),
  by = home_score - away_score,
  year = as.numeric(format(date, "%Y"))
)
Create list of all teams
### One-column table ("team") holding every distinct team name that appears
### as either a home side or an away side.
### NOTE(review): the object is called `list`, the same name as base::list();
### kept as-is because the loop that follows reads `list$team`.
list <- processed %>%
  ### keep only the two team-name columns
  select(home_team, away_team) %>%
  ### stack both columns into a single "team" column ("key" records the source)
  gather(key = "key", value = "team") %>%
  ### drop repeated names, keeping the first occurrence of each
  distinct(team)
Assign names to subsetted teams based on list of all teams. First attempt at loop
### For every team name, create a variable with that name holding all of the
### matches in which the team played, either at home or away.
for (team in list$team) {
  ### the pipeline is passed as an argument (not piped into assign) so that
  ### assign() still creates the variable in the environment running the loop
  assign(
    x = team,
    value = processed %>%
      filter(home_team == team | away_team == team)
  )
}
There has been a general year-on-year increase in the number of football games. The 1990s saw the biggest upturn in games per year. Recently, the trend has leveled off.
### Line chart of the number of games played in each year.
processed %>%
  ### one row per year, with the number of rows (games) in column "n"
  count(year) %>%
  ggplot(mapping = aes(x = year, y = n)) +
  geom_line(size = 1, colour = "#F8766D") +
  ### a tick mark every ten years
  scale_x_continuous(breaks = seq(from = 1870, to = 2020, by = 10)) +
  labs(x = "Year", y = "Number of games played")
World Wars 1 and 2 each saw a dip in the number of games played per year. World Cup years result in fewer games being played.
### Years in which a World Cup was held. No tournament took place between
### 1938 and 1950. The sequence runs through 2022 so that the 2018 and 2022
### tournaments are flagged as well; a year with no games in the data simply
### matches no row.
wc_years <- c(1930, 1934, 1938, seq(1950, 2022, 4))
### Table with one row per year:
###   year  - calendar year
###   n     - number of rows (games) in that year
###   is_wc - TRUE when the year is in wc_years, FALSE otherwise
wc_graph <- processed %>%
  ### count(year) returns an ungrouped table, so wc_graph does not carry a
  ### leftover per-year grouping into later steps
  count(year) %>%
  mutate(is_wc = year %in% wc_years)
### Games per year as a line, with World Cup years marked by green points and
### the years 1914, 1918, 1939 and 1945 (the World Wars) marked by blue
### vertical lines.
ggplot(data = wc_graph, mapping = aes(x = year, y = n)) +
  ### same line as the previous graph
  geom_line(size = 1, colour = "#F8766D") +
  ### points are drawn only for rows with is_wc = TRUE, so this layer needs
  ### its own "data ="
  geom_point(
    data = filter(wc_graph, is_wc),
    colour = "#7CAE00",
    size = 3
  ) +
  geom_vline(
    xintercept = c(1914, 1918, 1939, 1945),
    color = "#619CFF",
    size = 1
  ) +
  scale_x_continuous(breaks = seq(from = 1870, to = 2020, by = 10)) +
  labs(x = "Year", y = "Number of games played")
The 1920s saw the debut of a large number of teams. The 1950s and 1960s also saw a lot of new teams. There has continued to be a steady number of new teams in recent years as well.
### Every distinct (team, year) pair in which a team played at home.
firsthomegame <- processed %>%
  transmute(team = home_team, year) %>%
  distinct()
### Every distinct (team, year) pair in which a team played away.
firstawaygame <- processed %>%
  transmute(team = away_team, year) %>%
  distinct()
### Debut year per team: stack both tables, sort by year, then keep only the
### first (earliest) row for each team.
firstgame <- bind_rows(firstawaygame, firsthomegame) %>%
  arrange(year) %>%
  distinct(team, .keep_all = TRUE)
### Bar chart of how many teams played their first game in each year.
firstgame %>%
  ### one row per year, with the number of debuting teams in column "n"
  count(year) %>%
  ggplot(mapping = aes(x = year, y = n)) +
  geom_col(colour = "black", fill = "#F8766D") +
  scale_x_continuous(breaks = seq(from = 1870, to = 2020, by = 10)) +
  scale_y_continuous(breaks = seq(from = 0, to = 10, by = 1)) +
  labs(x = "Year", y = "Number of debut teams per year")
England and Scotland were the first two teams. The US was also an early adopter. Newer teams include the Czech Republic and South Sudan.
### Debut years for a hand-picked set of teams. Each team is listed once
### (the original vector repeated "Germany", which had no effect on the
### %in% match).
firstgameselect <- firstgame %>%
  filter(
    team %in% c(
      "England", "Brazil", "Germany", "Republic of Ireland", "Hungary",
      "Scotland", "Wales", "Croatia", "Argentina", "South Sudan",
      "France", "Italy", "Uruguay", "United States", "Czech Republic"
    )
  ) %>%
  ### turn team into a factor whose levels are ordered by debut year,
  ### descending, so the plot axis follows debut order
  mutate(team = fct_reorder(team, year, .desc = TRUE))
### Dot plot of debut year for the selected teams, one colour per team and
### no legend; coord_flip() puts the teams on the vertical axis.
ggplot(
  data = firstgameselect,
  mapping = aes(x = team, y = year, colour = team)
) +
  geom_point(show.legend = FALSE) +
  scale_y_continuous(breaks = seq(from = 1870, to = 2020, by = 10)) +
  labs(x = "Team", y = "Debut") +
  coord_flip()
Brazil have been consistently good. Hungary were great in the 1950s.
Filter for England
### All games involving England, with "team_result" giving the outcome from
### England's point of view ("win", "lose" or "draw").
England <- processed %>%
  filter(home_team == "England" | away_team == "England") %>%
  mutate(
    team_result = case_when(
      ### a draw is a draw for both sides
      home_result == "draw" ~ "draw",
      ### England at home: the home result is England's result
      home_team == "England" & home_result == "win" ~ "win",
      home_team == "England" & home_result == "lose" ~ "lose",
      ### England away: the home result is reversed
      away_team == "England" & home_result == "lose" ~ "win",
      away_team == "England" & home_result == "win" ~ "lose"
    )
  )
Filter for Ireland
### All games involving the Republic of Ireland, with "team_result" giving
### the outcome from Ireland's point of view ("win", "lose" or "draw").
`Republic of Ireland` <- processed %>%
  filter(
    home_team == "Republic of Ireland" | away_team == "Republic of Ireland"
  ) %>%
  mutate(
    team_result = case_when(
      ### a draw is a draw for both sides
      home_result == "draw" ~ "draw",
      ### Ireland at home: the home result is Ireland's result
      home_team == "Republic of Ireland" & home_result == "win" ~ "win",
      home_team == "Republic of Ireland" & home_result == "lose" ~ "lose",
      ### Ireland away: the home result is reversed
      away_team == "Republic of Ireland" & home_result == "lose" ~ "win",
      away_team == "Republic of Ireland" & home_result == "win" ~ "lose"
    )
  )
Filter for Brazil
### All games involving Brazil, with "team_result" giving the outcome from
### Brazil's point of view ("win", "lose" or "draw").
Brazil <- processed %>%
  filter(home_team == "Brazil" | away_team == "Brazil") %>%
  mutate(
    team_result = case_when(
      ### a draw is a draw for both sides
      home_result == "draw" ~ "draw",
      ### Brazil at home: the home result is Brazil's result
      home_team == "Brazil" & home_result == "win" ~ "win",
      home_team == "Brazil" & home_result == "lose" ~ "lose",
      ### Brazil away: the home result is reversed
      away_team == "Brazil" & home_result == "lose" ~ "win",
      away_team == "Brazil" & home_result == "win" ~ "lose"
    )
  )
Filter for Hungary
### All games involving Hungary, with "team_result" giving the outcome from
### Hungary's point of view ("win", "lose" or "draw").
Hungary <- processed %>%
  filter(home_team == "Hungary" | away_team == "Hungary") %>%
  mutate(
    team_result = case_when(
      ### a draw is a draw for both sides
      home_result == "draw" ~ "draw",
      ### Hungary at home: the home result is Hungary's result
      home_team == "Hungary" & home_result == "win" ~ "win",
      home_team == "Hungary" & home_result == "lose" ~ "lose",
      ### Hungary away: the home result is reversed
      away_team == "Hungary" & home_result == "lose" ~ "win",
      away_team == "Hungary" & home_result == "win" ~ "lose"
    )
  )
Graph select teams
### Stacked bar chart of the share of wins, losses and draws per year for the
### Republic of Ireland.
`Republic of IrelandTimeline` <- `Republic of Ireland` %>%
  ### group by year and count each result, giving up to 3 rows per year
  ### (win/lose/draw)
  group_by(year) %>%
  count(team_result) %>%
  ### share of each result within its year: n divided by that year's total
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(year, percent, fill = team_result)) +
  geom_col() +
  ### colours are matched to results by name, so each result keeps its colour
  ### even when one of the three outcomes never occurs in the data
  scale_fill_manual(
    values = c(draw = "grey", lose = "tomato", win = "yellowgreen")
  ) +
  ### same x range on every team's chart
  xlim(1870, 2020) +
  labs(y = "% of games", title = "Ireland") +
  ### show the y axis as percentages (scales::percent)
  scale_y_continuous(labels = percent)
### Stacked bar chart of the share of wins, losses and draws per year for
### Brazil.
BrazilTimeline <- Brazil %>%
  ### group by year and count each result, giving up to 3 rows per year
  group_by(year) %>%
  count(team_result) %>%
  ### share of each result within its year: n divided by that year's total
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(year, percent, fill = team_result)) +
  geom_col() +
  ### colours are matched to results by name, so each result keeps its colour
  ### even when one of the three outcomes never occurs in the data
  scale_fill_manual(
    values = c(draw = "grey", lose = "tomato", win = "yellowgreen")
  ) +
  ### same x range on every team's chart
  xlim(1870, 2020) +
  labs(y = "% of games", title = "Brazil") +
  ### show the y axis as percentages (scales::percent)
  scale_y_continuous(labels = percent)
### Stacked bar chart of the share of wins, losses and draws per year for
### England. The variable name (including its extra "l") is kept because the
### final plot_grid() call refers to it.
EnglandlTimeline <- England %>%
  ### group by year and count each result, giving up to 3 rows per year
  group_by(year) %>%
  count(team_result) %>%
  ### share of each result within its year: n divided by that year's total
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(year, percent, fill = team_result)) +
  geom_col() +
  ### colours are matched to results by name, so each result keeps its colour
  ### even when one of the three outcomes never occurs in the data
  scale_fill_manual(
    values = c(draw = "grey", lose = "tomato", win = "yellowgreen")
  ) +
  ### same x range on every team's chart
  xlim(1870, 2020) +
  labs(y = "% of games", title = "England") +
  ### show the y axis as percentages (scales::percent)
  scale_y_continuous(labels = percent)
### Stacked bar chart of the share of wins, losses and draws per year for
### Hungary. The variable name (including its extra "l") is kept because the
### final plot_grid() call refers to it.
HungarylTimeline <- Hungary %>%
  ### group by year and count each result, giving up to 3 rows per year
  group_by(year) %>%
  count(team_result) %>%
  ### share of each result within its year: n divided by that year's total
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(year, percent, fill = team_result)) +
  geom_col() +
  ### colours are matched to results by name, so each result keeps its colour
  ### even when one of the three outcomes never occurs in the data
  scale_fill_manual(
    values = c(draw = "grey", lose = "tomato", win = "yellowgreen")
  ) +
  ### same x range on every team's chart
  xlim(1870, 2020) +
  labs(y = "% of games", title = "Hungary") +
  ### show the y axis as percentages (scales::percent)
  scale_y_continuous(labels = percent)
### Stack the four team timelines in a single column, in this order:
### Republic of Ireland, Brazil, England, Hungary.
plot_grid(
  `Republic of IrelandTimeline`,
  BrazilTimeline,
  EnglandlTimeline,
  HungarylTimeline,
  ncol = 1
)