Import data
# Read the match data from the Excel workbook
games <- read_excel(path = "../00_data/MyData_charts.xlsx")
# Show the imported tibble
games
## # A tibble: 988 × 15
## year country city stage home_team away_team home_score away_score outcome
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 1930 Uruguay Montev… Grou… France Mexico 4 1 H
## 2 1930 Uruguay Montev… Grou… Belgium United S… 0 3 A
## 3 1930 Uruguay Montev… Grou… Brazil Yugoslav… 1 2 A
## 4 1930 Uruguay Montev… Grou… Peru Romania 1 3 A
## 5 1930 Uruguay Montev… Grou… Argentina France 1 0 H
## 6 1930 Uruguay Montev… Grou… Chile Mexico 3 0 H
## 7 1930 Uruguay Montev… Grou… Bolivia Yugoslav… 0 4 A
## 8 1930 Uruguay Montev… Grou… Paraguay United S… 0 3 A
## 9 1930 Uruguay Montev… Grou… Uruguay Peru 1 0 H
## 10 1930 Uruguay Montev… Grou… Argentina Mexico 6 3 H
## # ℹ 978 more rows
## # ℹ 6 more variables: win_conditions <chr>, winning_team <chr>,
## # losing_team <chr>, date <dttm>, month <chr>, dayofweek <chr>
Select columns
# Team and score columns, ranked from highest to lowest home score
games %>%
  # Drop rows without a home team: the data contains a row with no teams and
  # a home_score of 177, which would otherwise rank first
  filter(!is.na(home_team)) %>%
  select(home_team, home_score, away_team, away_score) %>%
  # Arrange by home score, largest first
  arrange(desc(home_score))
## # A tibble: 988 × 4
## home_team home_score away_team away_score
## <chr> <dbl> <chr> <dbl>
## 1 <NA> 177 <NA> NA
## 2 Hungary 10 El Salvador 1
## 3 Hungary 9 South Korea 0
## 4 Yugoslavia 9 Zaire 0
## 5 Germany 8 Saudi Arabia 0
## 6 Italy 7 United States 1
## 7 Brazil 7 Sweden 1
## 8 West Germany 7 Turkey 2
## 9 France 7 Paraguay 3
## 10 Portugal 7 North Korea 0
## # ℹ 978 more rows
Add columns
games %>%
  # Goal difference: home score minus away score
  mutate(difference = home_score - away_score) %>%
  # Keep the columns from home_team through away_score, plus the new
  # difference column and the winning team
  select(home_team:away_score, difference, winning_team)
## # A tibble: 988 × 6
## home_team away_team home_score away_score difference winning_team
## <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 France Mexico 4 1 3 France
## 2 Belgium United States 0 3 -3 United States
## 3 Brazil Yugoslavia 1 2 -1 Yugoslavia
## 4 Peru Romania 1 3 -2 Romania
## 5 Argentina France 1 0 1 Argentina
## 6 Chile Mexico 3 0 3 Chile
## 7 Bolivia Yugoslavia 0 4 -4 Yugoslavia
## 8 Paraguay United States 0 3 -3 United States
## 9 Uruguay Peru 1 0 1 Uruguay
## 10 Argentina Mexico 6 3 3 Argentina
## # ℹ 978 more rows
# Compute the goal difference and keep only that column
games %>%
  # Home score minus away score
  mutate(difference = home_score - away_score) %>%
  # Drop everything except the new column
  select(difference)
## # A tibble: 988 × 1
## difference
## <dbl>
## 1 3
## 2 -3
## 3 -1
## 4 -2
## 5 1
## 6 3
## 7 -4
## 8 -3
## 9 1
## 10 3
## # ℹ 978 more rows
Group by
games %>%
  # Keep only rows where both the home team and the home score are present
  filter(!is.na(home_team), !is.na(home_score)) %>%
  # Narrow to the two columns of interest
  select(home_team, home_score) %>%
  # Group the rows by home team
  group_by(home_team)
## # A tibble: 900 × 2
## # Groups: home_team [81]
## home_team home_score
## <chr> <dbl>
## 1 France 4
## 2 Belgium 0
## 3 Brazil 1
## 4 Peru 1
## 5 Argentina 1
## 6 Chile 3
## 7 Bolivia 0
## 8 Paraguay 0
## 9 Uruguay 1
## 10 Argentina 6
## # ℹ 890 more rows
Summarize by group
games %>%
  # Remove rows with a missing team or score so they do not form an NA group
  # or turn a team's mean into NA
  filter(!is.na(home_team), !is.na(home_score)) %>%
  # Group by home team
  group_by(home_team) %>%
  # Calculate the average home score per home team
  summarise(avg_home_score = mean(home_score)) %>%
  # Sort alphabetically by team
  arrange(home_team)
## # A tibble: 82 × 2
## home_team year
## <chr> <dbl>
## 1 Algeria 0.833
## 2 Angola 0
## 3 Argentina 1.9
## 4 Australia 1.17
## 5 Austria 1.5
## 6 Belgium 1.73
## 7 Bolivia 0.25
## 8 Bosnia and Herzegovina 3
## 9 Brazil 2.11
## 10 Bulgaria 1.31
## # ℹ 72 more rows
Plot
games %>%
  # Remove rows with a missing team or score
  filter(!is.na(home_team), !is.na(home_score)) %>%
  # Total home goals per team. Stacked identity bars on the raw rows showed
  # this same total implicitly; computing it here makes the plotted quantity
  # explicit and draws a single bar per team.
  group_by(home_team) %>%
  summarise(total_home_goals = sum(home_score)) %>%
  # Plot one bar per home team
  ggplot(mapping = aes(x = home_team, y = total_home_goals)) +
  geom_col() +
  labs(x = "Home team", y = "Total home goals") +
  # Rotate and shrink the team names so they fit along the x-axis
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5))
