library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library("ggplot2")

raw_results = read_csv("results.csv")
## Rows: 43451 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): home_team, away_team, tournament, city, country
## dbl  (2): home_score, away_score
## lgl  (1): neutral
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

First let’s check what are the most repeated matches in history

For doing that, I need to get the same order of the teams (no matter which team was home or away) So I use the pmin function to get the first team alphabetically, and pmax to get the last team alphabetically That way I have the same teams order in each row

results = raw_results %>%
  mutate(alphabetical_team_1=pmin(home_team,away_team),
         alphabetical_team_2=pmax(home_team,away_team),
         alphabetical_teams=paste(alphabetical_team_1,alphabetical_team_2))

Once I have the teams in the same order, I will group by match teams, count and order by what are the most repeated matches

most_played_matches = results %>%
  group_by(alphabetical_teams) %>%
  summarise(number_of_matches = n()) %>%
  arrange(-number_of_matches)


print(most_played_matches)
## # A tibble: 6,945 × 2
##    alphabetical_teams  number_of_matches
##    <chr>                           <int>
##  1 Argentina Uruguay                 179
##  2 Austria Hungary                   137
##  3 Belgium Netherlands               125
##  4 England Scotland                  117
##  5 Kenya Uganda                      109
##  6 Argentina Brazil                  107
##  7 Denmark Sweden                    107
##  8 Norway Sweden                     107
##  9 Scotland Wales                    106
## 10 Argentina Paraguay                105
## # … with 6,935 more rows

As we can see, the most played match is Argentina v Uruguay. Lets see a plot with the top 10 most repeated matches

ggplot(data = most_played_matches[1:10,]) + 
  geom_bar(mapping=aes(y = reorder(alphabetical_teams, number_of_matches), 
                       x = number_of_matches,
                       fill = alphabetical_teams),stat='identity') + 
  theme(legend.position = "none") +
  labs(title = "Most played international matches ever", x = "Times played", y = "Teams")

Now lets check what are the teams with most wins To do so, lets create a new column where we have the winner of each match

results = results %>%
  mutate(team=ifelse(
    home_score == away_score,
    "Draw",
    ifelse(
    home_score > away_score,
    home_team,
    away_team
  )))

Now lets see what are the top 10 teams with the most wins

limit_date = "2022-10-17"
most_winning_teams = results %>%
  filter(date<limit_date)

most_winning_teams = most_winning_teams %>%
  group_by(team) %>%
  filter(team != "Draw") %>%
  summarise(wins=n()) %>%
  arrange(-wins)

ggplot(data = most_winning_teams[1:10,]) + 
  geom_bar(mapping = aes(y = wins, 
                         x = reorder(team,-wins),
                         fill = team),
           stat='identity') +
  labs(title = "Most wins by team", y = "Wins", x = "Team") +
  theme(legend.position = "none")

Since every team has played a different number of matches Let’s see what are the teams with most played matches

To know how many matches each team played, I will add home_team and away_team columns in a data frame And then group by team names

matches_played = data.frame(team = c(results$home_team,results$away_team)) %>%
  group_by(team) %>%
  summarise(matches = n()) %>%
  arrange(-matches)

With next plot, we can see what teams played the most matches in history

ggplot(data = matches_played[1:10,]) + 
  geom_point(mapping = aes(y = matches, 
                         x = reorder(team,-matches),
                         fill = team),
           stat='identity') +
  labs(title = "Most played matches by team", y = "Matches", x = "Team") +
  theme(legend.position = "none")

Now let’s calculate the percentage of wins of each team To do that, I will merge the matches played data frame with most winning teams

wins_rate = merge(matches_played,most_winning_teams,by="team") %>%
  mutate(wins_rate = wins / matches * 100) %>%
  filter(matches > 100 | team == "Mongolia") %>%
  arrange(-wins_rate)

Now lets see the wins rate

ggplot(data = wins_rate[1:10,]) + 
  geom_bar(mapping = aes(y = wins_rate, 
                         x = reorder(team,-wins_rate),
                         fill = team),
           stat='identity') +
  labs(title = "Win rate", y = "Percentage", x = "Team") +
  theme(legend.position = "none")

Now lets plot both the matches played and the winning rate inside it For doing so, lets convert from wide to long data frame

wins_rate_long = 
  gather(wins_rate, type, value, matches:wins)

Now create a list of teams with best winning rates

top_10_teams = wins_rate_long %>%
  arrange(-wins_rate) %>%
  slice(1:20) %>%
  select(team) %>%
  unique()

Lets add the wins rate and matches played to that list

wins_rate_long_top_10 = merge(wins_rate_long, top_10_teams, by="team") %>%
  arrange(-wins_rate)

Lets substract wins from matches so the columns dont add up

wins_rate_long_top_10$value[wins_rate_long_top_10$type=="matches"] = 
  wins_rate_long_top_10$value[wins_rate_long_top_10$type=="matches"] - wins_rate_long_top_10$value[wins_rate_long_top_10$type=="wins"]

Now lets plot the data and see the best winning rate teams, with columns of wins and matches played stacked up together Along with labels of the winning rates

ggplot(data = wins_rate_long_top_10) + 
  geom_bar(position = "stack",
            mapping = aes(y = value, 
                         x = reorder(team,-wins_rate),
                         fill = type),
           stat='identity') +
  labs(title = "Matches played and won", subtitle="Ordered by best winning percentage", y = "Matches played and won", x = "Team") +
  geom_text(aes(x=team,y=50,
                label=ifelse(type=="wins",paste0(round(wins_rate,0), "%"),"")
                )
            )

We can compare all countries in a single map to know the best ones For doing that we need to merge a map data with the winning rates data frame

world_map <- map_data("world") 
colnames(world_map)[5] <- "team"  

world_map <- merge(world_map,wins_rate,by="team",all = TRUE) %>%
  arrange(order)

ggplot(world_map, aes(x = long, y = lat, group = group, fill=wins_rate)) +
  geom_polygon(colour = "white", show.legend = TRUE) +
  scale_fill_gradient(low="red", high="green")

As we can see, some countries info are missing, like USA or China. This is because the country names in our dataset and the country names in the map differ. We will fix that by replacing the names with the correct ones.

world_map <- map_data("world") 
colnames(world_map)[5] <- "team"  

world_map["team"][world_map["team"] == "China"] <- "China PR"
world_map["team"][world_map["team"] == "USA"] <- "United States"
world_map["team"][world_map["team"] == "Democratic Republic of the Congo"] <- "DR Congo"
world_map["team"][world_map["team"] == "Ireland"] <- "Republic of Ireland"

We also note that UK is considered a single country in the map, but its regions are included in the subregion column. So we will replace team name with subregion instead of country in this case.

world_map["team"][world_map["team"] == "UK"] <- world_map["subregion"][world_map["team"] == "UK"]
# world_map["team"][world_map["team"] == "Great Britain"] <- "England"

Then let’s merge the map with our data set

world_map <- merge(world_map,wins_rate,by="team",all = TRUE) %>%
  arrange(order)

Now we can plot it again and we will see the missing countries now fixed.

# world_map["wins_rate"][world_map["team"] == "Scotland"] = 0

ggplot(world_map, aes(x = long, y = lat, group = group, fill=wins_rate)) +
  geom_polygon(show.legend = TRUE) +
  scale_fill_gradient(low="red", high="green")