Introduction

This is my final project for my DV2 course. In this project I use the tidy tuesday olympics medals data set (https://github.com/rfordatascience/tidytuesday/tree/master/data/2021/2021-07-27) from the readme these are the variables:

variable class description
id double Athlete ID
name character Athlete Name
sex character Athlete Sex
age double Athlete Age
height double Athlete Height in cm
weight double Athlete weight in kg
team character Country/Team competing for
noc character noc region
games character Olympic games name
year double Year of olympics
season character Season either winter or summer
city character City of Olympic host
sport character Sport
event character Specific event
medal character Medal (Gold, Silver, Bronze or NA)

The questions I want to answer:

Where were the Summer Olympics? Which were the countries who won at least 1 gold medal in 10 Olympics? Is the Age distribution of Olympic medalists constant over time? Is it true that if a country is more populous it wins more medals?

library(knitr)
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(tidyverse)
library(janitor)
library(data.table)
library(tidygeocoder)
library(gganimate)
library(scatterpie)
library(maps)


olympics <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-07-27/olympics.csv")

# cleaning suggested by the author
olympics <- olympics %>% janitor::clean_names()

olympics <- as.data.table(olympics)
# filtering for medals, and summer olympics
olympics <- olympics[season == "Summer"]
olympicsgold <- olympics[medal == "Gold"][, gold := 1]
olympicssilver <- olympics[medal == "Silver"][, silver := 1]
olympicsbronze <- olympics[medal == "Bronze"][, bronze := 1]

olympics <- rbind(olympicsgold, olympicssilver, olympicsbronze, fill = TRUE)

Where were the Summer Olympics?

As we can see most of the Olympics took place on Europe (11), then the North America (5), Asia(3), Australia (2) and South America (1). These numbers only show the Hosting cities if a city hosted it multiple times it still counts as one.

hosting_city <- data.table(unique(olympics[, c("city", "year")], by = "city"))
hosting_city <- tidygeocoder::geocode(hosting_city, "city")
world <- map_data("world")
ggplot() +
  geom_map(
    data = world, map = world,
    aes(long, lat, map_id = region),
    color = "darkorange", fill = "lightgray", size = 0.1
  ) +
  geom_point(
    data = hosting_city,
    aes(long, lat, color = city),
    size = 3,
    alpha = 0.5
  ) +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = "Summer Olympics Location")

Which were the countries who won atleast 1 gold medal in 10 Olympics? The data needed to be filtered so in a multy person sport only one medal is counted. Here are the countries. Some interesting features. Hungary is in the list. Some countries collapsed for example Yugoslavia, but the USSR is not there as they did not participate in 10 Olympics.

olympics_country <- distinct(olympics, noc, year, medal, event, .keep_all = TRUE)
gold_table <- olympics_country[, list(sumgold = sum(gold, na.rm = TRUE)), by = list(year, noc)]
gold_table_count <- gold_table[, .N, by = noc]
gold_table_count <- merge(gold_table, gold_table_count)[N > 10]
ggplot(gold_table_count, aes(y = sumgold, x = noc)) +
  geom_bar(position = "dodge", stat = "identity", fill = "darkorange", color = "darkorange") +
  transition_states(year) +
  labs(
    title = paste("Year {closest_state}")
  ) + 
  ylab("Gold Medal Count") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  theme(panel.grid.major = element_line(colour = "darkblue"),
  panel.grid.minor = element_line(colour = "darkblue", size = 0.25)) 

Is the Age distribution of Olympic medalists constant over time?

The boxplot is constant in the end it has breaks (World Wars), after these the values are usually higher. There are high values even greater than 60. I did not know how that could happen, but it turns out until 1948 there were art categories in the Olympics, and after 48 we can see the drop off the 65+ old people.

ggplot(olympics, aes(x = year, y = age, group = year)) +
  geom_boxplot(color = "darkorange") +
  theme_bw()  +
  ylab("Athlete Ages") +
  xlab("Year") +
  theme(panel.grid.major = element_line(colour = "darkblue"),
  panel.grid.minor = element_line(colour = "darkblue", size = 0.25))

#olympics1 <- olympics
#head(arrange(olympics1[,event:=NULL],desc(age)), n = 10) %>% kable()

Is it true that if a country is more popolous it wins more medals?

I made a scatterpie with log population and log count of all medals and the pies show the ratio of the medals. It wasnt very useful as I could not label them.

olympics_country <- distinct(olympics, noc, year, medal, event, .keep_all = TRUE)

countr_codes <- as.data.table(readr::read_csv("https://raw.githubusercontent.com/kanyipi/CEU-DV2/2021-2022/finalproject/country-codes.csv"))

country_mapping <- countr_codes[, list(ISO3, ISO3166, official_name_en)][, ISO3166 := str_remove(ISO3166, "^0+")]

pop_data <- as.data.table(readr::read_csv("https://raw.githubusercontent.com/kanyipi/CEU-DV2/2021-2022/finalproject/pop-data.csv"))
pop_data <- pop_data[Time == 2019, list(LocID, PopTotal)][, LocID := as.character(LocID)]

pop_count_code <- merge(pop_data, country_mapping, by.x = "LocID", by.y = "ISO3166")

olympics_pop <- merge(olympics_country, pop_count_code, by.x = "noc", by.y = "ISO3")
dt <- unique(olympics_pop[, list(
  sumall = log(sum(gold, na.rm = TRUE) + sum(bronze, na.rm = TRUE)
    + sum(silver, na.rm = TRUE)),
  Gold = sum(gold, na.rm = TRUE), Silver = sum(silver, na.rm = TRUE),
  Bronze = sum(bronze, na.rm = TRUE), PopTotal = log(PopTotal), official_name_en
), by = "noc"])

ggplot() +
  geom_scatterpie(aes(x = PopTotal, y = sumall, group = noc),
    data = dt, alpha = 0.8,
    cols = c("Gold", "Silver", "Bronze")
  ) +
  theme_bw() +
  xlab("Log Total Population in Thousands") + 
  ylab("Log Count of All Medals") +
  theme(panel.grid.major = element_line(colour = "darkblue"),
  panel.grid.minor = element_line(colour = "darkblue", size = 0.25)) +
  coord_equal()

So I plotted them on a map. I needed to remove the countries where the geocoding did not result in meaningful coordinates e.g. Yugoslavia.The pies are weighted with the count of all the medals the country won.

dt2 <- unique(olympics_pop[, list(
  sumall = (sum(gold, na.rm = TRUE) + sum(bronze, na.rm = TRUE)
    + sum(silver, na.rm = TRUE)),
  Gold = sum(gold, na.rm = TRUE), Silver = sum(silver, na.rm = TRUE),
  Bronze = sum(bronze, na.rm = TRUE), PopTotal = (PopTotal), official_name_en
), by = "noc"])

dt2 <- as.data.table(tidygeocoder::geocode(dt, "official_name_en"))
dt2 <- dt2[!is.na(long)]
dt2 <- dt2[, sumall := log(Gold + Silver + Bronze) / 2]

world <- map_data("world")
p <- ggplot(world, aes(long, lat)) +
  geom_map(map = world, aes(map_id = region), fill = NA, color = "darkorange") +
  coord_quickmap() +
  theme_bw() +
  xlab("") + 
  ylab("")
p + geom_scatterpie(aes(x = long, y = lat, group = noc, r = sumall),
  data = dt2, cols = c("Gold", "Silver", "Bronze"), color = NA, alpha = 0.8)