This is my final project for my DV2 course. In this project I use the tidy tuesday olympics medals data set (https://github.com/rfordatascience/tidytuesday/tree/master/data/2021/2021-07-27) from the readme these are the variables:
| variable | class | description |
|---|---|---|
| id | double | Athlete ID |
| name | character | Athlete Name |
| sex | character | Athlete Sex |
| age | double | Athlete Age |
| height | double | Athlete Height in cm |
| weight | double | Athlete weight in kg |
| team | character | Country/Team competing for |
| noc | character | noc region |
| games | character | Olympic games name |
| year | double | Year of olympics |
| season | character | Season either winter or summer |
| city | character | City of Olympic host |
| sport | character | Sport |
| event | character | Specific event |
| medal | character | Medal (Gold, Silver, Bronze or NA) |
The questions I want to answer:
Where were the Summer Olympics? Which were the countries who won at least 1 gold medal in 10 Olympics? Is the Age distribution of Olympic medalists constant over time? Is it true that if a country is more populous it wins more medals?
library(knitr)
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(tidyverse)
library(janitor)
library(data.table)
library(tidygeocoder)
library(gganimate)
library(scatterpie)
library(maps)
olympics <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-07-27/olympics.csv")
# cleaning suggested by the author
olympics <- olympics %>% janitor::clean_names()
olympics <- as.data.table(olympics)
# filtering for medals, and summer olympics
olympics <- olympics[season == "Summer"]
olympicsgold <- olympics[medal == "Gold"][, gold := 1]
olympicssilver <- olympics[medal == "Silver"][, silver := 1]
olympicsbronze <- olympics[medal == "Bronze"][, bronze := 1]
olympics <- rbind(olympicsgold, olympicssilver, olympicsbronze, fill = TRUE)
Where were the Summer Olympics?
As we can see most of the Olympics took place on Europe (11), then the North America (5), Asia(3), Australia (2) and South America (1). These numbers only show the Hosting cities if a city hosted it multiple times it still counts as one.
hosting_city <- data.table(unique(olympics[, c("city", "year")], by = "city"))
hosting_city <- tidygeocoder::geocode(hosting_city, "city")
world <- map_data("world")
ggplot() +
geom_map(
data = world, map = world,
aes(long, lat, map_id = region),
color = "darkorange", fill = "lightgray", size = 0.1
) +
geom_point(
data = hosting_city,
aes(long, lat, color = city),
size = 3,
alpha = 0.5
) +
theme_void() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(title = "Summer Olympics Location")
Which were the countries who won atleast 1 gold medal in 10 Olympics? The data needed to be filtered so in a multy person sport only one medal is counted. Here are the countries. Some interesting features. Hungary is in the list. Some countries collapsed for example Yugoslavia, but the USSR is not there as they did not participate in 10 Olympics.
olympics_country <- distinct(olympics, noc, year, medal, event, .keep_all = TRUE)
gold_table <- olympics_country[, list(sumgold = sum(gold, na.rm = TRUE)), by = list(year, noc)]
gold_table_count <- gold_table[, .N, by = noc]
gold_table_count <- merge(gold_table, gold_table_count)[N > 10]
ggplot(gold_table_count, aes(y = sumgold, x = noc)) +
geom_bar(position = "dodge", stat = "identity", fill = "darkorange", color = "darkorange") +
transition_states(year) +
labs(
title = paste("Year {closest_state}")
) +
ylab("Gold Medal Count") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
theme(panel.grid.major = element_line(colour = "darkblue"),
panel.grid.minor = element_line(colour = "darkblue", size = 0.25))
Is the Age distribution of Olympic medalists constant over time?
The boxplot is constant in the end it has breaks (World Wars), after these the values are usually higher. There are high values even greater than 60. I did not know how that could happen, but it turns out until 1948 there were art categories in the Olympics, and after 48 we can see the drop off the 65+ old people.
ggplot(olympics, aes(x = year, y = age, group = year)) +
geom_boxplot(color = "darkorange") +
theme_bw() +
ylab("Athlete Ages") +
xlab("Year") +
theme(panel.grid.major = element_line(colour = "darkblue"),
panel.grid.minor = element_line(colour = "darkblue", size = 0.25))
#olympics1 <- olympics
#head(arrange(olympics1[,event:=NULL],desc(age)), n = 10) %>% kable()
Is it true that if a country is more popolous it wins more medals?
I made a scatterpie with log population and log count of all medals and the pies show the ratio of the medals. It wasnt very useful as I could not label them.
olympics_country <- distinct(olympics, noc, year, medal, event, .keep_all = TRUE)
countr_codes <- as.data.table(readr::read_csv("https://raw.githubusercontent.com/kanyipi/CEU-DV2/2021-2022/finalproject/country-codes.csv"))
country_mapping <- countr_codes[, list(ISO3, ISO3166, official_name_en)][, ISO3166 := str_remove(ISO3166, "^0+")]
pop_data <- as.data.table(readr::read_csv("https://raw.githubusercontent.com/kanyipi/CEU-DV2/2021-2022/finalproject/pop-data.csv"))
pop_data <- pop_data[Time == 2019, list(LocID, PopTotal)][, LocID := as.character(LocID)]
pop_count_code <- merge(pop_data, country_mapping, by.x = "LocID", by.y = "ISO3166")
olympics_pop <- merge(olympics_country, pop_count_code, by.x = "noc", by.y = "ISO3")
dt <- unique(olympics_pop[, list(
sumall = log(sum(gold, na.rm = TRUE) + sum(bronze, na.rm = TRUE)
+ sum(silver, na.rm = TRUE)),
Gold = sum(gold, na.rm = TRUE), Silver = sum(silver, na.rm = TRUE),
Bronze = sum(bronze, na.rm = TRUE), PopTotal = log(PopTotal), official_name_en
), by = "noc"])
ggplot() +
geom_scatterpie(aes(x = PopTotal, y = sumall, group = noc),
data = dt, alpha = 0.8,
cols = c("Gold", "Silver", "Bronze")
) +
theme_bw() +
xlab("Log Total Population in Thousands") +
ylab("Log Count of All Medals") +
theme(panel.grid.major = element_line(colour = "darkblue"),
panel.grid.minor = element_line(colour = "darkblue", size = 0.25)) +
coord_equal()
So I plotted them on a map. I needed to remove the countries where the geocoding did not result in meaningful coordinates e.g. Yugoslavia.The pies are weighted with the count of all the medals the country won.
dt2 <- unique(olympics_pop[, list(
sumall = (sum(gold, na.rm = TRUE) + sum(bronze, na.rm = TRUE)
+ sum(silver, na.rm = TRUE)),
Gold = sum(gold, na.rm = TRUE), Silver = sum(silver, na.rm = TRUE),
Bronze = sum(bronze, na.rm = TRUE), PopTotal = (PopTotal), official_name_en
), by = "noc"])
dt2 <- as.data.table(tidygeocoder::geocode(dt, "official_name_en"))
dt2 <- dt2[!is.na(long)]
dt2 <- dt2[, sumall := log(Gold + Silver + Bronze) / 2]
world <- map_data("world")
p <- ggplot(world, aes(long, lat)) +
geom_map(map = world, aes(map_id = region), fill = NA, color = "darkorange") +
coord_quickmap() +
theme_bw() +
xlab("") +
ylab("")
p + geom_scatterpie(aes(x = long, y = lat, group = noc, r = sumall),
data = dt2, cols = c("Gold", "Silver", "Bronze"), color = NA, alpha = 0.8)