library(knitr)
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(tidyverse)
library(janitor)
library(data.table)
library(tidygeocoder)
library(gganimate)
library(scatterpie)
library(maps)

Do a bit more EDA at the beginning, do some basic histograms. Have a conclusion

1 Introduction

This is my final project for my DV2 course. In this project I use the tidy tuesday olympics medals data set (https://github.com/rfordatascience/tidytuesday/tree/master/data/2021/2021-07-27) from the readme these are the variables:

variables <- readr::read_csv("https://raw.githubusercontent.com/kanyipi/CEU-DV2/2021-2022/finalproject/variables.csv")
variables %>% kable()
variable class description
id double Athlete ID
name character Athlete Name
sex character Athlete Sex
age double Athlete Age
height double Athlete Height in cm
weight double Athlete weight in kg
team character Country/Team competing for
noc character noc region
games character Olympic games name
year double Year of olympics
season character Season either winter or summer
city character City of Olympic host
sport character Sport
event character Specific event
medal character Medal (Gold
olympics <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-07-27/olympics.csv")

# cleaning suggested by the author
olympics <- olympics %>% janitor::clean_names()

olympics <- as.data.table(olympics)
#Custom theme
theme_custom <- function(){ 
    font <- "Georgia"   #assign font family up front
    
    theme_bw() %+replace%    #replace elements we want to change
    
    theme(
      panel.grid.major = element_line(colour = "darkblue"),
      panel.grid.minor = element_line(colour = "darkblue", size = 0.25)
    )
}
# filtering for medals, and summer olympics
olympics <- olympics[season == "Summer"]
olympicsgold <- olympics[medal == "Gold"][, gold := 1]
olympicssilver <- olympics[medal == "Silver"][, silver := 1]
olympicsbronze <- olympics[medal == "Bronze"][, bronze := 1]

olympics <- rbind(olympicsgold, olympicssilver, olympicsbronze, fill = TRUE)

2 EDA

ggplot(olympics, aes(sex)) +
  geom_bar(fill = "darkorange") + 
  theme_custom()

ggplot(olympics, aes(age)) +
  geom_histogram(fill = "darkorange") + 
  theme_custom()

ggplot(olympics, aes(weight)) +
  geom_histogram(fill = "darkorange") + 
  theme_custom()

ggplot(olympics, aes(height)) +
  geom_histogram(fill = "darkorange") + 
  theme_custom()

ggplot(olympics, aes(sport)) +
  geom_bar(fill = "darkorange") + 
  theme_custom() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

ggplot(olympics, aes(year)) +
  geom_bar() + 
  theme_custom() +
  scale_x_continuous(breaks=seq(1896,2020,4)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

3 Questions

The questions I want to answer:

Where were the Summer Olympics? Which were the countries who won at least 1 gold medal in 10 Olympics? Is the Age distribution of Olympic medalists constant over time? Is it true that if a country is more populous it wins more medals?

4 Where were the Summer Olympics?

As we can see most of the Olympics took place on Europe (11), then the North America (5), Asia(3), Australia (2) and South America (1). These numbers only show the Hosting cities if a city hosted it multiple times it still counts as one.

hosting_city <- data.table(unique(olympics[, c("city", "year")], by = "city"))
hosting_city <- tidygeocoder::geocode(hosting_city, "city")
world <- map_data("world")
ggplot() +
  geom_map(
    data = world, map = world,
    aes(long, lat, map_id = region),
    fill = "lightgray", size = 0.1
  ) +
  geom_point(
    data = hosting_city,
    aes(long, lat, color = city),
    size = 3,
    alpha = 0.5
  ) +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = "Summer Olympics Location")

5 Which were the countries who won atleast 1 gold medal in 10 Olympics?

The data needed to be filtered so in a multy person sport only one medal is counted. Here are the countries. Some interesting features. Hungary is in the list. Some countries collapsed for example Yugoslavia, but the USSR is not there as they did not participate in 10 Olympics.

olympics_country <- distinct(olympics, noc, year, medal, event, .keep_all = TRUE)
gold_table <- olympics_country[, list(sumgold = sum(gold, na.rm = TRUE)), by = list(year, noc)]
gold_table_count <- gold_table[, .N, by = noc]
gold_table_count <- merge(gold_table, gold_table_count)[N > 10]
ggplot(gold_table_count, aes(y = sumgold, x = noc)) +
  geom_bar(position = "dodge", stat = "identity", fill = "darkorange", color = "darkorange") +
  transition_states(year) +
  labs(
    title = paste("Year {closest_state}")
  ) + 
  xlab("National Olympic Committee") +
  ylab("Gold Medal Count") +
  theme_custom() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Is the Age distribution of Olympic medalists constant over time?

The boxplot is constant in the end it has breaks (World Wars), after these the values are usually higher. There are high values even greater than 60. I did not know how that could happen, but it turns out until 1948 there were art categories in the Olympics, and after 48 we can see the drop off the 65+ old people.

ggplot(olympics, aes(x = year, y = age, group = year)) +
  geom_boxplot(color = "darkorange") +
  theme_custom()  +
  ylab("Athlete Ages") +
  xlab("Year") +
  theme(panel.grid.major = element_line(colour = "darkblue"),
  panel.grid.minor = element_line(colour = "darkblue", size = 0.25))

olympics1 <- copy(olympics)
head(arrange(olympics1[,event:=NULL],desc(age)), n = 10) %>% kable()
id name sex age height weight team noc games year season city sport medal gold silver bronze
22984 John (Herbert Crawford-) Copley (Williamson-) M 73 NA NA Great Britain GBR 1948 Summer 1948 Summer London Art Competitions Silver NA 1 NA
117046 Oscar Gomer Swahn M 72 NA NA Sweden SWE 1920 Summer 1920 Summer Antwerpen Shooting Silver NA 1 NA
30731 Jozu Dupon M 72 NA NA Belgium BEL 1936 Summer 1936 Summer Berlin Art Competitions Bronze NA NA 1
75648 Charles William Martin M 71 NA NA Crabe II-1 FRA 1900 Summer 1900 Summer Paris Sailing Silver NA 1 NA
75648 Charles William Martin M 71 NA NA Crabe II-4 FRA 1900 Summer 1900 Summer Paris Sailing Bronze NA NA 1
119650 Oskar Thiede M 69 NA NA Austria AUT 1948 Summer 1948 Summer London Art Competitions Silver NA 1 NA
45286 Letitia Marion Hamilton F 69 NA NA Ireland IRL 1948 Summer 1948 Summer London Art Competitions Bronze NA NA 1
30932 Samuel Harding Duvall M 68 NA NA Cincinnati Archers USA 1904 Summer 1904 Summer St. Louis Archery Silver NA 1 NA
73120 Frederick William MacMonnies M 68 NA NA United States USA 1932 Summer 1932 Summer Los Angeles Art Competitions Silver NA 1 NA
87135 Louis Noverraz M 66 179 78 Switzerland SUI 1968 Summer 1968 Summer Mexico City Sailing Silver NA 1 NA

6 Is it true that if a country is more popolous it wins more medals?

I made a scatterpie with log population and log count of all medals and the pies show the ratio of the medals. It wasnt very useful as I could not label them.

olympics_country <- distinct(olympics, noc, year, medal, event, .keep_all = TRUE)

countr_codes <- as.data.table(readr::read_csv("https://raw.githubusercontent.com/kanyipi/CEU-DV2/2021-2022/finalproject/country-codes.csv"))

country_mapping <- countr_codes[, list(ISO3, ISO3166, official_name_en)][, ISO3166 := str_remove(ISO3166, "^0+")]

pop_data <- as.data.table(readr::read_csv("https://raw.githubusercontent.com/kanyipi/CEU-DV2/2021-2022/finalproject/pop-data.csv"))
pop_data <- pop_data[Time == 2019, list(LocID, PopTotal)][, LocID := as.character(LocID)]

pop_count_code <- merge(pop_data, country_mapping, by.x = "LocID", by.y = "ISO3166")

olympics_pop <- merge(olympics_country, pop_count_code, by.x = "noc", by.y = "ISO3")
dt <- unique(olympics_pop[, list(
  sumall = log(sum(gold, na.rm = TRUE) + sum(bronze, na.rm = TRUE)
    + sum(silver, na.rm = TRUE)),
  Gold = sum(gold, na.rm = TRUE), Silver = sum(silver, na.rm = TRUE),
  Bronze = sum(bronze, na.rm = TRUE), PopTotal = log(PopTotal), official_name_en
), by = "noc"])

ggplot() +
  geom_scatterpie(aes(x = PopTotal, y = sumall, group = noc),
    data = dt, alpha = 0.8,
    cols = c("Gold", "Silver", "Bronze")
  ) +
  theme_custom() +
  xlab("Log Total Population in Thousands") + 
  ylab("Log Count of All Medals") +
  coord_equal()

So I plotted them on a map. I needed to remove the countries where the geocoding did not result in meaningful coordinates e.g. Yugoslavia.The pies are weighted with the count of all the medals the country won.

dt2 <- unique(olympics_pop[, list(
  sumall = (sum(gold, na.rm = TRUE) + sum(bronze, na.rm = TRUE)
    + sum(silver, na.rm = TRUE)),
  Gold = sum(gold, na.rm = TRUE), Silver = sum(silver, na.rm = TRUE),
  Bronze = sum(bronze, na.rm = TRUE), PopTotal = (PopTotal), official_name_en
), by = "noc"])

dt2 <- as.data.table(tidygeocoder::geocode(dt, "official_name_en"))
dt2 <- dt2[!is.na(long)]
dt2 <- dt2[, sumall := log(Gold + Silver + Bronze) / 2]

world <- map_data("world")
p <- ggplot(world, aes(long, lat)) +
  geom_map(map = world, aes(map_id = region), fill = NA, color = "darkorange") +
  coord_quickmap() +
  theme_custom() +
  xlab("Longitude") + 
  ylab("Latitude")
p + geom_scatterpie(aes(x = long, y = lat, group = noc, r = sumall),
  data = dt2, cols = c("Gold", "Silver", "Bronze"), color = NA, alpha = 0.8)

7 Conclusion

In conclusion:

Finally this would be fun to see for the Winter Olympics.