library(knitr)
knitr::opts_chunk$set(message = FALSE, warning = FALSE)library(tidyverse)
library(janitor)
library(data.table)
library(tidygeocoder)
library(gganimate)
library(scatterpie)
library(maps)Do a bit more EDA at the beginning, do some basic histograms. Have a conclusion
This is my final project for my DV2 course. In this project I use the tidy tuesday olympics medals data set (https://github.com/rfordatascience/tidytuesday/tree/master/data/2021/2021-07-27) from the readme these are the variables:
variables <- readr::read_csv("https://raw.githubusercontent.com/kanyipi/CEU-DV2/2021-2022/finalproject/variables.csv")
variables %>% kable()| variable | class | description |
|---|---|---|
| id | double | Athlete ID |
| name | character | Athlete Name |
| sex | character | Athlete Sex |
| age | double | Athlete Age |
| height | double | Athlete Height in cm |
| weight | double | Athlete weight in kg |
| team | character | Country/Team competing for |
| noc | character | noc region |
| games | character | Olympic games name |
| year | double | Year of olympics |
| season | character | Season either winter or summer |
| city | character | City of Olympic host |
| sport | character | Sport |
| event | character | Specific event |
| medal | character | Medal (Gold |
olympics <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-07-27/olympics.csv")
# cleaning suggested by the author
olympics <- olympics %>% janitor::clean_names()
olympics <- as.data.table(olympics)#Custom theme
theme_custom <- function(){
font <- "Georgia" #assign font family up front
theme_bw() %+replace% #replace elements we want to change
theme(
panel.grid.major = element_line(colour = "darkblue"),
panel.grid.minor = element_line(colour = "darkblue", size = 0.25)
)
}# filtering for medals, and summer olympics
olympics <- olympics[season == "Summer"]
olympicsgold <- olympics[medal == "Gold"][, gold := 1]
olympicssilver <- olympics[medal == "Silver"][, silver := 1]
olympicsbronze <- olympics[medal == "Bronze"][, bronze := 1]
olympics <- rbind(olympicsgold, olympicssilver, olympicsbronze, fill = TRUE)ggplot(olympics, aes(sex)) +
geom_bar(fill = "darkorange") +
theme_custom()ggplot(olympics, aes(age)) +
geom_histogram(fill = "darkorange") +
theme_custom()ggplot(olympics, aes(weight)) +
geom_histogram(fill = "darkorange") +
theme_custom()ggplot(olympics, aes(height)) +
geom_histogram(fill = "darkorange") +
theme_custom()ggplot(olympics, aes(sport)) +
geom_bar(fill = "darkorange") +
theme_custom() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))ggplot(olympics, aes(year)) +
geom_bar() +
theme_custom() +
scale_x_continuous(breaks=seq(1896,2020,4)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))The questions I want to answer:
Where were the Summer Olympics? Which were the countries who won at least 1 gold medal in 10 Olympics? Is the Age distribution of Olympic medalists constant over time? Is it true that if a country is more populous it wins more medals?
As we can see most of the Olympics took place on Europe (11), then the North America (5), Asia(3), Australia (2) and South America (1). These numbers only show the Hosting cities if a city hosted it multiple times it still counts as one.
hosting_city <- data.table(unique(olympics[, c("city", "year")], by = "city"))
hosting_city <- tidygeocoder::geocode(hosting_city, "city")
world <- map_data("world")
ggplot() +
geom_map(
data = world, map = world,
aes(long, lat, map_id = region),
fill = "lightgray", size = 0.1
) +
geom_point(
data = hosting_city,
aes(long, lat, color = city),
size = 3,
alpha = 0.5
) +
theme_void() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(title = "Summer Olympics Location")The data needed to be filtered so in a multy person sport only one medal is counted. Here are the countries. Some interesting features. Hungary is in the list. Some countries collapsed for example Yugoslavia, but the USSR is not there as they did not participate in 10 Olympics.
olympics_country <- distinct(olympics, noc, year, medal, event, .keep_all = TRUE)
gold_table <- olympics_country[, list(sumgold = sum(gold, na.rm = TRUE)), by = list(year, noc)]
gold_table_count <- gold_table[, .N, by = noc]
gold_table_count <- merge(gold_table, gold_table_count)[N > 10]
ggplot(gold_table_count, aes(y = sumgold, x = noc)) +
geom_bar(position = "dodge", stat = "identity", fill = "darkorange", color = "darkorange") +
transition_states(year) +
labs(
title = paste("Year {closest_state}")
) +
xlab("National Olympic Committee") +
ylab("Gold Medal Count") +
theme_custom() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) # Is the Age distribution of Olympic medalists constant over time?
The boxplot is constant in the end it has breaks (World Wars), after these the values are usually higher. There are high values even greater than 60. I did not know how that could happen, but it turns out until 1948 there were art categories in the Olympics, and after 48 we can see the drop off the 65+ old people.
ggplot(olympics, aes(x = year, y = age, group = year)) +
geom_boxplot(color = "darkorange") +
theme_custom() +
ylab("Athlete Ages") +
xlab("Year") +
theme(panel.grid.major = element_line(colour = "darkblue"),
panel.grid.minor = element_line(colour = "darkblue", size = 0.25))olympics1 <- copy(olympics)
head(arrange(olympics1[,event:=NULL],desc(age)), n = 10) %>% kable()| id | name | sex | age | height | weight | team | noc | games | year | season | city | sport | medal | gold | silver | bronze |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 22984 | John (Herbert Crawford-) Copley (Williamson-) | M | 73 | NA | NA | Great Britain | GBR | 1948 Summer | 1948 | Summer | London | Art Competitions | Silver | NA | 1 | NA |
| 117046 | Oscar Gomer Swahn | M | 72 | NA | NA | Sweden | SWE | 1920 Summer | 1920 | Summer | Antwerpen | Shooting | Silver | NA | 1 | NA |
| 30731 | Jozu Dupon | M | 72 | NA | NA | Belgium | BEL | 1936 Summer | 1936 | Summer | Berlin | Art Competitions | Bronze | NA | NA | 1 |
| 75648 | Charles William Martin | M | 71 | NA | NA | Crabe II-1 | FRA | 1900 Summer | 1900 | Summer | Paris | Sailing | Silver | NA | 1 | NA |
| 75648 | Charles William Martin | M | 71 | NA | NA | Crabe II-4 | FRA | 1900 Summer | 1900 | Summer | Paris | Sailing | Bronze | NA | NA | 1 |
| 119650 | Oskar Thiede | M | 69 | NA | NA | Austria | AUT | 1948 Summer | 1948 | Summer | London | Art Competitions | Silver | NA | 1 | NA |
| 45286 | Letitia Marion Hamilton | F | 69 | NA | NA | Ireland | IRL | 1948 Summer | 1948 | Summer | London | Art Competitions | Bronze | NA | NA | 1 |
| 30932 | Samuel Harding Duvall | M | 68 | NA | NA | Cincinnati Archers | USA | 1904 Summer | 1904 | Summer | St. Louis | Archery | Silver | NA | 1 | NA |
| 73120 | Frederick William MacMonnies | M | 68 | NA | NA | United States | USA | 1932 Summer | 1932 | Summer | Los Angeles | Art Competitions | Silver | NA | 1 | NA |
| 87135 | Louis Noverraz | M | 66 | 179 | 78 | Switzerland | SUI | 1968 Summer | 1968 | Summer | Mexico City | Sailing | Silver | NA | 1 | NA |
I made a scatterpie with log population and log count of all medals and the pies show the ratio of the medals. It wasnt very useful as I could not label them.
olympics_country <- distinct(olympics, noc, year, medal, event, .keep_all = TRUE)
countr_codes <- as.data.table(readr::read_csv("https://raw.githubusercontent.com/kanyipi/CEU-DV2/2021-2022/finalproject/country-codes.csv"))
country_mapping <- countr_codes[, list(ISO3, ISO3166, official_name_en)][, ISO3166 := str_remove(ISO3166, "^0+")]
pop_data <- as.data.table(readr::read_csv("https://raw.githubusercontent.com/kanyipi/CEU-DV2/2021-2022/finalproject/pop-data.csv"))
pop_data <- pop_data[Time == 2019, list(LocID, PopTotal)][, LocID := as.character(LocID)]
pop_count_code <- merge(pop_data, country_mapping, by.x = "LocID", by.y = "ISO3166")
olympics_pop <- merge(olympics_country, pop_count_code, by.x = "noc", by.y = "ISO3")
dt <- unique(olympics_pop[, list(
sumall = log(sum(gold, na.rm = TRUE) + sum(bronze, na.rm = TRUE)
+ sum(silver, na.rm = TRUE)),
Gold = sum(gold, na.rm = TRUE), Silver = sum(silver, na.rm = TRUE),
Bronze = sum(bronze, na.rm = TRUE), PopTotal = log(PopTotal), official_name_en
), by = "noc"])
ggplot() +
geom_scatterpie(aes(x = PopTotal, y = sumall, group = noc),
data = dt, alpha = 0.8,
cols = c("Gold", "Silver", "Bronze")
) +
theme_custom() +
xlab("Log Total Population in Thousands") +
ylab("Log Count of All Medals") +
coord_equal()So I plotted them on a map. I needed to remove the countries where the geocoding did not result in meaningful coordinates e.g. Yugoslavia.The pies are weighted with the count of all the medals the country won.
dt2 <- unique(olympics_pop[, list(
sumall = (sum(gold, na.rm = TRUE) + sum(bronze, na.rm = TRUE)
+ sum(silver, na.rm = TRUE)),
Gold = sum(gold, na.rm = TRUE), Silver = sum(silver, na.rm = TRUE),
Bronze = sum(bronze, na.rm = TRUE), PopTotal = (PopTotal), official_name_en
), by = "noc"])
dt2 <- as.data.table(tidygeocoder::geocode(dt, "official_name_en"))
dt2 <- dt2[!is.na(long)]
dt2 <- dt2[, sumall := log(Gold + Silver + Bronze) / 2]
world <- map_data("world")
p <- ggplot(world, aes(long, lat)) +
geom_map(map = world, aes(map_id = region), fill = NA, color = "darkorange") +
coord_quickmap() +
theme_custom() +
xlab("Longitude") +
ylab("Latitude")
p + geom_scatterpie(aes(x = long, y = lat, group = noc, r = sumall),
data = dt2, cols = c("Gold", "Silver", "Bronze"), color = NA, alpha = 0.8)In conclusion:
Finally this would be fun to see for the Winter Olympics.