The data was acquired and initially cleaned from Git Hub
# Data Import
## Option 1: tidytuesdayR package
tuesdata <- tidytuesdayR::tt_load('2024-09-24')
## ---- Compiling #TidyTuesday Information for 2024-09-24 ----
## --- There are 3 files available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 3: "country_results_df.csv"
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## 2 of 3: "individual_results_df.csv"
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## 3 of 3: "timeline_df.csv"
## OR
tuesdata <- tidytuesdayR::tt_load(2024, week = 39)
## ---- Compiling #TidyTuesday Information for 2024-09-24 ----
## --- There are 3 files available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 3: "country_results_df.csv"
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## 2 of 3: "individual_results_df.csv"
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## 3 of 3: "timeline_df.csv"
country_results_df <- tuesdata$country_results_df
individual_results_df <- tuesdata$individual_results_df
timeline_df <- tuesdata$timeline_df
# Option 2: Read directly from GitHub
country_results_df <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-09-24/country_results_df.csv')
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 3780 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): country, leader, deputy_leader
## dbl (14): year, team_size_all, team_size_male, team_size_female, p1, p2, p3,...
## lgl (1): p7
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
individual_results_df <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-09-24/individual_results_df.csv')
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 21707 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): contestant, country, award
## dbl (9): year, p1, p2, p3, p4, p5, p6, total, individual_rank
## lgl (1): p7
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
timeline_df <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-09-24/timeline_df.csv')
## Rows: 65 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): country, city
## dbl (6): edition, year, countries, all_contestant, male_contestant, female_...
## date (2): start_date, end_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Data Cleaning
## Scraping IMO results data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(httr2)
timeline_df <- read_html("https://www.imo-official.org/organizers.aspx") %>%
html_table() %>%
.[[1]] %>%
clean_names() %>%
rename(
"all_contestant" = contestants,
"male_contestant" = contestants_2,
"female_contestant" = contestants_3,
"edition" = number
) %>%
filter(edition != "#") %>%
mutate(
start_date = paste0(gsub("(.*)(-)(.*)", "\\1", date),year),
end_date = paste0(gsub("(.*)(-)(.*)", "\\3", date),year),
across(
c(start_date, end_date),
~as.Date(.x, format = "%d.%m.%Y")
),
across(
c(edition, year, countries, all_contestant, male_contestant, female_contestant),
as.integer
)
) %>%
select(-date) %>%
# only keeping records till current year
filter(year < 2025)
# circulate through country results link and rbind tables
scrape_country <- function(year) {
paste0("https://www.imo-official.org/year_country_r.aspx?year=", year) %>%
read_html() %>%
html_table() %>%
.[[1]] %>%
clean_names() %>%
filter(country != "Country") %>%
mutate(year = year, .before = "country")
}
country_results_df <- map_df(timeline_df$year, scrape_country) %>%
select(
year,
country,
team_size_all = team_size,
team_size_male = team_size_2,
team_size_female = team_size_3,
starts_with("p"),
awards_gold = awards,
awards_silver = awards_2,
awards_bronze = awards_3,
awards_honorable_mentions = awards_4,
leader,
deputy_leader
) %>%
mutate(
across(
c(team_size_all:awards_honorable_mentions),
as.integer
)
)
## circulate through individual results link and rbind tables
scrape_individual <- function(year) {
# These can time out, so we'll use httr2 to retry.
paste0("https://www.imo-official.org/year_individual_r.aspx?year=", year) %>%
httr2::request() %>%
httr2::req_retry(max_tries = 3) %>%
httr2::req_perform() %>%
httr2::resp_body_html() %>%
html_table() %>%
.[[1]] %>%
clean_names() %>%
mutate(year = year, .before = "contestant")
}
individual_results_df <- map_df(timeline_df$year, scrape_individual) %>%
select(
year:p6, p7, total,
individual_rank = number_rank,
award
) %>%
mutate(
across(
c(year, p1:individual_rank),
as.integer
)
)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(ggplot2)
library(RColorBrewer)
library(dplyr)
country <- country_results_df
individual <- individual_results_df
timeline <- timeline_df
country <- data.table(country)
individual <- data.table(individual)
timeline <- data.table(timeline)
This section provides a showcase of the gold, silver and bronze medals won by different countries in 2024. The vast majority of countries won medals in the 2024 IMO. Some of these countries won mainly gold medals. The vast majority of countries won silver and bronze medals.
country[, awards_total := awards_gold + awards_silver + awards_bronze]
country_2024 <- country[year == 2024]
country_2024_long <- melt(country_2024, id.vars = c("year", "country"),
variable.name = "medal_type", value.name = "count",
measure.vars = c("awards_gold", "awards_silver", "awards_bronze", "awards_total"))
ggplot(country_2024_long, aes(x = country, y = count, fill = medal_type)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~medal_type, scales = "free_y", ncol = 1) +
labs(title = "2024 Awards by Country and Medal Type", x = "Country", y = "Number of Awards") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
This section stacks the genders of the contestants over the years. As a whole, the number of contestants has increased each year, and the number of female contestants has continued to grow. Whereas the number of male contestants has decreased in some years, the overall trend is increasing.
ggplot(timeline, aes(x = year)) +
geom_density(aes(y = male_contestant, fill = "Male"), stat="identity") +
geom_density(aes(y = female_contestant, fill = "Female"), stat="identity", alpha=0.7) +
labs(title = "Gender Ratio of Participants", x = "Year", y = "Entry") +
scale_fill_brewer(palette = "Pastel1") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
This section analyses the scores for each topic. Question 7 was not analysed as it was not available in the vast majority of years. Overall the first question in each section (Questions 1 & 4) had the highest average score. The scores decreased as the number of questions increased. The relative performance of the sections was similar. However, for the last question of each section (Question 3 & 6). Some teams will get better performance in these two questions.
# Since the vast majority of the Question 7 data were NA, they were not analysed.
scores_long <- melt(country, measure.vars = patterns("^p[1-6]$"), variable.name = "problem", value.name = "score")
ggplot(scores_long, aes(x = problem, y = score, fill = problem)) +
geom_boxplot() +
labs(title = "IMO Global Average Score Distribution per Question", x = "Question", y = "Score") +
theme_minimal() +
scale_fill_brewer(palette = "Pastel1")
## Warning: Removed 660 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
This section analyses China’s historical performance. China shows a trend towards the concentration of gold medals. That is, almost all team members can win gold medals. Therefore, for silver and bronze medals show a clear downward trend. In many years the whole team won gold medals.
china_data <- country[country == "People's Republic of China"]
china_data_long <- melt(china_data, id.vars = "year", measure.vars = c("awards_gold", "awards_silver", "awards_bronze"),
variable.name = "medal", value.name = "count")
ggplot(china_data_long, aes(x = year, y = count, color = medal)) +
geom_line(size=1) +
geom_point(size=2) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Trends in China's Medal Count", x = "Year", y = "Medal Count") +
scale_color_manual(values = c("awards_gold" = "gold", "awards_silver" = "lightgrey", "awards_bronze" = "darkorange")) +
theme_minimal() +
theme(legend.title = element_blank(), legend.position = "top")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
# Find the Max Awards
individual[, award_value := fifelse(award == "Honourable mention", 1,
fifelse(award == "Bronze medal", 2,
fifelse(award == "Silver medal", 3,
fifelse(award == "Gold medal", 4, NA))))]
individual_max_awards <- individual[, .(max_award = max(award_value, na.rm = TRUE)), by = contestant]
individual_max_awards[, max_award := fifelse(max_award == 1, "Honourable mention",
fifelse(max_award == 2, "Bronze medal",
fifelse(max_award == 3, "Silver medal",
fifelse(max_award == 4, "Gold medal", "No Award"))))]
# Match Leaders' Data
country_leader <- merge(country, individual_max_awards, by.x = "leader", by.y = "contestant", all.x = TRUE)
country_leader <- country_leader[!is.na(country_leader$max_award), ]
names(country_leader)[names(country_leader) == "max_award"] <- "leader_max_award"
# Match Deputy Leaders' Data
country_deputy_leader <- merge(country, individual_max_awards, by.x = "deputy_leader", by.y = "contestant", all.x = TRUE)
country_deputy_leader <- country_deputy_leader[!is.na(country_deputy_leader$max_award), ]
names(country_deputy_leader)[names(country_deputy_leader) == "max_award"] <- "deputy_leader_max_award"
# Leaders' Data Analysis
leader_long <- melt(country_leader, id.vars = c("leader_max_award"),
measure.vars = c("awards_gold", "awards_silver", "awards_bronze", "awards_honorable_mentions"),
variable.name = "Team_Award", value.name = "Count")
leader_long$Team_Award <- factor(leader_long$Team_Award,
levels = c("awards_gold", "awards_silver", "awards_bronze", "awards_honorable_mentions"),
labels = c("Gold", "Silver", "Bronze", "Honorable Mentions"))
leader_long$leader_max_award <- factor(leader_long$leader_max_award,
levels = c("Honourable mention", "Bronze medal", "Silver medal", "Gold medal"),
labels = c("Honourable Mention", "Bronze", "Silver", "Gold"))
# Leaders' Data Visualisation
ggplot(leader_long, aes(x = leader_max_award, y = Count, fill = Team_Award)) +
geom_bar(stat = "identity", position = "stack") +
labs(title = "Team Awards Distribution by Leader's Max Award Type",
x = "Leader's Maximum Award",
y = "Count of Team Awards") +
scale_fill_manual(values = c("Gold" = "gold",
"Silver" = "lightgrey",
"Bronze" = "darkorange",
"Honorable Mentions" = "purple"), name = "Team Award Type") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Deputy Leaders' Data Analysis
deputy_leader_long <- melt(country_deputy_leader, id.vars = c("deputy_leader_max_award"),
measure.vars = c("awards_gold", "awards_silver", "awards_bronze", "awards_honorable_mentions"),
variable.name = "Team_Award", value.name = "Count")
deputy_leader_long$Team_Award <- factor(deputy_leader_long$Team_Award,
levels = c("awards_gold", "awards_silver", "awards_bronze", "awards_honorable_mentions"),
labels = c("Gold", "Silver", "Bronze", "Honorable Mentions"))
deputy_leader_long$deputy_leader_max_award <- factor(deputy_leader_long$deputy_leader_max_award,
levels = c("Honourable mention", "Bronze medal", "Silver medal", "Gold medal"),
labels = c("Honourable Mention", "Bronze", "Silver", "Gold"))
# Deputy Leaders' Data Visualisation
ggplot(deputy_leader_long, aes(x = deputy_leader_max_award, y = Count, fill = Team_Award)) +
geom_bar(stat = "identity", position = "stack") +
labs(title = "Team Awards Distribution by Deputy Leader's Max Award Type",
x = "Deputy Leader's Maximum Award",
y = "Count of Team Awards") +
scale_fill_manual(values = c("Gold" = "gold",
"Silver" = "lightgrey",
"Bronze" = "darkorange",
"Honorable Mentions" = "purple"), name = "Team Award Type") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_bar()`).