## Load this libraries in order to run the codes. If needed install.
library(data.table)
library(ggplot2)
library(knitr)
library(kableExtra)
library(dplyr)
library(tidyr)
library(RColorBrewer)
## the following codes load the first data set into R
country_results_df <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-09-24/country_results_df.csv', show_col_types = FALSE)
## see the structure of the data
str(country_results_df)
## spc_tbl_ [3,780 × 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ year : num [1:3780] 2024 2024 2024 2024 2024 ...
## $ country : chr [1:3780] "United States of America" "People's Republic of China" "Republic of Korea" "India" ...
## $ team_size_all : num [1:3780] 6 6 6 6 6 6 6 6 6 6 ...
## $ team_size_male : num [1:3780] 5 6 6 6 6 6 6 6 6 5 ...
## $ team_size_female : num [1:3780] 1 0 0 0 0 0 0 0 0 1 ...
## $ p1 : num [1:3780] 42 42 42 42 42 42 42 42 42 38 ...
## $ p2 : num [1:3780] 41 42 37 34 30 37 33 37 25 37 ...
## $ p3 : num [1:3780] 19 31 18 11 10 7 8 16 5 5 ...
## $ p4 : num [1:3780] 40 40 42 42 42 42 42 36 42 42 ...
## $ p5 : num [1:3780] 35 22 7 28 36 29 31 23 35 12 ...
## $ p6 : num [1:3780] 15 13 22 10 5 5 6 1 2 17 ...
## $ p7 : logi [1:3780] NA NA NA NA NA NA ...
## $ awards_gold : num [1:3780] 5 5 2 4 4 1 2 2 1 2 ...
## $ awards_silver : num [1:3780] 1 1 4 1 0 5 3 3 4 2 ...
## $ awards_bronze : num [1:3780] 0 0 0 0 2 0 1 1 1 2 ...
## $ awards_honorable_mentions: num [1:3780] 0 0 0 1 0 0 0 0 0 0 ...
## $ leader : chr [1:3780] "John Berman" "Liang Xiao" "Suyoung Choi" "Krishnan Sivasubramanian" ...
## $ deputy_leader : chr [1:3780] "Carl Schildkraut" "Yijun Yao" "Hwajong Yoo" "Rijul Saini" ...
## - attr(*, "spec")=
## .. cols(
## .. year = col_double(),
## .. country = col_character(),
## .. team_size_all = col_double(),
## .. team_size_male = col_double(),
## .. team_size_female = col_double(),
## .. p1 = col_double(),
## .. p2 = col_double(),
## .. p3 = col_double(),
## .. p4 = col_double(),
## .. p5 = col_double(),
## .. p6 = col_double(),
## .. p7 = col_logical(),
## .. awards_gold = col_double(),
## .. awards_silver = col_double(),
## .. awards_bronze = col_double(),
## .. awards_honorable_mentions = col_double(),
## .. leader = col_character(),
## .. deputy_leader = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Comment: The country_results_df dataset provides a detailed view of the International Mathematical Olympiad (IMO), highlighting team compositions, individual scores, and awards across countries and years.
## Convert to data.table
dt <- as.data.table(country_results_df)
## Total gold awards by year
plot1 <- ggplot(awards_by_year, aes(x = year, y = total_gold)) +
geom_line(color = "#2c7fb8", linewidth = 1) +
labs(title = "Total Gold Awards by Year", x = "Year", y = "Gold Awards") +
theme_minimal()
plot1
The plot of total gold awards by year shows the trend of gold medals awarded at the International Mathematical Olympiad over time
## Now, let's see Top 5 countries in gold award section
## Summarizing total gold awards by country
top_countries_gold <- dt[, .(total_gold = sum(awards_gold, na.rm = TRUE)), by = country]
## Selecting the top 5 countries with the most gold awards
top_5_countries_gold <- top_countries_gold[order(-total_gold)][1:5]
## Filtering the dataset for these top 5 countries
dt_top_5_countries <- dt[country %in% top_5_countries_gold$country]
## Plot: Gold Awards by Top 5 Countries
plot_gold <- ggplot(top_5_countries_gold, aes(x = reorder(country, -total_gold), y = total_gold, fill = country)) +
geom_col(show.legend = FALSE) +
labs(
title = "Top 5 Countries by Total Gold Awards",
x = "Country",
y = "Total Gold Awards"
) +
theme_minimal() +
scale_fill_brewer(palette = "Set3") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
plot_gold
The plot of the Top 5 Countries by Total Gold Awards visualizes the distribution of gold medals among the top-performing countries. And we can see that People’s Republic of China is the first one.
## Now, lets see top silver awards by country, but in horizontal bar chart.
## Summarizing total silver awards by country
top_countries_silver <- dt[, .(total_silver = sum(awards_silver, na.rm = TRUE)), by = country]
## Selecting the top 5 countries with the most silver awards
top_5_countries_silver <- top_countries_silver[order(-total_silver)][1:5]
## Silver Awards by Top 5 Countries
plot_silver <- ggplot(top_5_countries_silver, aes(x = reorder(country, total_silver), y = total_silver, fill = country)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(
title = "Top 5 Countries by Total Silver Awards",
x = "Country",
y = "Total Silver Awards"
) +
theme_minimal() +
scale_fill_brewer(palette = "Set1") +
theme(axis.text.y = element_text(size = 12),
axis.text.x = element_text(size = 12, angle = 45, hjust = 1)) +
coord_flip()
plot_silver
Now we can see that Hungary leads with over 150 silver awards, while the USA is in 5th place.
## Now let's see the proportion of Total Bronze Awards by Top 5 Countries
# Summarize total bronze awards by country
top_countries_bronze <- dt[, .(total_bronze = sum(awards_bronze, na.rm = TRUE)), by = country]
## Select the top 5 countries with the most bronze awards
top_5_countries_bronze <- top_countries_bronze[order(-total_bronze)][1:5]
## Create a pie chart
plot_bronze_pie <- ggplot(top_5_countries_bronze, aes(x = "", y = total_bronze, fill = country)) +
geom_bar(stat = "identity", width = 1, color = "#636363") +
coord_polar(theta = "y") +
labs(
title = "Proportion of Total Bronze Awards by Top 5 Countries",
x = NULL,
y = NULL
) +
theme_minimal() +
scale_fill_brewer(palette = "Pastel1") +
theme(axis.text.x = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label = scales::percent(total_bronze / sum(total_bronze))),
position = position_stack(vjust = 0.5), color = "#636363")
plot_bronze_pie
As we can see above, the pie chart for the Proportion of Total Bronze Awards by Top 5 Countries displays the relative share of bronze awards won by each of the top 5 countries. Each slice of the pie represents a country’s contribution to the total number of bronze awards, offering a clear view of how the bronze awards are distributed among the leading countries.
## Now let's create a group bar chart and see who are top five countries in 2024.
## Filter data for the year 2024
data_2024 <- dt[year == 2024]
## Summarize gold, silver, and bronze awards by country
awards_2024 <- data_2024[, .(
total_gold = sum(awards_gold, na.rm = TRUE),
total_silver = sum(awards_silver, na.rm = TRUE),
total_bronze = sum(awards_bronze, na.rm = TRUE)
), by = country]
## Add a column for the total awards (gold + silver + bronze)
awards_2024[, total_awards := total_gold + total_silver + total_bronze]
## Select the top 5 countries based on total awards
top_5_awards_2024 <- awards_2024[order(-total_awards)][1:5]
## Melt the data to long format for plotting
awards_2024_long <- melt(top_5_awards_2024[, .(country, total_gold, total_silver, total_bronze)],
id.vars = "country",
variable.name = "award_type",
value.name = "total_awards")
## Create a grouped bar chart for top 5 countries
plot_2024_top5_awards <- ggplot(awards_2024_long, aes(x = country, y = total_awards, fill = award_type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Gold, Silver, and Bronze Awards by Top 5 Countries (2024)",
x = "Country",
y = "Number of Awards",
fill = "Award Type"
) +
scale_fill_brewer(palette = "Set2") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
plot_2024_top5_awards
The grouped bar chart for Gold, Silver, and Bronze Awards by Top 5 Countries (2024) shows the number of each type of award won by the top 5 countries in the year 2024.
## Now let's focus on distriboution by gender. Let's see Distribution of Participation by Gender Over the Years.
## Aggregate the team size by gender (male and female) for each country and year
gender_distribution_by_year <- dt[, .(
total_male = sum(team_size_male, na.rm = TRUE),
total_female = sum(team_size_female, na.rm = TRUE)
), by = year]
## Reshape the data to have gender types in one column
gender_distribution_long <- melt(gender_distribution_by_year, id.vars = "year",
variable.name = "gender", value.name = "total_participants")
# Create a stacked bar chart to show the distribution of male and female participants by year
plot_gender_distribution <- ggplot(gender_distribution_long, aes(x = factor(year), y = total_participants, fill = gender)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("total_male" = "#1f77b4", "total_female" = "#ff7f0e")) +
labs(
title = "Distribution of Participation by Gender Over the Years",
x = "Year",
y = "Total Participants",
fill = "Gender"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x = element_blank(),
plot.title = element_text(hjust = 0.5, face = "bold", size = 16)
)
# Print the plot
print(plot_gender_distribution)
Now, the plot of the Distribution of Participation by Gender Over the Years visualizes the number of male and female participants at the International Mathematical Olympiad each year. The stacked bar chart shows the total number of participants each year, with the height of each bar segment representing the number of male and female participants respectively.
## Now let's focus in individual country. Let's for example Kosovo.
# Filter data for Kosovo
kosovo_data <- dt[country == "Kosovo"]
# Summarize total awards (gold, silver, bronze, honorable mentions) by year
kosovo_awards_by_year <- kosovo_data[, .(
total_gold = sum(awards_gold, na.rm = TRUE),
total_silver = sum(awards_silver, na.rm = TRUE),
total_bronze = sum(awards_bronze, na.rm = TRUE),
total_honorable_mentions = sum(awards_honorable_mentions, na.rm = TRUE)
), by = year]
# Calculate total awards for each year by summing all award types
kosovo_awards_by_year[, total_awards := total_gold + total_silver + total_bronze + total_honorable_mentions]
# Create a plot to show the total awards by year for Kosovo
plot_kosovo_awards_by_year <- ggplot(kosovo_awards_by_year, aes(x = year, y = total_awards)) +
geom_line(color = "#0073e6", linewidth = 1.2) + # Blue line for total awards
geom_point(color = "#ff5733", linewidth = 3) + # Red points to highlight the years
labs(
title = "Total Awards Won by Kosovo by Year",
x = "Year",
y = "Total Awards"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14)
)
# Display the plot
plot_kosovo_awards_by_year
As we can see above, the plot of Total Awards Won by Kosovo by Year shows the total number of awards won by Kosovo each year at the International Mathematical Olympiad. The plot uses a line graph with points to display the trend in awards over time.
individual_results_df <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-09-24/individual_results_df.csv', show_col_types = FALSE)
# Next step is to merge, so let's do by the following code. I will merge bon 'year and country'
merged_df <- merge(country_results_df, individual_results_df, by = c("year", "country"))
## Let's see Individual vs Country Performance
# Scatter plot for individual total scores vs country awards
individual_vs_country <- merged_df %>%
group_by(country) %>%
summarize(
avg_individual_score = mean(total, na.rm = TRUE),
total_country_awards = sum(awards_gold + awards_silver + awards_bronze, na.rm = TRUE)
)
# Plot
ggplot(individual_vs_country, aes(x = avg_individual_score, y = total_country_awards, label = country)) +
geom_point(size = 3, color = "#7fcdbb") +
geom_text(size = 3, hjust = -0.1) +
labs(
title = "Individual vs Country Performance",
x = "Average Individual Score",
y = "Total Country Awards"
) +
theme_minimal()
As we can see the scatter plot for Individual vs Country Performance allows for observation of a correlation between a country’s average individual scores and its total country awards. The plot shows each country represented by a point, with its horizontal position representing the average individual score of all participants from that country, and its vertical position representing the total number of awards (gold, silver, and bronze) won by that country.
## Now, let's see the Distribution of Total Score by Country, focusing in Albania and Kosovo
# Set up a dynamic color palette using colorRampPalette
color_palette <- colorRampPalette(brewer.pal(9, "Set3"))(2) # Only two countries: Albania and Kosovo
# Filter data for Albania and Kosovo
merged_df_filtered <- merged_df %>%
filter(country %in% c("Albania", "Kosovo"))
# Create the box plot
ggplot(merged_df_filtered, aes(x = country, y = total, fill = country)) +
geom_boxplot(outlier.size = 2, outlier.colour = "#de2d26", alpha = 0.7) + # Box plot with outliers highlighted
scale_fill_manual(values = color_palette) + # Dynamically apply the color palette
labs(
title = "Distribution of Total Scores by Country",
subtitle = "Comparing the spread of scores between Albania and Kosovo",
x = "Country",
y = "Total Score",
caption = "Data from International Mathematical Olympiad"
) +
theme_minimal(base_size = 14) + # Apply a minimal theme with adjusted base size for readability
theme(
legend.position = "none", # No legend needed for this plot
axis.text.x = element_text(angle = 90, hjust = 1) # Rotate x-axis labels for better readability
)
Now, we see the box plot of the Distribution of Total Scores by Country compares the spread of total scores between Albania and Kosovo. Based on this, Kosovo’s scores appear to have a wider range, as indicated by the larger box and more spread-out outliers. This suggests there is more variability in individual performance of Kosovo participants. Albania’s scores are more tightly clustered, with less variation around the median. The outliers are also less dispersed compared to Kosovo. This suggests more consistency in the scores achieved by Albanian participants.