Introduction: https://github.com/rfordatascience/tidytuesday/blob/master/data/2024/2024-09-24/country_results_df.csv The Country Results Dataset contains team and individual performance data from various countries participating in competitions. It includes:
Team size (all, male, female) and participant scores (p1 to p7). Award counts: Gold, silver, bronze, and honorable mentions. Leaders and deputy leaders for each country’s team.
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
# Step 1: Load the Dataset
country_data <- read.csv("/Users/aribarazzaq/Desktop/country_results_df.csv")
# Inspect the structure and first few rows
str(country_data)
## 'data.frame': 3780 obs. of 18 variables:
## $ year : int 2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
## $ country : chr "United States of America" "People's Republic of China" "Republic of Korea" "India" ...
## $ team_size_all : int 6 6 6 6 6 6 6 6 6 6 ...
## $ team_size_male : int 5 6 6 6 6 6 6 6 6 5 ...
## $ team_size_female : int 1 0 0 0 0 0 0 0 0 1 ...
## $ p1 : int 42 42 42 42 42 42 42 42 42 38 ...
## $ p2 : int 41 42 37 34 30 37 33 37 25 37 ...
## $ p3 : int 19 31 18 11 10 7 8 16 5 5 ...
## $ p4 : int 40 40 42 42 42 42 42 36 42 42 ...
## $ p5 : int 35 22 7 28 36 29 31 23 35 12 ...
## $ p6 : int 15 13 22 10 5 5 6 1 2 17 ...
## $ p7 : int NA NA NA NA NA NA NA NA NA NA ...
## $ awards_gold : int 5 5 2 4 4 1 2 2 1 2 ...
## $ awards_silver : int 1 1 4 1 0 5 3 3 4 2 ...
## $ awards_bronze : int 0 0 0 0 2 0 1 1 1 2 ...
## $ awards_honorable_mentions: int 0 0 0 1 0 0 0 0 0 0 ...
## $ leader : chr "John Berman" "Liang Xiao" "Suyoung Choi" "Krishnan Sivasubramanian" ...
## $ deputy_leader : chr "Carl Schildkraut" "Yijun Yao" "Hwajong Yoo" "Rijul Saini" ...
head(country_data)
## year country team_size_all team_size_male team_size_female
## 1 2024 United States of America 6 5 1
## 2 2024 People's Republic of China 6 6 0
## 3 2024 Republic of Korea 6 6 0
## 4 2024 India 6 6 0
## 5 2024 Belarus 6 6 0
## 6 2024 Singapore 6 6 0
## p1 p2 p3 p4 p5 p6 p7 awards_gold awards_silver awards_bronze
## 1 42 41 19 40 35 15 NA 5 1 0
## 2 42 42 31 40 22 13 NA 5 1 0
## 3 42 37 18 42 7 22 NA 2 4 0
## 4 42 34 11 42 28 10 NA 4 1 0
## 5 42 30 10 42 36 5 NA 4 0 2
## 6 42 37 7 42 29 5 NA 1 5 0
## awards_honorable_mentions leader deputy_leader
## 1 0 John Berman Carl Schildkraut
## 2 0 Liang Xiao Yijun Yao
## 3 0 Suyoung Choi Hwajong Yoo
## 4 1 Krishnan Sivasubramanian Rijul Saini
## 5 0 David Zmiaikou Dzmitry Bazyleu
## 6 0 Yong Sheng Soh Teck Kian Teo
# Step 2: Tidy the Data
# Pivot participant scores (p1 to p7) from wide to long format
country_long <- country_data %>%
pivot_longer(
cols = starts_with("p"), # Pivot all p1 to p7 columns
names_to = "participant",
values_to = "score"
)
# Preview the tidied data
head(country_long)
## # A tibble: 6 × 13
## year country team_size_all team_size_male team_size_female awards_gold
## <int> <chr> <int> <int> <int> <int>
## 1 2024 United States… 6 5 1 5
## 2 2024 United States… 6 5 1 5
## 3 2024 United States… 6 5 1 5
## 4 2024 United States… 6 5 1 5
## 5 2024 United States… 6 5 1 5
## 6 2024 United States… 6 5 1 5
## # ℹ 7 more variables: awards_silver <int>, awards_bronze <int>,
## # awards_honorable_mentions <int>, leader <chr>, deputy_leader <chr>,
## # participant <chr>, score <int>
# Step 3: Data Transformation and Analysis
## 3.1: Calculate the average participant score per country
average_scores <- country_long %>%
group_by(country) %>%
summarise(avg_score = mean(score, na.rm = TRUE)) %>%
arrange(desc(avg_score))
# View the average scores
print(average_scores)
## # A tibble: 139 × 2
## country avg_score
## <chr> <dbl>
## 1 Union of Soviet Socialist Republics 38.7
## 2 People's Republic of China 34.2
## 3 United States of America 32.3
## 4 German Democratic Republic 32.0
## 5 Russian Federation 30.7
## 6 Commonwealth of Independent States 29.3
## 7 Hungary 28.5
## 8 Romania 28.3
## 9 Republic of Korea 27.8
## 10 Vietnam 26.4
## # ℹ 129 more rows
## 3.2: Summarize total awards by country
awards_summary <- country_data %>%
group_by(country) %>%
summarise(
total_gold = sum(awards_gold, na.rm = TRUE),
total_silver = sum(awards_silver, na.rm = TRUE),
total_bronze = sum(awards_bronze, na.rm = TRUE),
total_honorable_mentions = sum(awards_honorable_mentions, na.rm = TRUE)
)
# View the awards summary
print(awards_summary)
## # A tibble: 139 × 5
## country total_gold total_silver total_bronze total_honorable_mentions
## <chr> <int> <int> <int> <int>
## 1 Albania 0 2 9 45
## 2 Algeria 1 2 8 24
## 3 Angola 0 0 0 0
## 4 Argentina 6 28 72 51
## 5 Armenia 1 30 76 46
## 6 Australia 29 78 102 21
## 7 Austria 13 40 112 64
## 8 Azerbaijan 0 10 45 68
## 9 Bahrain 0 0 0 1
## 10 Bangladesh 1 7 37 44
## # ℹ 129 more rows
# Step 4: Visualization - Average Participant Scores by Country
# Filter the top 20 countries by average score and wrap long names
top_countries <- average_scores %>%
slice_head(n = 20) %>%
mutate(country = str_wrap(country, width = 15)) # Wrap long names for readability
# Create a polished plot for top 20 countries by average score
ggplot(top_countries, aes(x = reorder(country, avg_score), y = avg_score, fill = avg_score)) +
geom_bar(stat = "identity") +
theme_minimal(base_size = 15) + # Adjust base font size for readability
coord_flip() + # Horizontal bars for better fit
scale_fill_gradient(low = "lightblue", high = "darkblue") + # Color gradient
labs(
title = "Top 20 Countries by Average Participant Score",
x = "Country", y = "Average Score"
) +
theme(
axis.text.y = element_text(size = 10), # Adjust y-axis text size
axis.text.x = element_text(size = 12), # Adjust x-axis text size
plot.title = element_text(hjust = 0.5, size = 18, face = "bold"), # Centered, bold title
legend.position = "none" # Remove legend for simplicity
)
Data Tidying:
Used pivot_longer() to reshape participant scores into long format. Data Transformation:
Calculated average participant scores per country. Summarized total awards (gold, silver, bronze, honorable mentions) per country. Visualization:
Created a horizontal bar plot with the top 20 countries by average score. Used color gradients to improve visual appeal and wrapped long country names for better readability.