Project 2 b

Introduction: The Country Results dataset (source: https://github.com/rfordatascience/tidytuesday/blob/master/data/2024/2024-09-24/country_results_df.csv) contains team and individual performance data from various countries participating in competitions. It includes:

- Team size (all, male, female) and participant scores (p1 to p7).
- Award counts: gold, silver, bronze, and honorable mentions.
- Leaders and deputy leaders for each country’s team.

# Load necessary libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(stringr)

# Step 1: Load the Dataset
# Use the local copy when it is present; otherwise read the same CSV from the
# TidyTuesday repository so the script also runs on other machines.
# NOTE(review): the remote URL is the raw-file form of the link given in the
# introduction — confirm the branch name ("main") is still current.
local_csv <- "/Users/aribarazzaq/Desktop/country_results_df.csv"
remote_csv <- paste0(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/",
  "main/data/2024/2024-09-24/country_results_df.csv"
)
# read.csv() accepts either a file path or a URL and returns a data.frame.
country_data <- read.csv(if (file.exists(local_csv)) local_csv else remote_csv)

# Inspect the structure and first few rows
str(country_data)

## 'data.frame':    3780 obs. of  18 variables:
##  $ year                     : int  2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
##  $ country                  : chr  "United States of America" "People's Republic of China" "Republic of Korea" "India" ...
##  $ team_size_all            : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ team_size_male           : int  5 6 6 6 6 6 6 6 6 5 ...
##  $ team_size_female         : int  1 0 0 0 0 0 0 0 0 1 ...
##  $ p1                       : int  42 42 42 42 42 42 42 42 42 38 ...
##  $ p2                       : int  41 42 37 34 30 37 33 37 25 37 ...
##  $ p3                       : int  19 31 18 11 10 7 8 16 5 5 ...
##  $ p4                       : int  40 40 42 42 42 42 42 36 42 42 ...
##  $ p5                       : int  35 22 7 28 36 29 31 23 35 12 ...
##  $ p6                       : int  15 13 22 10 5 5 6 1 2 17 ...
##  $ p7                       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ awards_gold              : int  5 5 2 4 4 1 2 2 1 2 ...
##  $ awards_silver            : int  1 1 4 1 0 5 3 3 4 2 ...
##  $ awards_bronze            : int  0 0 0 0 2 0 1 1 1 2 ...
##  $ awards_honorable_mentions: int  0 0 0 1 0 0 0 0 0 0 ...
##  $ leader                   : chr  "John Berman" "Liang Xiao" "Suyoung Choi" "Krishnan Sivasubramanian" ...
##  $ deputy_leader            : chr  "Carl Schildkraut" "Yijun Yao" "Hwajong Yoo" "Rijul Saini" ...

# Show the first six rows of the raw data
head(country_data, n = 6L)

##   year                    country team_size_all team_size_male team_size_female
## 1 2024   United States of America             6              5                1
## 2 2024 People's Republic of China             6              6                0
## 3 2024          Republic of Korea             6              6                0
## 4 2024                      India             6              6                0
## 5 2024                    Belarus             6              6                0
## 6 2024                  Singapore             6              6                0
##   p1 p2 p3 p4 p5 p6 p7 awards_gold awards_silver awards_bronze
## 1 42 41 19 40 35 15 NA           5             1             0
## 2 42 42 31 40 22 13 NA           5             1             0
## 3 42 37 18 42  7 22 NA           2             4             0
## 4 42 34 11 42 28 10 NA           4             1             0
## 5 42 30 10 42 36  5 NA           4             0             2
## 6 42 37  7 42 29  5 NA           1             5             0
##   awards_honorable_mentions                   leader    deputy_leader
## 1                         0              John Berman Carl Schildkraut
## 2                         0               Liang Xiao        Yijun Yao
## 3                         0             Suyoung Choi      Hwajong Yoo
## 4                         1 Krishnan Sivasubramanian      Rijul Saini
## 5                         0           David Zmiaikou  Dzmitry Bazyleu
## 6                         0           Yong Sheng Soh    Teck Kian Teo

# Step 2: Tidy the Data
# Reshape the score columns (p1 to p7) from wide to long format: each score
# column becomes its own row, with the source column name stored in
# `participant` and its value in `score`.
# NOTE(review): the values look like per-problem team totals rather than
# per-participant scores — confirm against the data dictionary. The column
# name `participant` is kept so the shape of `country_long` is unchanged.
country_long <- country_data %>%
  pivot_longer(
    # Select only names made of "p" followed by digits. starts_with("p") would
    # also sweep in any other column whose name happens to begin with "p".
    cols = matches("^p[0-9]+$"),
    names_to = "participant",
    values_to = "score"
  )

# Preview the tidied data
head(country_long)

## # A tibble: 6 × 13
##    year country        team_size_all team_size_male team_size_female awards_gold
##   <int> <chr>                  <int>          <int>            <int>       <int>
## 1  2024 United States…             6              5                1           5
## 2  2024 United States…             6              5                1           5
## 3  2024 United States…             6              5                1           5
## 4  2024 United States…             6              5                1           5
## 5  2024 United States…             6              5                1           5
## 6  2024 United States…             6              5                1           5
## # ℹ 7 more variables: awards_silver <int>, awards_bronze <int>,
## #   awards_honorable_mentions <int>, leader <chr>, deputy_leader <chr>,
## #   participant <chr>, score <int>

# Step 3: Data Transformation and Analysis

## 3.1: Mean score per country, highest first
# Missing scores (e.g. p7 in years where it is NA) are ignored in the mean.
average_scores <- country_long %>%
  group_by(country) %>%
  summarise(
    avg_score = mean(score, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_score))

# Display the ranked averages
print(average_scores)

## # A tibble: 139 × 2
##    country                             avg_score
##    <chr>                                   <dbl>
##  1 Union of Soviet Socialist Republics      38.7
##  2 People's Republic of China               34.2
##  3 United States of America                 32.3
##  4 German Democratic Republic               32.0
##  5 Russian Federation                       30.7
##  6 Commonwealth of Independent States       29.3
##  7 Hungary                                  28.5
##  8 Romania                                  28.3
##  9 Republic of Korea                        27.8
## 10 Vietnam                                  26.4
## # ℹ 129 more rows

## 3.2: Total awards per country
# Sum every awards_* column within each country (ignoring NAs), then rename
# awards_<kind> to total_<kind>.
awards_summary <- country_data %>%
  group_by(country) %>%
  summarise(
    across(starts_with("awards_"), ~ sum(.x, na.rm = TRUE)),
    .groups = "drop"
  ) %>%
  rename_with(~ sub("^awards_", "total_", .x), starts_with("awards_"))

# Display the per-country award totals
print(awards_summary)

## # A tibble: 139 × 5
##    country    total_gold total_silver total_bronze total_honorable_mentions
##    <chr>           <int>        <int>        <int>                    <int>
##  1 Albania             0            2            9                       45
##  2 Algeria             1            2            8                       24
##  3 Angola              0            0            0                        0
##  4 Argentina           6           28           72                       51
##  5 Armenia             1           30           76                       46
##  6 Australia          29           78          102                       21
##  7 Austria            13           40          112                       64
##  8 Azerbaijan          0           10           45                       68
##  9 Bahrain             0            0            0                        1
## 10 Bangladesh          1            7           37                       44
## # ℹ 129 more rows

# Step 4: Visualization - Average Participant Scores by Country

# Keep the first 20 rows of average_scores (already sorted by descending
# average) and wrap long country names so they fit on the axis.
top_countries <- average_scores %>%
  head(20) %>%
  mutate(
    country = str_wrap(country, width = 15)
  )

# Horizontal bar chart of the top 20 countries, ordered and shaded by
# average score.
ggplot(
  top_countries,
  aes(
    x = reorder(country, avg_score),
    y = avg_score,
    fill = avg_score
  )
) +
  geom_col() +  # Bar heights come directly from avg_score
  coord_flip() +  # Countries on the vertical axis
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(
    title = "Top 20 Countries by Average Participant Score",
    x = "Country",
    y = "Average Score"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 10),
    legend.position = "none"  # Fill duplicates bar length, so no legend
  )

Data Tidying:

Used pivot_longer() to reshape participant scores into long format.

Data Transformation:

Calculated average participant scores per country. Summarized total awards (gold, silver, bronze, honorable mentions) per country.

Visualization:

Created a horizontal bar plot with the top 20 countries by average score. Used color gradients to improve visual appeal and wrapped long country names for better readability.