Load required the libraries
library(tidyr)
library(treemapify)
library(tidyverse)
library(rvest)
library(dplyr)
library(DT)
library(fmsb)
library(kableExtra)
Wikipedia page URL
url <- "https://en.wikipedia.org/wiki/2024_Summer_Olympics_medal_table"
# Read the full HTML of the page
page <- read_html(url)
# Extract all tables
tables <- page %>% html_elements("table")
Convert all the tables in the page to data frames
all_tables <- lapply(tables, html_table, fill = TRUE)
medal_table <- all_tables[[4]]
# view the data
head(medal_table)
Remove the last row of the data
df_new <- medal_table[-nrow(medal_table), ]
Transform the character variable to integers
df_new$Total = as.numeric(df_new$Total)
df_new$Rank = as.numeric(df_new$Rank)
# view the structure of the data
str(df_new)
## tibble [92 × 6] (S3: tbl_df/tbl/data.frame)
## $ Rank : num [1:92] 1 2 3 4 5 6 7 8 9 10 ...
## $ NOC : chr [1:92] "United States‡" "China" "Japan" "Australia" ...
## $ Gold : int [1:92] 40 40 20 18 16 15 14 13 12 12 ...
## $ Silver: int [1:92] 44 27 12 19 26 7 22 9 13 13 ...
## $ Bronze: int [1:92] 42 24 13 16 22 12 29 10 15 8 ...
## $ Total : num [1:92] 126 91 45 53 64 34 65 32 40 33 ...
Top 10 Countries by Gold Medals
top10_gold <- df_new %>%
arrange(desc(Gold)) %>%
slice(1:10)
print(top10_gold)
## # A tibble: 10 × 6
## Rank NOC Gold Silver Bronze Total
## <dbl> <chr> <int> <int> <int> <dbl>
## 1 1 United States‡ 40 44 42 126
## 2 2 China 40 27 24 91
## 3 3 Japan 20 12 13 45
## 4 4 Australia 18 19 16 53
## 5 5 France* 16 26 22 64
## 6 6 Netherlands 15 7 12 34
## 7 7 Great Britain 14 22 29 65
## 8 8 South Korea 13 9 10 32
## 9 9 Italy 12 13 15 40
## 10 10 Germany 12 13 8 33
# Create scrollable table
# Create a scrollable table
kable(top10_gold, "html") %>%
kable_styling(bootstrap_options = c("striped", "hover",
"condensed","responsive")) %>%
scroll_box(width = "100%", height = "300px")
| Rank | NOC | Gold | Silver | Bronze | Total |
|---|---|---|---|---|---|
| 1 | United States‡ | 40 | 44 | 42 | 126 |
| 2 | China | 40 | 27 | 24 | 91 |
| 3 | Japan | 20 | 12 | 13 | 45 |
| 4 | Australia | 18 | 19 | 16 | 53 |
| 5 | France* | 16 | 26 | 22 | 64 |
| 6 | Netherlands | 15 | 7 | 12 | 34 |
| 7 | Great Britain | 14 | 22 | 29 | 65 |
| 8 | South Korea | 13 | 9 | 10 | 32 |
| 9 | Italy | 12 | 13 | 15 | 40 |
| 10 | Germany | 12 | 13 | 8 | 33 |
Visualize the top 10 countries by Total Medals
library(ggplot2)
bar_colors <- rainbow(10)
ggplot(top10_gold, aes(x = reorder(NOC, Total), y = Total, fill = NOC)) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values = bar_colors) +
coord_flip() +
labs(
title = "Top 10 Countries by Total Medals - Paris 2024 ",
x = "Country", y = "Total Medals"
)
Top 5 Countries by Gold Medals
top5_gold <- df_new %>%
arrange(desc(Gold)) %>%
slice(1:5)
print(top5_gold)
## # A tibble: 5 × 6
## Rank NOC Gold Silver Bronze Total
## <dbl> <chr> <int> <int> <int> <dbl>
## 1 1 United States‡ 40 44 42 126
## 2 2 China 40 27 24 91
## 3 3 Japan 20 12 13 45
## 4 4 Australia 18 19 16 53
## 5 5 France* 16 26 22 64
Top 5 Countries by Gold Medals
# Create a scrollable table
kable(top5_gold, "html") %>%
kable_styling(bootstrap_options = c("striped", "hover",
"condensed","responsive")) %>%
scroll_box(width = "100%", height = "300px")
| Rank | NOC | Gold | Silver | Bronze | Total |
|---|---|---|---|---|---|
| 1 | United States‡ | 40 | 44 | 42 | 126 |
| 2 | China | 40 | 27 | 24 | 91 |
| 3 | Japan | 20 | 12 | 13 | 45 |
| 4 | Australia | 18 | 19 | 16 | 53 |
| 5 | France* | 16 | 26 | 22 | 64 |
# Create donut chart
ggplot(top5_gold, aes(x = 2, y = Gold, fill = NOC)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y", start = 0) +
xlim(0.5, 2.5) + # Creates the donut hole
theme_void() +
geom_text(aes(label = Gold),
position = position_stack(vjust = 0.5),
color = "white", size = 5) +
theme(legend.title = element_blank()) +
ggtitle("Top 5 Countries by Total Medals - Paris 2024") +
theme(plot.title = element_text(hjust = 0.5))
# Select Top 5 Countries by Bronze count
top5_bronze <- df_new %>%
arrange(desc(Bronze)) %>%
slice(1:5)
# Expand rows based on Bronze counts
bronze_expanded <- top5_bronze %>%
rowwise() %>%
mutate(values = list(rep(NOC, Bronze))) %>%
unnest(values)
# Create box plot
ggplot(bronze_expanded, aes(x = NOC, y = Bronze, fill = NOC)) +
geom_boxplot() +
labs(
title = "Top 5 Countries by Bronze Medals",
x = "Country",
y = "Number of Bronze Medals"
) +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5))
Medal Count Region
continent_map <- tibble::tribble(
~NOC, ~Continent,
"United States", "North America",
"China", "Asia",
"Japan", "Asia",
"Australia", "Oceania",
"Canada", "North America",
"Brazil", "South America",
"Kenya", "Africa",
"Ethiopia", "Africa",
"Great Britain", "Europe",
"France", "Europe",
"Germany", "Europe",
"Italy", "Europe",
"Netherlands", "Europe",
"South Korea", "Asia",
"Hungary", "Europe",
"Cuba", "North America",
"South Africa", "Africa"
)
df_continent <- df_new %>%
left_join(continent_map, by = "NOC") %>%
filter(!is.na(Continent)) %>%
group_by(Continent) %>%
summarise(Total_Medals = sum(Total, na.rm = TRUE)) %>%
arrange(desc(Total_Medals))
df_continent
# Merge and summarize the data
ggplot(df_continent, aes(x = reorder(Continent, Total_Medals),
y = Total_Medals, fill = Continent)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(title = "Total Medals by Continent - Paris 2024",
x = "Continent", y = "Total Medals") +
theme_minimal()
# Keep only top 5 continents by total medals
top5_continents <- df_continent %>%
arrange(desc(Total_Medals)) %>%
slice(1:5)
# Donut chart
ggplot(top5_continents, aes(x = 2, y = Total_Medals, fill = Continent)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y") +
xlim(0.5, 2.5) +
theme_void() +
geom_text(aes(label = paste0(Continent, "\n", round(Total_Medals / sum(Total_Medals) * 100, 1), "%")),
position = position_stack(vjust = 0.5), size = 4, color = "white") +
ggtitle("Share of Medals by Top 5 Continents - Paris 2024") +
theme(plot.title = element_text(hjust = 0.5))
ggplot(df_new, aes(x = Rank, y = Total)) +
geom_point(color = "steelblue", size = 3) +
geom_text(aes(label = NOC), vjust = -0.5, size = 3) +
labs(title = "Rank vs Total Medals - Paris 2024", x = "Rank", y = "Total Medals") +
theme_minimal()+
theme(plot.title = element_text(hjust = 0.5))
# Sample data
top10 <- df_new %>%
arrange(desc(Total)) %>%
slice_head(n = 10)
ggplot(top10, aes(x = reorder(NOC, Total), y = Total)) +
geom_segment(aes(xend = NOC, y = 0, yend = Total), color = "grey") +
geom_point(size = 5, color = "steelblue") +
coord_flip() +
labs(title = "Top 10 Countries by Total Medals",
x = "", y = "Total Medals") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
# Prepare data - top 10 countries by total medals
top10 <- df_new %>%
arrange(desc(Total)) %>%
slice_head(n = 10)
# Create treemap
ggplot(top10, aes(area = Total, fill = NOC, label = paste(NOC, "\n", Total))) +
geom_treemap(color = "white") +
geom_treemap_text(colour = "white", place = "centre", grow = TRUE) +
labs(title = "Top 10 Countries by Total Medals (Treemap)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "none")
# Select top 5 countries by total medals
top_countries <- df_new %>%
arrange(desc(Total)) %>%
slice_head(n = 5) %>%
select(NOC, Gold, Silver, Bronze)
# fmsb requires first two rows to be max and min values for scaling
max_vals <- c( max(top_countries$Gold),
max(top_countries$Silver),
max(top_countries$Bronze) )
min_vals <- c(0, 0, 0)
# Prepare data
radar_data <- rbind(max_vals, min_vals, top_countries[, -1])
rownames(radar_data) <- c("Max", "Min", top_countries$NOC)
# Plot Radar Chart
radarchart(radar_data,
axistype = 1,
pcol = rainbow(nrow(radar_data) - 2), # colors for countries
plwd = 2,
plty = 1,
cglcol = "grey", cglty = 1, axislabcol = "black",
cglwd = 0.8,
vlcex = 0.9,
title = "Medal Distribution")
legend("topright", legend = rownames(radar_data)[-c(1, 2)],
col = rainbow(nrow(radar_data) - 2), lty = 1, lwd = 2, cex = 0.8)
# Sample 10 random countries
df_sample <- df_new %>%
sample_n(20) %>%
mutate(Other = Total - Gold) %>%
select(NOC, Gold, Other) %>%
pivot_longer(cols = c(Gold, Other), names_to = "Medal_Type", values_to = "Count")
# Plot stacked 100% bar chart
ggplot(df_sample, aes(x = NOC, y = Count, fill = Medal_Type)) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(
title = "Gold-to-Total Medal Ratio by Country (Sample 20)",
x = "Country",
y = "Percentage"
) +
scale_fill_manual(values = c("Gold" = "gold", "Other" = "gray70")) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1)
)