The dataset (athlete_events.csv) comes from the 120
Years of Olympic History Kaggle dataset.
library(tidyverse)
library(plotly)
# Download the data
url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-07-27/olympics.csv"
olympics <- read_csv(url)
# Preview
glimpse(olympics)
## Rows: 271,116
## Columns: 15
## $ id <dbl> 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, …
## $ name <chr> "A Dijiang", "A Lamusi", "Gunnar Nielsen Aaby", "Edgar Lindenau…
## $ sex <chr> "M", "M", "M", "M", "F", "F", "F", "F", "F", "F", "M", "M", "M"…
## $ age <dbl> 24, 23, 24, 34, 21, 21, 25, 25, 27, 27, 31, 31, 31, 31, 33, 33,…
## $ height <dbl> 180, 170, NA, NA, 185, 185, 185, 185, 185, 185, 188, 188, 188, …
## $ weight <dbl> 80, 60, NA, NA, 82, 82, 82, 82, 82, 82, 75, 75, 75, 75, 75, 75,…
## $ team <chr> "China", "China", "Denmark", "Denmark/Sweden", "Netherlands", "…
## $ noc <chr> "CHN", "CHN", "DEN", "DEN", "NED", "NED", "NED", "NED", "NED", …
## $ games <chr> "1992 Summer", "2012 Summer", "1920 Summer", "1900 Summer", "19…
## $ year <dbl> 1992, 2012, 1920, 1900, 1988, 1988, 1992, 1992, 1994, 1994, 199…
## $ season <chr> "Summer", "Summer", "Summer", "Summer", "Winter", "Winter", "Wi…
## $ city <chr> "Barcelona", "London", "Antwerpen", "Paris", "Calgary", "Calgar…
## $ sport <chr> "Basketball", "Judo", "Football", "Tug-Of-War", "Speed Skating"…
## $ event <chr> "Basketball Men's Basketball", "Judo Men's Extra-Lightweight", …
## $ medal <chr> NA, NA, NA, "Gold", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
Remove rows with missing age, keep only useful columns, and add a
Medal_won flag.
athletes_clean <- olympics %>%
filter(!is.na(age)) %>%
mutate(
Medal_won = if_else(!is.na(medal), "Medal", "No medal"),
Decade = (year %/% 10) * 10
)
A box plot showing the spread of athlete ages across the top sports. Hover over any box to see the median, quartiles, and outliers.
top_sports <- athletes_clean %>%
count(sport, sort = TRUE) %>%
slice_head(n = 12) %>%
pull(sport)
p1 <- athletes_clean %>%
filter(sport %in% top_sports) %>%
plot_ly(
x = ~reorder(sport, age, median),
y = ~age,
color = ~sex,
colors = c("F" = "#D4537E", "M" = "#185FA5"),
type = "box",
boxpoints = "outliers",
hovertemplate = "<b>%{x}</b><br>Age: %{y}<extra></extra>"
) %>%
layout(
title = "Age distribution by sport and gender",
xaxis = list(title = "", tickangle = -35),
yaxis = list(title = "Age"),
boxmode = "group",
legend = list(title = list(text = "Gender"))
)
p1
A line chart showing whether athletes are getting older or younger across Olympic history, split by Summer vs Winter games.
age_trend <- athletes_clean %>%
group_by(year, season) %>%
summarise(
mean_age = mean(age),
se = sd(age) / sqrt(n()),
.groups = "drop"
)
p2 <- age_trend %>%
plot_ly(
x = ~year,
y = ~mean_age,
color = ~season,
colors = c("Summer" = "#BA7517", "Winter" = "#185FA5"),
type = "scatter",
mode = "lines+markers",
error_y = list(
type = "data",
array = ~se * 1.96,
visible = TRUE,
thickness = 1
),
hovertemplate = "<b>%{x}</b><br>Mean age: %{y:.1f}<extra></extra>"
) %>%
layout(
title = "Mean athlete age over time",
xaxis = list(title = "Year"),
yaxis = list(title = "Mean age (years)"),
legend = list(title = list(text = "Season"))
)
p2
How has the share of female athletes changed since 1896?
gender_trend <- olympics %>%
distinct(name, year, sex, season) %>%
count(year, season, sex) %>%
group_by(year, season) %>%
mutate(pct = n / sum(n) * 100) %>%
ungroup() %>%
filter(sex == "F")
p3 <- gender_trend %>%
plot_ly(
x = ~year,
y = ~pct,
color = ~season,
colors = c("Summer" = "#BA7517", "Winter" = "#185FA5"),
type = "scatter",
mode = "lines+markers",
fill = "tozeroy",
alpha = 0.4,
hovertemplate = "<b>%{x}</b><br>Female athletes: %{y:.1f}%<extra></extra>"
) %>%
layout(
title = "Percentage of female athletes over time",
xaxis = list(title = "Year"),
yaxis = list(title = "% female", range = c(0, 55)),
legend = list(title = list(text = "Season"))
)
p3
Select a sport from the dropdown to see whether older or younger athletes tend to win medals.
sports_list <- athletes_clean %>%
count(sport, sort = TRUE) %>%
slice_head(n = 15) %>%
pull(sport)
traces <- list()
buttons <- list()
for (i in seq_along(sports_list)) {
sp <- sports_list[i]
df <- athletes_clean %>% filter(sport == sp)
traces[[i]] <- list(
x = df$age,
y = jitter(as.numeric(factor(df$sex)), amount = 0.15),
type = "scatter",
mode = "markers",
name = sp,
visible = (i == 1),
marker = list(
color = ifelse(df$Medal_won == "Medal", "#BA7517", "#B5D4F4"),
size = 6,
opacity = 0.65,
line = list(width = 0)
),
text = paste0("Age: ", df$age,
"<br>Gender: ", df$sex,
"<br>Year: ", df$year,
"<br>", df$Medal_won),
hoverinfo = "text"
)
vis_vec <- rep(FALSE, length(sports_list))
vis_vec[i] <- TRUE
buttons[[i]] <- list(
method = "update",
args = list(list(visible = as.list(vis_vec)),
list(title = paste("Age vs gender —", sp))),
label = sp
)
}
p4 <- plot_ly()
for (tr in traces) {
p4 <- add_trace(p4,
x = tr$x,
y = tr$y,
type = tr$type,
mode = tr$mode,
name = tr$name,
visible = tr$visible,
marker = tr$marker,
text = tr$text,
hoverinfo = tr$hoverinfo
)
}
p4 <- p4 %>%
layout(
title = paste("Age vs gender —", sports_list[1]),
xaxis = list(title = "Age"),
yaxis = list(
title = "Gender",
tickvals = c(1, 2),
ticktext = c("F", "M")
),
updatemenus = list(list(
type = "dropdown",
buttons = buttons,
x = 0.01,
xanchor = "left",
y = 1.15,
yanchor = "top"
)),
legend = list(title = list(text = ""))
)
p4