Setup

The dataset (athlete_events.csv) comes from the 120 Years of Olympic History Kaggle dataset.

library(tidyverse)
library(plotly)

# Download the data
url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-07-27/olympics.csv"
olympics <- read_csv(url)

# Preview
glimpse(olympics)
## Rows: 271,116
## Columns: 15
## $ id     <dbl> 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, …
## $ name   <chr> "A Dijiang", "A Lamusi", "Gunnar Nielsen Aaby", "Edgar Lindenau…
## $ sex    <chr> "M", "M", "M", "M", "F", "F", "F", "F", "F", "F", "M", "M", "M"…
## $ age    <dbl> 24, 23, 24, 34, 21, 21, 25, 25, 27, 27, 31, 31, 31, 31, 33, 33,…
## $ height <dbl> 180, 170, NA, NA, 185, 185, 185, 185, 185, 185, 188, 188, 188, …
## $ weight <dbl> 80, 60, NA, NA, 82, 82, 82, 82, 82, 82, 75, 75, 75, 75, 75, 75,…
## $ team   <chr> "China", "China", "Denmark", "Denmark/Sweden", "Netherlands", "…
## $ noc    <chr> "CHN", "CHN", "DEN", "DEN", "NED", "NED", "NED", "NED", "NED", …
## $ games  <chr> "1992 Summer", "2012 Summer", "1920 Summer", "1900 Summer", "19…
## $ year   <dbl> 1992, 2012, 1920, 1900, 1988, 1988, 1992, 1992, 1994, 1994, 199…
## $ season <chr> "Summer", "Summer", "Summer", "Summer", "Winter", "Winter", "Wi…
## $ city   <chr> "Barcelona", "London", "Antwerpen", "Paris", "Calgary", "Calgar…
## $ sport  <chr> "Basketball", "Judo", "Football", "Tug-Of-War", "Speed Skating"…
## $ event  <chr> "Basketball Men's Basketball", "Judo Men's Extra-Lightweight", …
## $ medal  <chr> NA, NA, NA, "Gold", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…

Data cleaning

Remove rows with missing age, keep only useful columns, and add a Medal_won flag.

athletes_clean <- olympics %>%
  filter(!is.na(age)) %>%
  mutate(
    Medal_won = if_else(!is.na(medal), "Medal", "No medal"),
    Decade     = (year %/% 10) * 10
  )

Plot 1 — Age distribution by sport

A box plot showing the spread of athlete ages across the top sports. Hover over any box to see the median, quartiles, and outliers.

top_sports <- athletes_clean %>%
  count(sport, sort = TRUE) %>%
  slice_head(n = 12) %>%
  pull(sport)

p1 <- athletes_clean %>%
  filter(sport %in% top_sports) %>%
  plot_ly(
    x         = ~reorder(sport, age, median),
    y         = ~age,
    color     = ~sex,
    colors    = c("F" = "#D4537E", "M" = "#185FA5"),
    type      = "box",
    boxpoints = "outliers",
    hovertemplate = "<b>%{x}</b><br>Age: %{y}<extra></extra>"
  ) %>%
  layout(
    title   = "Age distribution by sport and gender",
    xaxis   = list(title = "", tickangle = -35),
    yaxis   = list(title = "Age"),
    boxmode = "group",
    legend  = list(title = list(text = "Gender"))
  )

p1

Plot 2 — Average age over time, by season

A line chart showing whether athletes are getting older or younger across Olympic history, split by Summer vs Winter games.

age_trend <- athletes_clean %>%
  group_by(year, season) %>%
  summarise(
    mean_age = mean(age),
    se       = sd(age) / sqrt(n()),
    .groups  = "drop"
  )

p2 <- age_trend %>%
  plot_ly(
    x      = ~year,
    y      = ~mean_age,
    color  = ~season,
    colors = c("Summer" = "#BA7517", "Winter" = "#185FA5"),
    type   = "scatter",
    mode   = "lines+markers",
    error_y = list(
      type      = "data",
      array     = ~se * 1.96,
      visible   = TRUE,
      thickness = 1
    ),
    hovertemplate = "<b>%{x}</b><br>Mean age: %{y:.1f}<extra></extra>"
  ) %>%
  layout(
    title  = "Mean athlete age over time",
    xaxis  = list(title = "Year"),
    yaxis  = list(title = "Mean age (years)"),
    legend = list(title = list(text = "Season"))
  )

p2

Plot 3 — Gender participation over time

How has the share of female athletes changed since 1896?

gender_trend <- olympics %>%
  distinct(name, year, sex, season) %>%
  count(year, season, sex) %>%
  group_by(year, season) %>%
  mutate(pct = n / sum(n) * 100) %>%
  ungroup() %>%
  filter(sex == "F")

p3 <- gender_trend %>%
  plot_ly(
    x         = ~year,
    y         = ~pct,
    color     = ~season,
    colors    = c("Summer" = "#BA7517", "Winter" = "#185FA5"),
    type      = "scatter",
    mode      = "lines+markers",
    fill      = "tozeroy",
    alpha     = 0.4,
    hovertemplate = "<b>%{x}</b><br>Female athletes: %{y:.1f}%<extra></extra>"
  ) %>%
  layout(
    title  = "Percentage of female athletes over time",
    xaxis  = list(title = "Year"),
    yaxis  = list(title = "% female", range = c(0, 55)),
    legend = list(title = list(text = "Season"))
  )

p3

Plot 4 — Interactive scatter: age vs medal success by sport

Select a sport from the dropdown to see whether older or younger athletes tend to win medals.

sports_list <- athletes_clean %>%
  count(sport, sort = TRUE) %>%
  slice_head(n = 15) %>%
  pull(sport)

traces  <- list()
buttons <- list()

for (i in seq_along(sports_list)) {
  sp <- sports_list[i]
  df <- athletes_clean %>% filter(sport == sp)

  traces[[i]] <- list(
    x         = df$age,
    y         = jitter(as.numeric(factor(df$sex)), amount = 0.15),
    type      = "scatter",
    mode      = "markers",
    name      = sp,
    visible   = (i == 1),
    marker    = list(
      color   = ifelse(df$Medal_won == "Medal", "#BA7517", "#B5D4F4"),
      size    = 6,
      opacity = 0.65,
      line    = list(width = 0)
    ),
    text      = paste0("Age: ", df$age,
                       "<br>Gender: ", df$sex,
                       "<br>Year: ", df$year,
                       "<br>", df$Medal_won),
    hoverinfo = "text"
  )

  vis_vec    <- rep(FALSE, length(sports_list))
  vis_vec[i] <- TRUE
  buttons[[i]] <- list(
    method = "update",
    args   = list(list(visible = as.list(vis_vec)),
                  list(title = paste("Age vs gender —", sp))),
    label  = sp
  )
}

p4 <- plot_ly()
for (tr in traces) {
  p4 <- add_trace(p4,
    x         = tr$x,
    y         = tr$y,
    type      = tr$type,
    mode      = tr$mode,
    name      = tr$name,
    visible   = tr$visible,
    marker    = tr$marker,
    text      = tr$text,
    hoverinfo = tr$hoverinfo
  )
}

p4 <- p4 %>%
  layout(
    title  = paste("Age vs gender —", sports_list[1]),
    xaxis  = list(title = "Age"),
    yaxis  = list(
      title    = "Gender",
      tickvals = c(1, 2),
      ticktext = c("F", "M")
    ),
    updatemenus = list(list(
      type     = "dropdown",
      buttons  = buttons,
      x        = 0.01,
      xanchor  = "left",
      y        = 1.15,
      yanchor  = "top"
    )),
    legend = list(title = list(text = ""))
  )

p4