Tokyo 2020 Olympic Swimming Performance Analysis using SwimmeR and Regression Modelling

Vignesh Venkatesh
2025-11-03

Overview

This document provides a reproducible, step-by-step analysis of Tokyo 2020 Olympic swimming events using the SwimmeR package in R. It is designed for complete novices: anyone with the R environment and the Tokyo 2020 swimming data folder can follow along.

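You will learn to:

- Import the official result PDFs with SwimmeR
- Clean the parsed results (remove relays and swimmers without a valid result)
- Convert times to seconds and remove outliers
- Visualize time distributions by event and gender
- Fit and evaluate simple and multiple linear regression models
- Make example predictions and visualize an athlete's splits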

Prerequisites

Ensure the working directory contains the following folder structure:

Tokyo 2020 swimming data/
 ├── Men's individual/
 │    ├── SWMM100MFR.pdf
 │    └── ...
 ├── Women's individual/
 │    ├── SWMW100MFR.pdf
 │    └── ...

Load libraries

library(SwimmeR)
library(dplyr)
library(ggplot2)
library(stringr)
library(tidyr)
library(caret)
library(Metrics)

1. Import Data

We first list all PDF files for men and women separately.

# Root folder holding the Tokyo 2020 result PDFs, relative to the working
# directory.
# NOTE(review): this path nests "Tokyo 2020 swimming data" twice, whereas the
# Prerequisites section shows a single level -- adjust it to your local layout.
base_dir <- "Tokyo 2020 swimming data/Tokyo 2020 swimming data/"
men_dir <- file.path(base_dir, "Men's individual")
women_dir <- file.path(base_dir, "Women's individual")

# list.files() silently returns character(0) for a folder that does not
# exist, so stop early with a clear message instead of building an empty
# dataset that only fails further down.
for (results_dir in c(men_dir, women_dir)) {
  if (!dir.exists(results_dir)) {
    stop("Results folder not found: ", results_dir, call. = FALSE)
  }
}

men_pdfs <- list.files(path = men_dir, pattern = "\\.pdf$", full.names = TRUE)
women_pdfs <- list.files(path = women_dir, pattern = "\\.pdf$", full.names = TRUE)

# Quick sanity check
length(men_pdfs)  # Number of men's PDFs
## [1] 33
length(women_pdfs)  # Number of women's PDFs
## [1] 31

Inference: We now have access to all PDFs; each will be processed in the next step.

2. Clean and Process Data

Relay events and swimmers without a valid result (DQ/DSQ, DNS, DNF, NS) are removed.

# Parse a set of result PDFs into one cleaned data frame.
#
# Each file is read with read_results() and parsed with swim_parse(). Rows
# from relay events and rows whose Place matches a non-finisher code
# (DQ, DSQ, DNS, DNF, NS) are removed.
#
# Args:
#   pdf_list: character vector of paths to result PDFs.
#
# Returns:
#   A data frame with the cleaned rows of every file, in input order. The
#   logical helper column `Relay` is kept in the result.
process_swimming_pdfs <- function(pdf_list) {
  parse_one <- function(file) {
    message("Processing: ", file)
    read_results(file) %>%
      swim_parse() %>%
      mutate(Relay = str_detect(Event, regex("Relay", ignore_case = TRUE))) %>%
      # filter() keeps only rows where the condition is TRUE, so rows with a
      # missing Event or Place are dropped here as well.
      filter(!Relay, !str_detect(Place, "DQ|DSQ|DNS|DNF|NS"))
  }

  # Collect the per-file results first and bind them once, rather than
  # re-copying a growing data frame on every iteration.
  per_file <- lapply(pdf_list, parse_one)

  # The leading empty data.frame() keeps the result a plain data frame, also
  # when `pdf_list` is empty.
  bind_rows(c(list(data.frame()), per_file))
}

# Parse each gender's PDFs separately and tag the rows before stacking them.
mens_individual_df <- mutate(
  process_swimming_pdfs(men_pdfs),
  Gender = "Men"
)
womens_individual_df <- mutate(
  process_swimming_pdfs(women_pdfs),
  Gender = "Women"
)

# One table holding every individual-event result.
combined_df <- bind_rows(mens_individual_df, womens_individual_df)

Inference: We now have a combined clean dataset, ready for analysis.

3. Convert Times and Remove Outliers

# Convert the `Finals` column with sec_format() into `Time_sec` and drop rows
# without a time. Then, within each Event/Gender group, keep only the times
# that lie within 1.5 * IQR of the quartiles. The helper columns Q1, Q3 and
# IQR remain in the data frame afterwards.
combined_df <- combined_df %>%
  mutate(Time_sec = sec_format(Finals)) %>%
  filter(!is.na(Time_sec)) %>%
  group_by(Event, Gender) %>%
  mutate(
    Q1 = quantile(Time_sec, probs = 0.25),
    Q3 = quantile(Time_sec, probs = 0.75),
    IQR = Q3 - Q1
  ) %>%
  filter(Time_sec >= Q1 - 1.5 * IQR & Time_sec <= Q3 + 1.5 * IQR) %>%
  ungroup()

# Order events by median time (fastest to slowest)
event_order <- combined_df %>%
  group_by(Event) %>%
  summarise(median_time = median(Time_sec, na.rm = TRUE)) %>%
  arrange(median_time) %>%
  pull(Event)

# Store Event as a factor so that plots follow the median-time ordering.
combined_df <- combined_df %>%
  mutate(Event = factor(Event, levels = event_order))

Inference: Outliers removed using IQR, events are now ordered for meaningful visualization.

4. Exploratory Visualizations

Violin Plot

# Violins show the distribution of times per event and gender; the narrow
# boxplots overlay the median and quartiles (outlier points are hidden).
ggplot(
  data = combined_df,
  mapping = aes(x = Event, y = Time_sec, fill = Gender)
) +
  geom_violin(alpha = 0.8, scale = "width", trim = FALSE) +
  geom_boxplot(
    alpha = 0.3,
    outlier.shape = NA,
    position = position_dodge(width = 0.9),
    width = 0.1
  ) +
  coord_flip() +
  labs(
    x = "Swimming Event",
    y = "Final Time (seconds)",
    fill = "Gender",
    title = "Distribution of Final Times by Event and Gender",
    subtitle = "Tokyo 2020 Olympic Swimming Events"
  ) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 9))

Inference: Violin plots visualize the distribution of final times per event. Boxplots show median and IQR.

Jitter Plot

# One point per swim, jittered along the event axis to reduce overplotting.
ggplot(
  data = combined_df,
  mapping = aes(x = Event, y = Time_sec, color = Gender)
) +
  geom_jitter(alpha = 0.6, width = 0.2) +
  labs(
    x = "Event",
    y = "Final Time (sec)",
    color = "Gender",
    title = "Individual Final Times by Event and Gender"
  ) +
  coord_flip() +
  theme_minimal()

Histogram of 100m Freestyle

# Histograms of the two 100m Freestyle events, one facet per event.
ggplot(
  data = filter(
    combined_df,
    Event %in% c("Men's 100m Freestyle", "Women's 100m Freestyle")
  ),
  mapping = aes(x = Time_sec, fill = Gender)
) +
  geom_histogram(position = "identity", bins = 20, alpha = 0.6) +
  facet_wrap(~ Event) +
  labs(
    x = "Final Time (sec)",
    y = "Count",
    fill = "Gender",
    title = "Histogram of Final Times for 100m Freestyle by Gender"
  ) +
  theme_minimal()

Inference: Histograms reveal spread of performances, highlighting gender differences and event characteristics.

5. Event-Specific Linear Regression

We examine Women’s 400m and Men’s 1500m Freestyle to explore the effect of reaction time.

# Reaction time and final time for the two events under study.
# Reaction_Time is converted to numeric BEFORE the NA filter so that values
# which fail to parse are removed too; filtering first would let them reach
# lm() and the error metrics as NA.
women_400_free <- combined_df %>%
  filter(Event == "Women's 400m Freestyle") %>%
  mutate(Reaction_Time = as.numeric(as.character(Reaction_Time))) %>%
  filter(!is.na(Reaction_Time), !is.na(Time_sec)) %>%
  select(Reaction_Time, Time_sec)

men_1500_free <- combined_df %>%
  filter(Event == "Men's 1500m Freestyle") %>%
  mutate(Reaction_Time = as.numeric(as.character(Reaction_Time))) %>%
  filter(!is.na(Reaction_Time), !is.na(Time_sec)) %>%
  select(Reaction_Time, Time_sec)

Linear Regression and Evaluation

# Fix the random seed so the train/test splits below are reproducible.
set.seed(123)

# Randomly split a data frame into training (about 70%) and test rows.
#
# Args:
#   data: data frame to split.
#
# Returns:
#   A list with elements `train` and `test`. When `data` has fewer than two
#   rows it cannot be split, and the same data is returned for both.
split_data <- function(data) {
  n_rows <- nrow(data)
  if (n_rows < 2) {
    return(list(train = data, test = data))
  }

  picked <- sample.int(n_rows, size = round(0.7 * n_rows))
  held_out <- !(seq_len(n_rows) %in% picked)

  list(train = data[picked, ], test = data[held_out, ])
}

# Split each event's data into train/test partitions. The order of these two
# calls matters for reproducibility because both draw from the same random
# number stream.
women_split <- split_data(women_400_free)
men_split <- split_data(men_1500_free)

# Simple linear regression of final time on reaction time, fitted on the
# training partition of each event.
women_model <- lm(Time_sec ~ Reaction_Time, data = women_split$train)
men_model <- lm(Time_sec ~ Reaction_Time, data = men_split$train)

# Compute error metrics for a fitted model on its train and test data.
#
# Args:
#   model: fitted model accepted by predict() and summary().
#   train: data frame the model was fitted on; must contain `Time_sec`.
#   test:  held-out data frame; must contain `Time_sec`.
#
# Returns:
#   A list with Train_MAE, Test_MAE, Train_RMSE, Test_RMSE (NA when the
#   corresponding data frame has no rows) and R_squared taken from
#   summary(model).
evaluate_model <- function(model, train, test) {
  # Apply `metric` to observed vs. predicted times, or give NA for empty data.
  score <- function(metric, observed, predicted) {
    if (nrow(observed) > 0) metric(observed$Time_sec, predicted) else NA
  }

  fitted_train <- predict(model, train)
  fitted_test <- predict(model, test)

  list(
    Train_MAE = score(mae, train, fitted_train),
    Test_MAE = score(mae, test, fitted_test),
    Train_RMSE = score(rmse, train, fitted_train),
    Test_RMSE = score(rmse, test, fitted_test),
    R_squared = summary(model)$r.squared
  )
}

# Score both event models on their own train/test partitions.
women_metrics <- evaluate_model(
  model = women_model,
  train = women_split$train,
  test = women_split$test
)
men_metrics <- evaluate_model(
  model = men_model,
  train = men_split$train,
  test = men_split$test
)

writeLines("400m Women's Freestyle Metrics")
## 400m Women's Freestyle Metrics
print(women_metrics)
## $Train_MAE
## [1] 3.257236
## 
## $Test_MAE
## [1] 5.114311
## 
## $Train_RMSE
## [1] 4.274523
## 
## $Test_RMSE
## [1] 6.464109
## 
## $R_squared
## [1] 0.03511878
writeLines(c("", "1500m Men's Freestyle Metrics"))
## 
## 1500m Men's Freestyle Metrics
print(men_metrics)
## $Train_MAE
## [1] 9.150675
## 
## $Test_MAE
## [1] 11.20515
## 
## $Train_RMSE
## [1] 11.58942
## 
## $Test_RMSE
## [1] 13.43432
## 
## $R_squared
## [1] 0.01843707

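Inference: Reaction time is a very weak predictor of final time in these two distance events: R² is about 0.04 for the Women's 400m Freestyle and about 0.02 for the Men's 1500m Freestyle, and the test errors are larger than the training errors. MAE, RMSE, and R² are reported as the model evaluation metrics.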

6. Multiple Regression Across All Events

# Rebuild the dataset from the per-gender tables (no IQR trimming is applied
# here) and derive the modelling features.
full_individual_df <- bind_rows(mens_individual_df, womens_individual_df) %>%
  mutate(Time_sec = sec_format(Finals)) %>%
  filter(!is.na(Time_sec)) %>%
  mutate(
    # First run of digits in the event name, e.g. 100 in "Men's 100m Freestyle".
    Distance = as.numeric(str_extract(Event, "\\d+")),
    Stroke = case_when(
      str_detect(Event, "Freestyle") ~ "Freestyle",
      str_detect(Event, "Backstroke") ~ "Backstroke",
      str_detect(Event, "Breaststroke") ~ "Breaststroke",
      str_detect(Event, "Butterfly") ~ "Butterfly",
      str_detect(Event, "Medley") ~ "Medley",
      TRUE ~ "Other"
    ),
    # Average time per 50 m of the race.
    Mean_Split = Time_sec / (Distance / 50)
  ) %>%
  filter(!is.na(Distance), !is.na(Stroke), !is.na(Mean_Split))

# Reproducible 70/30 train/test split.
set.seed(123)
n <- nrow(full_individual_df)
train_indices <- sample(seq_len(n), size = round(0.7 * n))
train_df <- full_individual_df[train_indices, ]
test_df <- full_individual_df[-train_indices, ]

# Take the factor levels from the full dataset so that train and test are
# coded identically, instead of each partition deriving its own levels.
gender_levels <- sort(unique(full_individual_df$Gender))
stroke_levels <- sort(unique(full_individual_df$Stroke))
train_df$Gender <- factor(train_df$Gender, levels = gender_levels)
test_df$Gender <- factor(test_df$Gender, levels = gender_levels)
train_df$Stroke <- factor(train_df$Stroke, levels = stroke_levels)
test_df$Stroke <- factor(test_df$Stroke, levels = stroke_levels)

# Multiple linear regression of the mean 50 m split on gender, distance and
# stroke.
mvmod <- lm(Mean_Split ~ Gender + Distance + Stroke, data = train_df)
summary(mvmod)
## 
## Call:
## lm(formula = Mean_Split ~ Gender + Distance + Stroke, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.267  -1.223   0.160   1.301  12.558 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        27.5260764  0.1990464 138.290  < 2e-16 ***
## GenderWomen         2.8654513  0.1443893  19.845  < 2e-16 ***
## Distance            0.0042584  0.0002177  19.564  < 2e-16 ***
## StrokeBreaststroke  3.4196101  0.2469894  13.845  < 2e-16 ***
## StrokeButterfly    -0.8851302  0.2612458  -3.388 0.000732 ***
## StrokeFreestyle    -2.9706668  0.2134983 -13.914  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.254 on 989 degrees of freedom
## Multiple R-squared:  0.6208, Adjusted R-squared:  0.6189 
## F-statistic: 323.8 on 5 and 989 DF,  p-value: < 2.2e-16
train_pred <- predict(mvmod, train_df)
test_pred <- predict(mvmod, test_df)

# Same metrics as for the single-event models; R_squared is the training fit.
mvmod_metrics <- list(
  Train_MAE = mae(train_df$Mean_Split, train_pred),
  Test_MAE = mae(test_df$Mean_Split, test_pred),
  Train_RMSE = rmse(train_df$Mean_Split, train_pred),
  Test_RMSE = rmse(test_df$Mean_Split, test_pred),
  R_squared = summary(mvmod)$r.squared
)
print(mvmod_metrics)
## $Train_MAE
## [1] 1.512097
## 
## $Test_MAE
## [1] 1.570649
## 
## $Train_RMSE
## [1] 2.2472
## 
## $Test_RMSE
## [1] 2.484317
## 
## $R_squared
## [1] 0.6208042

Inference: Multiple regression identifies how gender, stroke type, and distance influence mean split times across all events.

7. Example Predictions

# Hypothetical races to score with the multiple regression model.
predict_100m_butterfly <- data.frame(
  Gender = "Men",
  Distance = 100,
  Stroke = "Butterfly"
)
# NOTE(review): a 5000 m race is presumably far outside the distances the
# model was trained on, so this prediction is an extrapolation -- confirm
# before relying on it.
predict_5000m_freestyle <- data.frame(
  Gender = "Women",
  Distance = 5000,
  Stroke = "Freestyle"
)

# Predicted mean 50 m split, rounded to two decimals.
pred_butterfly_100m <- predict(mvmod, newdata = predict_100m_butterfly) %>%
  round(digits = 2)
pred_freestyle_5000m <- predict(mvmod, newdata = predict_5000m_freestyle) %>%
  round(digits = 2)

cat("Predicted mean split time for Men's 100m Butterfly: ", pred_butterfly_100m, "seconds\n")
## Predicted mean split time for Men's 100m Butterfly:  27.07 seconds
cat("Predicted mean split time for Women's 5000m Freestyle: ", pred_freestyle_5000m, "seconds\n")
## Predicted mean split time for Women's 5000m Freestyle:  48.71 seconds

8. Athlete Split Visualization — Ariarne Titmus

# Cumulative times (seconds) at each 50 m mark for Titmus and her closest
# competitor, entered by hand. Diff < 0 means Titmus is ahead at that mark.
# NOTE(review): the heats competitor name and the heats splits are hard-coded;
# verify them against the official result PDF.
finals_data <- data.frame(
  Split_m = c(50, 100, 150, 200, 250, 300, 350, 400),
  Titmus_time = c(27.88, 57.74, 87.83, 118.10, 148.12, 178.27, 208.02, 236.69),
  Closest_time = c(28.01, 57.67, 87.34, 117.44, 147.46, 178.11, 208.24, 237.36),
  Competitor = "Katie Ledecky",
  Round = "Final"
) %>%
  mutate(Diff = Titmus_time - Closest_time)

heats_data <- data.frame(
  Split_m = c(50, 100, 150, 200, 250, 300, 350, 400),
  Titmus_time = c(27.48, 57.77, 88.30, 118.99, 149.40, 179.86, 210.53, 240.24),
  Closest_time = c(27.87, 58.47, 89.34, 120.54, 151.76, 182.98, 214.88, 246.11),
  Competitor = "Tamsin Fairweather",
  Round = "Heats"
) %>%
  mutate(Diff = Titmus_time - Closest_time)

plot_data <- bind_rows(heats_data, finals_data)

ggplot(plot_data, aes(x = Split_m, y = Diff, color = Round, label = Competitor)) +
  # `linewidth` replaces `size` for line geoms (ggplot2 >= 3.4.0), where
  # `size` on lines raises a deprecation warning.
  geom_line(linewidth = 1.2) +
  geom_point(size = 3) +
  # Zero line: points below it are splits where Titmus is leading.
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray") +
  geom_text(vjust = -1, size = 3) +
  labs(
    title = "Ariarne Titmus’ Lead or Lag vs. Closest Competitor at Each 50m Split",
    subtitle = "Tokyo 2020 Women's 400m Freestyle",
    x = "Distance (m)",
    y = "Time Difference (seconds, negative = leading)",
    color = "Round"
  ) +
  theme_minimal()

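Inference: The split visualization shows where Titmus led or trailed her closest competitor. In the heats she led at every split and steadily extended her margin; in the final she trailed Ledecky from 100 m to 300 m before moving ahead over the last 100 m.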

Conclusion

The analysis of Tokyo 2020 Olympic swimming results revealed several key insights:

Overall, the data-driven analysis underscores that race distance, stroke type, and split management are key determinants of Olympic swimming outcomes, whereas reaction time on its own explained very little of the variation in final times for the two distance events examined (R² below 0.04).