Code Transparency Statement This document contains the complete analysis code for the development of a clinical risk prediction tool for atrial fibrillation (AF) in endurance athletes using multimodal atrial myopathy markers. All code is provided in the interest of transparency, reproducibility, and to facilitate independent verification of our findings. Researchers are welcome to use and adapt this code for their own studies.

Corresponding Author: Professor Andre La Gerche:

Citation: Spencer LW, D’Ambrosio P, et al. Using atrial myopathy metrics to devise a risk prediction tool for atrial fibrillation in endurance athletes. Manuscript in submission for European Heart Journal.

AF risk prediction model: We have developed a AF risk prediction tool available at: https://heart-lab.org/atrial-fibrillation

1 Analysis Setup and Data Preparation

1.1 R packages

# Data manipulation
library(tidyverse)
library(dplyr)
library(readr)

# Statistical analysis
library(tableone)
library(broom)
library(pROC)
library(ResourceSelection)

# Visualisation
library(ggplot2)
library(ggpubr)
library(gridExtra)
library(scales)
library(patchwork)
library(viridis)
library(fmsb)

# Output
library(openxlsx)
library(writexl)
library(knitr)
library(kableExtra)

# Set options
options(scipen = 999)  # To avoid scientific notation
set.seed(123)  # Reproducibility

1.2 Filter and Create Analysis Cohorts

# Remove Atrial Myopathy related pathology
NoCM <- filter(AM_paper, `AM related pathology` != 1)

# Main athlete cohort (excluding young/middle-age controls)
Athletes_NoCM <- filter(NoCM, 
                        Cohort != "young control" & 
                        Cohort != "middle age control")

# Age groups and AF status variables
Athletes_NoCM <- Athletes_NoCM %>%
  mutate(
    Age = as.numeric(Age),
    AgeGroup = factor(ifelse(Age <= 45, "≤45", ">45"), 
                     levels = c("≤45", ">45")),
    AFgroup = factor(ifelse(AF == 1, "AF+", "AF-"), 
                    levels = c("AF-", "AF+")),
    AF_numeric = as.numeric(AF)
  )

# Control and athlete groups for supplementary analysis
controls <- filter(NoCM, EICR == "Control")
athletes <- filter(NoCM, EICR == "Athlete")

# Cohort summary table
cohort_summary <- data.frame(
  Group = c("Athletes (excluding cardiomyopathy)", 
            "  AF+", 
            "  AF-",
            "  Age ≤45 years",
            "  Age >45 years",
            "Controls"),
  N = c(
    nrow(Athletes_NoCM),
    sum(Athletes_NoCM$AF == 1, na.rm = TRUE),
    sum(Athletes_NoCM$AF == 0, na.rm = TRUE),
    sum(Athletes_NoCM$AgeGroup == "≤45", na.rm = TRUE),
    sum(Athletes_NoCM$AgeGroup == ">45", na.rm = TRUE),
    nrow(controls)
  )
)

kable(cohort_summary,
      col.names = c("Cohort Description", "N"),
      align = c("l", "c")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), 
                full_width = FALSE)
Cohort Description N
Athletes (excluding cardiomyopathy) 353
AF+ 96
AF- 257
Age ≤45 years 201
Age >45 years 152
Controls 97

1.3 Variable Definitions

# Table 1 variables
table1_vars <- c(
  # Demographics
  "Age", "sex", "bmi", "Bsa", "averagehr1", "minhr1", "brad_burden_pct1",
  "htn", "rest_SBP", "rest_DBP", "high_cholesterol", "diabetes",
  "ex_smoker", "etoh sd/wk",
  # Medications
  "bblocker", "nondihydropyridine ccb", "flec", "sotalol", "amio",
  # Exercise
  "vo2_kg_peak", "vo2_absolute", "Total MET hrs/wk", "Exercise Duration (yr)",
  # Sport
   "Rowing", "Cycling", "Running", "Other"
)

# Table 2 variables
table2_vars <- c(
  # LV
  "enddiastolic_volume_ml_lv", "endsystolic_volume_ml_lv", 
  "lvef_bip_q", "g_peak_slavg_",
    # LA
    "laesv_bip_ml", "LAVi", "la/lv",
    "LA Resevoir Strain", "LA Conduit Strain", "LA Contractile Strain",
    # RV
    "RVEDV", "RVESV", "RVEF",
    # Doppler
    "MV E Vel (m/s)", "MV A Vel (m/s)", "MV E/A Ratio", 
    "Sept E' (cm/s)", "E/E'",
    # ECG
    "P wave Duration", "P Wave Axis", "PTFV1 (ms*mV)", "Adv IAB",
    "ECTOPIES LS",
    # Biomarkers
    "bnp", "troponin1",
    # Genetic
    "AF centile", "highest quartile", "highest decile"
  )
  
  #Sport
  Athletes_NoCM <- Athletes_NoCM %>%
    mutate(
      Rowing = as.numeric(`Sport: 1=row, 2=cyc, 3=run, 4=CCS, 5=swim, 6=tri, 7=other` == 1),
      Cycling = as.numeric(`Sport: 1=row, 2=cyc, 3=run, 4=CCS, 5=swim, 6=tri, 7=other` == 2),
      Running = as.numeric(`Sport: 1=row, 2=cyc, 3=run, 4=CCS, 5=swim, 6=tri, 7=other` == 3),
      Other = as.numeric(`Sport: 1=row, 2=cyc, 3=run, 4=CCS, 5=swim, 6=tri, 7=other` %in% c(4, 5, 6, 7))
    )
  
  # Supplementary variables
  supp_vars <- c(
    "% predicted vo2", "peak rer",
    "LAVi > 34", "LAVi > 48", "LAVi > 60",
    "Prevalence of P-wave duration >120 ms", "High PTFV1",
    "Prevalence of >=100 PACs/24 h",
    "AF PRS 6238158var PGS000016", "highest quintile", "98th percentile",
    "Rare Variant"
  )
  
  all_variables <- unique(c(table1_vars, table2_vars, supp_vars))
  all_variables <- all_variables[all_variables %in% names(Athletes_NoCM)]
  
  # Non-normal distributions
  nonnormal_vars <- c("Age", "bmi", "Bsa", "rest_SBP", "rest_DBP", "minhr1",
                      "brad_burden_pct1", "Exercise Duration (yr)", "Total MET hrs/wk",
                      "laesv_bip_ml", "LAVi", "la/lv", "P wave Duration", "PTFV1 (ms*mV)",
                      "ECTOPIES LS", "bnp", "troponin1", "AF centile", 
                      "AF PRS 6238158var PGS000016", "etoh sd/wk")
  
  # Categorical variables
  categorical_vars <- c("sex", "htn", "high_cholesterol", "diabetes", "ex_smoker",
                        "bblocker", "nondihydropyridine ccb", "flec", "sotalol", "amio",
                        "Rowing", "Cycling", "Running", "Other", 
                        "LAVi > 34", "LAVi > 48", "LAVi > 60",
                        "Prevalence of P-wave duration >120 ms", "High PTFV1",
                        "Prevalence of >=100 PACs/24 h",
                        "Adv IAB", "highest quartile", "highest quintile", 
                        "highest decile", "98th percentile", "Rare Variant")

1.4 Statistical Functions

  calculate_p_value <- function(data, var, group_var, is_categorical = FALSE, is_nonnormal = NULL) {
    if (!var %in% names(data) || !group_var %in% names(data)) return(NA)
    
    clean_data <- data %>% filter(!is.na(.data[[var]]) & !is.na(.data[[group_var]]))
    if (nrow(clean_data) < 3) return(NA)
    
    group_levels <- unique(clean_data[[group_var]])
    if (length(group_levels) < 2) return(NA)
    
    tryCatch({
      if (is_categorical) {
        tab <- table(as.factor(clean_data[[var]]), as.factor(clean_data[[group_var]]))
        if (any(dim(tab) < 2) || sum(tab) < 5) return(NA)
        
        if (any(tab < 5)) {
          test_result <- fisher.test(tab, simulate.p.value = TRUE)
        } else {
          test_result <- chisq.test(tab)
        }
        return(test_result$p.value)
      } else {
        var_numeric <- as.numeric(clean_data[[var]])
        if (all(is.na(var_numeric))) return(NA)
        
        if (is.null(is_nonnormal)) {
          n <- length(var_numeric[!is.na(var_numeric)])
          if (n >= 3 && n <= 5000) {
            shapiro_result <- tryCatch(shapiro.test(var_numeric),
                                      error = function(e) list(p.value = 0.049))
            is_nonnormal <- shapiro_result$p.value < 0.05
          } else {
            is_nonnormal <- TRUE
          }
        }
        
        if (is_nonnormal) {
          test_result <- wilcox.test(var_numeric ~ clean_data[[group_var]])
        } else {
          test_result <- t.test(var_numeric ~ clean_data[[group_var]])
        }
        return(test_result$p.value)
      }
    }, error = function(e) NA)
  }
  
  calculate_interaction_p <- function(data, var, is_categorical = FALSE) {
    required_vars <- c(var, "AF_numeric", "AgeGroup")
    if (!all(required_vars %in% names(data))) return(NA)
    
    clean_data <- data %>% 
      filter(!is.na(.data[[var]]) & !is.na(AF_numeric) & !is.na(AgeGroup))
    if (nrow(clean_data) < 20) return(NA)
    
    tryCatch({
      if (is_categorical) {
        model <- glm(as.factor(clean_data[[var]]) ~ AgeGroup * AFgroup,
                    data = clean_data, family = binomial())
      } else {
        model <- lm(as.numeric(clean_data[[var]]) ~ AgeGroup * AFgroup, 
                   data = clean_data)
      }
      
      anova_result <- anova(model)
      int_row <- grep(":", rownames(anova_result))
      if (length(int_row) > 0) {
        p_col <- which(names(anova_result) %in% c("Pr(>F)", "Pr(>Chi)"))
        if (length(p_col) > 0) return(anova_result[int_row[1], p_col[1]])
      }
      return(NA)
    }, error = function(e) NA)
  }
  
  format_stats <- function(data, var, is_cat, is_nonnorm) {
    if (!var %in% names(data) || nrow(data) == 0) return("--")
    
    tryCatch({
      if (is_cat) {
        var_data <- data[[var]]
        
        if (is.logical(var_data)) {
          count <- sum(var_data == TRUE, na.rm = TRUE)
          total <- sum(!is.na(var_data))
        } else if (is.numeric(var_data) && all(var_data %in% c(0, 1, NA), na.rm = TRUE)) {
          count <- sum(var_data == 1, na.rm = TRUE)
          total <- sum(!is.na(var_data))
        } else {
          tab <- table(var_data, useNA = "no")
          if (length(tab) > 0) {
            count <- max(tab)
            total <- sum(tab)
          } else return("0 (0%)")
        }
        
        if (total == 0) return("0 (0%)")
        return(paste0(count, " (", round(100*count/total, 1), "%)"))
      } else {
        values <- as.numeric(data[[var]])
        values <- values[!is.na(values)]
        if (length(values) == 0) return("--")
        
        decimals <- ifelse(var == "la/lv", 2, 1)
        
        if (is_nonnorm) {
          med <- median(values)
          q1 <- quantile(values, 0.25)
          q3 <- quantile(values, 0.75)
          return(paste0(round(med, decimals), " [", 
                       round(q1, decimals), "-", round(q3, decimals), "]"))
        } else {
          m <- mean(values)
          s <- sd(values)
          return(paste0(round(m, decimals), " ± ", round(s, decimals)))
        }
      }
    }, error = function(e) "--")
  }
  
  format_p_value <- function(p) {
    if (is.null(p) || length(p) == 0 || is.na(p)) return("--")
    p_num <- as.numeric(p)
    if (is.na(p_num)) return("--")
    if (p_num < 0.001) return("<0.001")
    if (p_num < 0.01) return(sprintf("%.3f", p_num))
    return(sprintf("%.3f", p_num))
  }
  
  calculate_sport_p_value <- function(data, group_var) {
    sport_var <- "Sport: 1=row, 2=cyc, 3=run, 4=CCS, 5=swim, 6=tri, 7=other"
    
    if (!sport_var %in% names(data) || !group_var %in% names(data)) return(NA)
    
    clean_data <- data %>% 
      filter(!is.na(.data[[sport_var]]) & !is.na(.data[[group_var]]))
    
    if (nrow(clean_data) < 3) return(NA)
    
    tryCatch({
      
      clean_data$sport_collapsed <- ifelse(
        clean_data[[sport_var]] %in% c(4, 5, 6, 7), 
        4, 
        clean_data[[sport_var]]
      )
      
      tab <- table(clean_data$sport_collapsed, clean_data[[group_var]])
      
      if (any(dim(tab) < 2)) return(NA)
      
      if (any(tab < 5)) {
        test_result <- fisher.test(tab, simulate.p.value = TRUE)
      } else {
        test_result <- chisq.test(tab)
      }
      return(test_result$p.value)
    }, error = function(e) NA)
  }

1.5 Helper Functions

  assign_category <- function(var) {
    # Table 1 categories
    if (var %in% c("Age", "sex", "bmi", "Bsa", "averagehr1", "minhr1", 
                   "brad_burden_pct1", "htn", "rest_SBP", "rest_DBP",
                   "high_cholesterol", "diabetes", "ex_smoker", "etoh sd/wk")) {
      return("Demographics")
    } else if (var %in% c("bblocker", "nondihydropyridine ccb", "flec", 
                          "sotalol", "amio")) {
      return("Anti-arrhythmic Medication use, n (%)")
    } else if (var %in% c("vo2_kg_peak", "vo2_absolute", "Total MET hrs/wk", 
                          "Exercise Duration (yr)")) {
      return("Exercise Capacity")
    } else if (var == "Sport: 1=row, 2=cyc, 3=run, 4=CCS, 5=swim, 6=tri, 7=other" ||
               var %in% c("Rowing", "Cycling", "Running", "Other")) {
      return("Primary Sport, n (%)")
    } else if (var %in% c("enddiastolic_volume_ml_lv", "endsystolic_volume_ml_lv",
                          "lvef_bip_q", "g_peak_slavg_")) {
      return("Left ventricular parameters")
    } else if (var %in% c("laesv_bip_ml", "LAVi", "la/lv", "LAVi > 34", 
                          "LAVi > 48", "LAVi > 60",
                          "LA Resevoir Strain", "LA Conduit Strain", 
                          "LA Contractile Strain")) {
      return("Left atrial parameters")
    } else if (var %in% c("RVEDV", "RVESV", "RVEF")) {
      return("Right Ventricular Parameters")
    } else if (var %in% c("MV E Vel (m/s)", "MV A Vel (m/s)", "MV E/A Ratio",
                          "Sept E' (cm/s)", "E/E'")) {
      return("Doppler Parameters")
    } else if (var %in% c("P wave Duration", "P Wave Axis", "PTFV1 (ms*mV)",
                          "Prevalence of P-wave duration >120 ms", "High PTFV1",
                          "Adv IAB", "ECTOPIES LS")) {
      return("Electrocardiographic")
    } else if (var %in% c("bnp", "troponin1")) {
      return("Biochemical")
    } else if (var %in% c("AF centile", "AF PRS 6238158var PGS000016",
                          "highest quartile", "highest quintile", "highest decile",
                          "98th percentile", "Rare Variant")) {
      return("Genetic")
    } else {
      return("Other")
    }
  }
  
  clean_variable_names <- function(var) {
    case_when(
      var == "Age" ~ "Age, years",
      var == "sex" ~ "Male sex, n (%)",
      var == "bmi" ~ "BMI, kg/m²",
      var == "Bsa" ~ "BSA, m²",
      var == "averagehr1" ~ "Average HR, bpm",
      var == "minhr1" ~ "Minimum HR, bpm",
      var == "brad_burden_pct1" ~ "Bradycardia burden, % (24h)",
      var == "htn" ~ "Hypertension, n (%)",
      var == "rest_SBP" ~ "SBP, mmHg",
      var == "rest_DBP" ~ "DBP, mmHg",
      var == "high_cholesterol" ~ "Dyslipidaemia, n (%)",
      var == "diabetes" ~ "Diabetes, n (%)",
      var == "ex_smoker" ~ "History of smoking, n (%)",
      var == "etoh sd/wk" ~ "Alcohol, drinks/week",
      var == "bblocker" ~ "Beta-blockers",
      var == "nondihydropyridine ccb" ~ "Non-dihydropyridine CCB's",
      var == "flec" ~ "Flecainide",
      var == "sotalol" ~ "Sotalol",
      var == "amio" ~ "Amiodarone",
      var == "vo2_kg_peak" ~ "VO₂peak, mL/kg/min",
      var == "vo2_absolute" ~ "VO₂peak, L/min",
      var == "% predicted vo2" ~ "% of predicted exercise capacity",
      var == "peak rer" ~ "Respiratory exchange ratio",
      var == "Total MET hrs/wk" ~ "Total weekly MET-hours",
      var == "Exercise Duration (yr)" ~ "Yrs of endurance training",
      var == "Rowing" ~ "Rowing",
      var == "Cycling" ~ "Cycling",
      var == "Running" ~ "Running",
      var == "Other" ~ "Other",
      var == "Sport: 1=row, 2=cyc, 3=run, 4=CCS, 5=swim, 6=tri, 7=other" ~ "Primary Sport, n (%)",
      var == "enddiastolic_volume_ml_lv" ~ "LVEDV, mL",
      var == "endsystolic_volume_ml_lv" ~ "LVESV, mL",
      var == "lvef_bip_q" ~ "LVEF, %",
      var == "g_peak_slavg_" ~ "LV strain, %",
      var == "laesv_bip_ml" ~ "LAESV, mL",
      var == "LAVi" ~ "LAVi, mL/m²",
      var == "LAVi > 34" ~ "Mildly abnormal >34 mL/m², n (%)",
      var == "LAVi > 48" ~ "Severely abnormal >48 mL/m², n (%)",
      var == "LAVi > 60" ~ "Extremely abnormal >60 mL/m², n (%)",
      var == "la/lv" ~ "LA:LV",
      var == "LA Resevoir Strain" ~ "Reservoir strain, %",
      var == "LA Conduit Strain" ~ "Conduit strain, %",
      var == "LA Contractile Strain" ~ "Contractile strain, %",
      var == "RVEDV" ~ "RVEDV, mL",
      var == "RVESV" ~ "RVESV, mL",
      var == "RVEF" ~ "RVEF, %",
      var == "MV E Vel (m/s)" ~ "E velocity, m/s",
      var == "MV A Vel (m/s)" ~ "A velocity, m/s",
      var == "MV E/A Ratio" ~ "E/A ratio",
      var == "Sept E' (cm/s)" ~ "E′ velocity, cm/s",
      var == "E/E'" ~ "E/E' ratio",
      var == "P wave Duration" ~ "P wave duration, ms",
      var == "Prevalence of P-wave duration >120 ms" ~ "P-wave duration >120 ms, n (%)",
      var == "P Wave Axis" ~ "P wave axis, °",
      var == "PTFV1 (ms*mV)" ~ "PTFV₁, mV·ms",
      var == "High PTFV1" ~ "PTFV₁ ≥4 mV·ms, n (%)",
      var == "Adv IAB" ~ "Advanced IAB, n (%)",
      var == "ECTOPIES LS" ~ "Atrial ectopy, per 24 h",
      var == "Prevalence of >=100 PACs/24 h" ~ "≥100 PACs per 24 h, n (%)",
      var == "bnp" ~ "BNP, pg/mL",
      var == "troponin1" ~ "cTnI, ng/L",
      var == "AF centile" ~ "AF-PRS percentile",
      var == "AF PRS 6238158var PGS000016" ~ "AF-PRS",
      var == "highest quartile" ~ "AF-PRS highest quartile",
      var == "highest quintile" ~ "AF-PRS highest quintile",
      var == "highest decile" ~ "AF-PRS highest decile",
      var == "98th percentile" ~ "AF-PRS ≥98th percentile",
      var == "Rare Variant" ~ "Rare Variant",
      TRUE ~ var
    )
  }

1.6 Table Setup

results_table <- data.frame(Variable = character(), stringsAsFactors = FALSE)

for (var in all_variables) {
  if (!var %in% names(Athletes_NoCM)) next
  
  is_cat <- var %in% categorical_vars
  is_nonnorm <- var %in% nonnormal_vars
  
  af_overall <- Athletes_NoCM %>% filter(AF == 1)
  noaf_overall <- Athletes_NoCM %>% filter(AF == 0)
  
  young_data <- Athletes_NoCM %>% filter(AgeGroup == "≤45")
  af_young <- young_data %>% filter(AF == 1)
  noaf_young <- young_data %>% filter(AF == 0)
  
  old_data <- Athletes_NoCM %>% filter(AgeGroup == ">45")
  af_old <- old_data %>% filter(AF == 1)
  noaf_old <- old_data %>% filter(AF == 0)
  
  # Calculate p-values - Sport
  if (var %in% c("Rowing", "Cycling", "Running", "Other")) {
    if (var == "Rowing") {
      p_overall <- format_p_value(calculate_sport_p_value(Athletes_NoCM, "AFgroup"))
      p_young <- format_p_value(calculate_sport_p_value(young_data, "AFgroup"))
      p_old <- format_p_value(calculate_sport_p_value(old_data, "AFgroup"))
      p_interaction <- format_p_value(calculate_interaction_p(Athletes_NoCM, 
        "Sport: 1=row, 2=cyc, 3=run, 4=CCS, 5=swim, 6=tri, 7=other", is_cat = TRUE))
    } else {
      p_overall <- ""
      p_young <- ""
      p_old <- ""
      p_interaction <- ""
    }
  } else {
    p_overall <- format_p_value(calculate_p_value(Athletes_NoCM, var, "AFgroup", is_cat, is_nonnorm))
    p_young <- format_p_value(calculate_p_value(young_data, var, "AFgroup", is_cat, is_nonnorm))
    p_old <- format_p_value(calculate_p_value(old_data, var, "AFgroup", is_cat, is_nonnorm))
    p_interaction <- format_p_value(calculate_interaction_p(Athletes_NoCM, var, is_cat))
  }
  
  results_table <- rbind(results_table, data.frame(
    Variable = var,
    AF_Overall = format_stats(af_overall, var, is_cat, is_nonnorm),
    NoAF_Overall = format_stats(noaf_overall, var, is_cat, is_nonnorm),
    P_Overall = p_overall,
    AF_Young = format_stats(af_young, var, is_cat, is_nonnorm),
    NoAF_Young = format_stats(noaf_young, var, is_cat, is_nonnorm),
    P_Young = p_young,
    AF_Old = format_stats(af_old, var, is_cat, is_nonnorm),
    NoAF_Old = format_stats(noaf_old, var, is_cat, is_nonnorm),
    P_Old = p_old,
    P_Interaction = p_interaction,
    stringsAsFactors = FALSE
  ))
}

results_table$Category <- sapply(results_table$Variable, assign_category)
results_table$Variable <- sapply(results_table$Variable, clean_variable_names)

n_af_overall <- sum(Athletes_NoCM$AF == 1, na.rm = TRUE)
n_noaf_overall <- sum(Athletes_NoCM$AF == 0, na.rm = TRUE)
n_af_young <- sum(Athletes_NoCM$AF == 1 & Athletes_NoCM$AgeGroup == "≤45", na.rm = TRUE)
n_noaf_young <- sum(Athletes_NoCM$AF == 0 & Athletes_NoCM$AgeGroup == "≤45", na.rm = TRUE)
n_af_old <- sum(Athletes_NoCM$AF == 1 & Athletes_NoCM$AgeGroup == ">45", na.rm = TRUE)
n_noaf_old <- sum(Athletes_NoCM$AF == 0 & Athletes_NoCM$AgeGroup == ">45", na.rm = TRUE)

# Define category orders
category_order_table1 <- c("Demographics", "Anti-arrhythmic Medication use, n (%)", 
                           "Exercise Capacity", "Primary Sport, n (%)")
category_order_table2 <- c("Left ventricular parameters", "Left atrial parameters",
                           "Right Ventricular Parameters", "Doppler Parameters",
                           "Electrocardiographic", "Biochemical", "Genetic")

# Ordered category tables
table1_with_cat <- results_table %>% 
  filter(Category %in% category_order_table1) %>%
  mutate(Category = factor(Category, levels = category_order_table1, ordered = TRUE)) %>%
  arrange(Category)

table2_with_cat <- results_table %>% 
  filter(Category %in% category_order_table2) %>%
  mutate(Category = factor(Category, levels = category_order_table2, ordered = TRUE)) %>%
  arrange(Category)

# Create display tables
table_1 <- table1_with_cat %>% select(-Category)
table_2 <- table2_with_cat %>% select(-Category)

# Rename columns
colnames(table_1) <- c(
  "Characteristic",
  paste0("AF (n=", n_af_overall, ")"),
  paste0("No-AF (n=", n_noaf_overall, ")"),
  "p value",
  paste0("AF (n=", n_af_young, ")"),
  paste0("No-AF (n=", n_noaf_young, ")"),
  "p value ",
  paste0("AF (n=", n_af_old, ")"),
  paste0("No-AF (n=", n_noaf_old, ")"),
  "p value  ",
  "Interaction p value"
)

colnames(table_2) <- colnames(table_1)

cat("\n# Table 1\n\n")

2 Table 1

kable(table_1, 
      caption = "Table 1. Baseline Characteristics of Endurance Athletes Stratified by Age and Atrial Fibrillation Status",
      align = c("l", rep("c", ncol(table_1)-1))) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE, font_size = 10) %>%
  add_header_above(c(" " = 1, 
                     "Overall" = 3, 
                     "Younger Athletes (≤45 years)" = 3, 
                     "Older Athletes (>45 years)" = 3, 
                     " " = 1)) %>%
  pack_rows(index = table(table1_with_cat$Category)) %>%
  add_footnote(c("Data are presented as mean ± SD, median (IQR), or n (%)",
                 "P-values from Wilcoxon rank-sum test (continuous) or chi-square/Fisher exact test (categorical)",
                 "Interaction p-values derived from two-way ANOVA testing for age×AF interaction effects"),
               notation = "none")
Table 1. Baseline Characteristics of Endurance Athletes Stratified by Age and Atrial Fibrillation Status
Overall
Younger Athletes (≤45 years)
Older Athletes (>45 years)
Characteristic AF (n=96) No-AF (n=257) p value AF (n=25) No-AF (n=176) p value AF (n=71) No-AF (n=81) p value Interaction p value
Demographics
Age, years 60 [45-68] 31 [20-51] <0.001 35 [30-43] 22 [18.8-32] <0.001 64 [57-68.5] 60 [52-66] 0.009 0.026
Male sex, n (%) 89 (92.7%) 173 (67.3%) <0.001 21 (84%) 120 (68.2%) 0.159 68 (95.8%) 53 (65.4%) <0.001 0.063
BMI, kg/m² 24.6 [23.5-26.7] 23 [21.2-24.9] <0.001 24.5 [23.8-25.6] 22.1 [20.5-23.6] <0.001 24.8 [23.4-27.4] 25 [23.3-27.4] 0.762 <0.001
BSA, m² 2 [1.9-2.1] 1.9 [1.8-2] <0.001 2 [1.9-2.1] 1.9 [1.7-2] <0.001 2 [1.9-2.1] 2 [1.9-2.2] 0.638 0.004
Average HR, bpm 64.8 ± 9 66.9 ± 9.1 0.084 64.5 ± 8.3 66.6 ± 9.6 0.282 64.9 ± 9.4 67.5 ± 7.6 0.104 0.852
Minimum HR, bpm 45 [40-49] 43 [39-48] 0.332 44.5 [40.2-48.8] 42 [38-47] 0.263 45 [40-50] 46 [43-50] 0.140 0.091
Bradycardia burden, % (24h) 4.5 [0.2-23.4] 7.4 [0.4-28.3] 0.167 12.5 [0.4-26.7] 16.9 [0.8-33] 0.304 3.5 [0.2-20.2] 1.6 [0.2-8.6] 0.270 0.121
Hypertension, n (%) 14 (14.9%) 17 (6.7%) 0.029 2 (8.3%) 3 (1.7%) 0.111 12 (17.1%) 14 (17.5%) 1.000 0.127
SBP, mmHg 132 [124-143.2] 123 [115-134] <0.001 126 [122-135] 122 [113-131] 0.027 133 [126.5-145.5] 130 [119-138] 0.043 0.884
DBP, mmHg 76 [69.8-82.2] 66 [60.8-75] <0.001 70 [64-76] 64 [59-70] 0.007 78 [72-83] 75 [68-80] 0.068 0.305
Dyslipidaemia, n (%) 23 (24.5%) 22 (8.6%) <0.001 3 (12.5%) 1 (0.6%) 0.006 20 (28.6%) 21 (26.2%) 0.893 0.006
Diabetes, n (%) 2 (2.1%) 0 (0%) 0.072 0 (0%) 0 (0%) 2 (2.9%) 0 (0%) 0.216 1.000
History of smoking, n (%) 22 (23.4%) 25 (9.8%) 0.002 2 (8.3%) 4 (2.3%) 0.155 20 (28.6%) 21 (26.2%) 0.893 0.226
Alcohol, drinks/week 3 [0-7] 0 [0-5] <0.001 0.5 [0-3.5] 0 [0-0] 0.007 4.5 [1.2-8.8] 7 [4-12] 0.061 0.047
Anti-arrhythmic Medication use, n (%)
Beta-blockers 11 (11.7%) 1 (0.4%) <0.001 2 (8.3%) 0 (0%) 0.014 9 (12.9%) 1 (1.2%) 0.006 0.218
Non-dihydropyridine CCB’s 2 (50%) 0 (0%) 1.000 0 (0%) 0 (0%) 2 (50%) 0 (0%) 1.000
Flecainide 12 (12.5%) 1 (0.4%) <0.001 3 (12%) 0 (0%) 0.002 9 (12.7%) 1 (1.2%) 0.006 0.154
Sotalol 2 (2.1%) 0 (0%) 0.074 0 (0%) 0 (0%) 2 (2.8%) 0 (0%) 0.219 1.000
Amiodarone 1 (1%) 0 (0%) 0.274 0 (0%) 0 (0%) 1 (1.4%) 0 (0%) 0.470 1.000
Exercise Capacity
VO₂peak, mL/kg/min 41.3 ± 11.5 51.7 ± 14.2 <0.001 51.6 ± 8.9 58.7 ± 10.1 <0.001 37.6 ± 10 36.5 ± 8.9 0.469 0.002
VO₂peak, L/min 3378.2 ± 947.3 3778.3 ± 995.5 <0.001 4209.6 ± 809 4148.5 ± 876.5 0.729 3085.5 ± 812.3 2974 ± 731 0.378 0.821
Total weekly MET-hours 74 [45-91] 87 [55-109.5] 0.007 81 [57-106] 98 [74-122] 0.075 74 [44-86.5] 64 [41-87] 0.535 0.056
Yrs of endurance training 26 [17-36.5] 17 [8.5-26] <0.001 15 [10-18] 11 [4-18] 0.066 30 [20-40] 29 [20-39] 0.395 0.567
Primary Sport, n (%)
Rowing 43 (44.8%) 125 (48.6%) 0.127 5 (20%) 47 (26.7%) 0.342 38 (53.5%) 78 (96.3%) <0.001 <0.001
Cycling 23 (24%) 77 (30%) 9 (36%) 75 (42.6%) 14 (19.7%) 2 (2.5%)
Running 10 (10.4%) 26 (10.1%) 3 (12%) 25 (14.2%) 7 (9.9%) 1 (1.2%)
Other 20 (20.8%) 29 (11.3%) 8 (32%) 29 (16.5%) 12 (16.9%) 0 (0%)
Data are presented as mean ± SD, median (IQR), or n (%)
P-values from Wilcoxon rank-sum test (continuous) or chi-square/Fisher exact test (categorical)
Interaction p-values derived from two-way ANOVA testing for age×AF interaction effects
cat("\n# Table 2\n\n")

3 Table 2

kable(table_2, 
      caption = "Table 2. Cardiac Structure, Function, and Electrophysiological Parameters in Endurance Athletes by Age and Atrial Fibrillation Status",
      align = c("l", rep("c", ncol(table_2)-1))) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE, font_size = 10) %>%
  add_header_above(c(" " = 1, 
                     "Overall" = 3, 
                     "Younger Athletes (≤45 years)" = 3, 
                     "Older Athletes (>45 years)" = 3, 
                     " " = 1)) %>%
  pack_rows(index = table(table2_with_cat$Category)) %>%
  add_footnote(c("Data are presented as mean ± SD, median (IQR), or n (%)",
                 "Bolded values indicate statistically significant pairwise comparisons (p<0.05)",
                 "Interaction p-values test whether AF associations differ between age groups"),
               notation = "none")
Table 2. Cardiac Structure, Function, and Electrophysiological Parameters in Endurance Athletes by Age and Atrial Fibrillation Status
Overall
Younger Athletes (≤45 years)
Older Athletes (>45 years)
Characteristic AF (n=96) No-AF (n=257) p value AF (n=25) No-AF (n=176) p value AF (n=71) No-AF (n=81) p value Interaction p value
Left ventricular parameters
LVEDV, mL 157.4 ± 40.5 170.4 ± 46.5 0.011 173.7 ± 45.3 183.3 ± 44.2 0.328 151.6 ± 37.2 142.3 ± 38.5 0.136 0.094
LVESV, mL 64 ± 17.8 69.9 ± 24.1 0.013 68.7 ± 21.1 75.5 ± 24.5 0.147 62.4 ± 16.4 57.9 ± 18.2 0.113 0.051
LVEF, % 58.1 ± 5.4 58.1 ± 5.6 0.986 60.2 ± 4.8 57.9 ± 5.7 0.040 57.4 ± 5.4 58.6 ± 5.5 0.182 0.020
LV strain, % -18.2 ± 2.8 -19.2 ± 2.1 0.001 -18.7 ± 2 -19.1 ± 2.1 0.291 -18 ± 3 -19.3 ± 2.1 0.003 0.173
Left atrial parameters
LAESV, mL 90 [77.8-118.5] 79 [66-95] <0.001 84 [73-100] 79 [66-93] 0.116 97 [78.5-122.5] 81 [66-98] <0.001 0.058
LAVi, mL/m² 46.2 [38.7-55.9] 41.9 [34.5-48.4] <0.001 41.8 [37.5-49.6] 41.9 [34.8-48.4] 0.694 47.7 [39.9-58.6] 41.9 [32.7-47.8] <0.001 0.004
LA:LV 0.59 [0.49-0.75] 0.46 [0.39-0.56] <0.001 0.5 [0.45-0.6] 0.44 [0.38-0.49] 0.001 0.64 [0.52-0.84] 0.59 [0.5-0.69] 0.034 0.391
Reservoir strain, % 25.8 ± 8.9 32.2 ± 5.9 <0.001 33.2 ± 6.3 33.6 ± 5.5 0.770 23.4 ± 8.4 29.2 ± 5.4 <0.001 0.003
Conduit strain, % 15.2 ± 6.3 20.9 ± 6.5 <0.001 21.6 ± 5.5 23.6 ± 5.2 0.127 13.1 ± 5.1 15.2 ± 5.2 0.015 0.908
Contractile strain, % 10.7 ± 5.6 11.4 ± 4.3 0.311 11.5 ± 3.3 10 ± 3.4 0.064 10.4 ± 6.2 14.1 ± 4.9 <0.001 <0.001
Mildly abnormal >34 mL/m², n (%) 87 (90.6%) 196 (76.3%) 0.004 20 (80%) 137 (77.8%) 1.000 67 (94.4%) 59 (72.8%) <0.001 0.030
Severely abnormal >48 mL/m², n (%) 41 (42.7%) 66 (25.7%) 0.003 7 (28%) 46 (26.1%) 1.000 34 (47.9%) 20 (24.7%) 0.005 0.107
Extremely abnormal >60 mL/m², n (%) 18 (18.8%) 8 (3.1%) <0.001 2 (8%) 7 (4%) 0.310 16 (22.5%) 1 (1.2%) <0.001 0.039
Right Ventricular Parameters
RVEDV, mL 144.1 ± 42.5 152.3 ± 48.7 0.132 163.5 ± 47.7 165.1 ± 47.4 0.877 137 ± 38.5 124.8 ± 39.3 0.059 0.250
RVESV, mL 78.7 ± 26.6 76 ± 25.5 0.417 86 ± 28.1 79.7 ± 24.3 0.300 76 ± 25.7 68.2 ± 26.4 0.075 0.844
RVEF, % 45.4 ± 8.3 49.5 ± 8.9 <0.001 47 ± 8.4 51.2 ± 8.9 0.031 44.8 ± 8.3 45.8 ± 7.9 0.470 0.173
Doppler Parameters
E velocity, m/s 0.6 ± 0.2 0.7 ± 0.2 <0.001 0.7 ± 0.2 0.8 ± 0.2 0.003 0.6 ± 0.2 0.6 ± 0.1 0.226 0.001
A velocity, m/s 0.5 ± 0.1 0.4 ± 0.1 0.019 0.4 ± 0.1 0.4 ± 0.1 0.184 0.5 ± 0.2 0.5 ± 0.1 0.029 0.009
E/A ratio 1.6 ± 1.3 1.9 ± 0.9 0.012 1.8 ± 0.5 2.3 ± 0.8 <0.001 1.5 ± 1.6 1.2 ± 0.4 0.122 <0.001
E′ velocity, cm/s 9.3 ± 2.9 11.4 ± 3.4 <0.001 11.5 ± 2.7 13 ± 2.5 0.015 8.6 ± 2.5 8 ± 2.2 0.104 0.001
E/E’ ratio 6.2 ± 2.4 5.7 ± 1.4 0.055 5 ± 1.1 5.2 ± 1 0.362 6.6 ± 2.6 6.6 ± 1.7 0.913 0.692
Electrocardiographic
P wave duration, ms 124 [114-131] 111 [101-121] <0.001 117 [98-129] 106 [96.8-115.2] 0.017 126 [117-132] 120 [112-129] 0.053 0.220
P wave axis, ° 51.3 ± 32.4 49.6 ± 23.9 0.667 50.1 ± 35.1 48.4 ± 24.8 0.822 51.8 ± 31.6 52 ± 21.5 0.956 0.790
PTFV₁, mV·ms 2.3 [1-4] 1.9 [1-3] 0.049 2 [1.2-3.2] 2 [1.3-3.1] 0.775 2.4 [0-4.3] 1.6 [0-2.4] <0.001 0.016
Advanced IAB, n (%) 12 (15%) 16 (6.3%) 0.026 0 (0%) 2 (1.1%) 1.000 12 (21.8%) 14 (17.5%) 0.687 0.412
Atrial ectopy, per 24 h 118 [20-521] 15 [6-47] <0.001 15.5 [7-35.2] 10 [5-34] 0.299 239 [69-919] 28 [12-98] <0.001 0.001
P-wave duration >120 ms, n (%) 51 (62.2%) 74 (28.8%) <0.001 12 (48%) 32 (18.2%) 0.002 39 (68.4%) 42 (51.9%) 0.077 0.208
PTFV₁ ≥4 mV·ms, n (%) 21 (25.6%) 23 (8.9%) <0.001 3 (12%) 19 (10.8%) 0.742 18 (31.6%) 4 (4.9%) <0.001 0.011
Biochemical
BNP, pg/mL 24.8 [12.7-55.8] 13.9 [8.5-24] <0.001 11 [7.9-21.3] 10 [7-17.5] 0.540 38.4 [18.1-63.5] 22.4 [14-37.8] 0.002 0.004
cTnI, ng/L 4 [2-7] 3 [2-8.2] 0.025 3 [2-4] 3 [2-12] 0.285 6 [3-8] 2 [2-4] <0.001 0.504
Genetic
AF-PRS percentile 64.5 [33-87.2] 51 [29-73] 0.016 73 [53-78] 49.5 [29-76.2] 0.065 62 [30.5-88.5] 52 [29-71] 0.092 0.698
AF-PRS highest quartile 34 (35.4%) 60 (23.3%) 0.032 7 (28%) 45 (25.6%) 0.987 27 (38%) 15 (18.5%) 0.012 0.145
AF-PRS highest decile 22 (22.9%) 26 (10.1%) 0.003 5 (20%) 19 (10.8%) 0.318 17 (23.9%) 7 (8.6%) 0.018 0.512
AF-PRS 30.2 [30.1-30.3] 30.2 [30.1-30.2] 0.015 30.2 [30.2-30.3] 30.2 [30.1-30.3] 0.066 30.2 [30.1-30.3] 30.2 [30.1-30.2] 0.091 0.989
AF-PRS highest quintile 29 (30.2%) 53 (20.6%) 0.079 6 (24%) 40 (22.7%) 1.000 23 (32.4%) 13 (16%) 0.030 0.175
AF-PRS ≥98th percentile 8 (8.5%) 6 (2.4%) 0.022 2 (8.3%) 4 (2.3%) 0.155 6 (8.6%) 2 (2.5%) 0.147 0.960
Rare Variant 17 (18.1%) 32 (12.5%) 0.251 4 (16.7%) 16 (9.1%) 0.273 13 (18.6%) 16 (20%) 0.989 0.303
Data are presented as mean ± SD, median (IQR), or n (%)
Bolded values indicate statistically significant pairwise comparisons (p<0.05)
Interaction p-values test whether AF associations differ between age groups

4 Clinical Risk Prediction Models

# Binary variables for clinical model
clinical_mapping <- list(
  "LAVi_34" = "LAVi > 34",
  "LAVi_60" = "LAVi > 60",
  "P_wave_120" = "Prevalence of P-wave duration >120 ms",
  "PTFV1_high" = "High PTFV1",
  "PAC_100" = "Prevalence of >=100 PACs/24 h",
  "highest_decile" = "highest decile"
)

Athletes_NoCM$Age_decades <- Athletes_NoCM$Age / 10
Athletes_NoCM$High_normal_BP <- as.integer(Athletes_NoCM$rest_SBP >= 130)

for (new_name in names(clinical_mapping)) {
  old_name <- clinical_mapping[[new_name]]
  if (old_name %in% names(Athletes_NoCM)) {
    Athletes_NoCM[[new_name]] <- as.integer(Athletes_NoCM[[old_name]] == 1)
  } else {
    Athletes_NoCM[[new_name]] <- NA_integer_
  }
}

# Exclude athletes with AF during monitoring
Athletes_modelling <- Athletes_NoCM %>%
  filter(is.na(`AF during ECG`) | `AF during ECG` != 1) %>%
  filter(is.na(`AF Throughout`) | `AF Throughout` != 1)

# Filter complete cases for modelling
Athletes_analysis <- Athletes_modelling %>%
  filter(!is.na(AF) & !is.na(Age) & !is.na(`AF centile`)) %>%
  filter(complete.cases(select(., Age_decades, High_normal_BP, LAVi_34, 
                              LAVi_60, P_wave_120, PTFV1_high, PAC_100)))

#Exclusions

# Calculate exclusions for Model B (Clinical-Only)
n_excluded_monitoring <- nrow(Athletes_NoCM) - nrow(Athletes_modelling)
n_excluded_monitoring_af <- sum((Athletes_NoCM$`AF during ECG` == 1 | 
                                 Athletes_NoCM$`AF Throughout` == 1) & 
                                Athletes_NoCM$AF == 1, na.rm = TRUE)
n_excluded_monitoring_noaf <- n_excluded_monitoring - n_excluded_monitoring_af

n_excluded_missing_clinical <- nrow(Athletes_modelling) - nrow(Athletes_analysis)
n_excluded_missing_clinical_af <- sum(Athletes_modelling$AF == 1, na.rm = TRUE) - 
                                   sum(Athletes_analysis$AF == 1)
n_excluded_missing_clinical_noaf <- n_excluded_missing_clinical - n_excluded_missing_clinical_af

# Calculate Model A (Genetic-Only) - needs PRS data
Athletes_model_A <- Athletes_modelling %>%
  filter(!is.na(AF) & !is.na(`highest decile`))
n_model_a <- nrow(Athletes_model_A)
n_model_a_af <- sum(Athletes_model_A$AF == 1)
n_model_a_noaf <- sum(Athletes_model_A$AF == 0)

# Calculate Model C (Clinical + Genetic) - needs both clinical and PRS
Athletes_model_C <- Athletes_modelling %>%
  filter(!is.na(AF) & !is.na(Age) & !is.na(`highest decile`)) %>%
  filter(complete.cases(select(., Age_decades, High_normal_BP, LAVi_34, 
                                LAVi_60, P_wave_120, PTFV1_high, PAC_100, highest_decile)))
n_model_c <- nrow(Athletes_model_C)
n_model_c_af <- sum(Athletes_model_C$AF == 1)
n_model_c_noaf <- sum(Athletes_model_C$AF == 0)

# Calculate how many excluded due to missing PRS (difference between B and C)
n_excluded_prs <- nrow(Athletes_analysis) - n_model_c
n_excluded_prs_af <- sum(Athletes_analysis$AF == 1) - n_model_c_af
n_excluded_prs_noaf <- n_excluded_prs - n_excluded_prs_af

# Exclusion Table
exclusion_flow <- data.frame(
  Step = c("Starting cohort", 
           "Excluded: AF during ECG/Holter", 
           "Excluded: Missing clinical data",
           "Model A: Genetic-Only",
           "Model B: Clinical-Only", 
           "Excluded: Missing PRS data",
           "Model C: Clinical + Genetic"),
  N = c(
    nrow(Athletes_NoCM),
    n_excluded_monitoring,
    n_excluded_missing_clinical,
    n_model_a,
    nrow(Athletes_analysis),
    n_excluded_prs,
    n_model_c
  ),
  AF_cases = c(
    sum(Athletes_NoCM$AF == 1, na.rm = TRUE),
    n_excluded_monitoring_af,
    n_excluded_missing_clinical_af,
    n_model_a_af,
    sum(Athletes_analysis$AF == 1),
    n_excluded_prs_af,
    n_model_c_af
  ),
  Non_AF = c(
    sum(Athletes_NoCM$AF == 0, na.rm = TRUE),
    n_excluded_monitoring_noaf,
    n_excluded_missing_clinical_noaf,
    n_model_a_noaf,
    sum(Athletes_analysis$AF == 0),
    n_excluded_prs_noaf,
    n_model_c_noaf
  )
)

kable(exclusion_flow,
      col.names = c("Cohort Stage", "Total (n)", "AF cases (n)", "Non-AF (n)"),
      align = c("l", "c", "c", "c"),
      caption = "Study Population Flow for Prediction Models") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(c(4, 5, 7), bold = TRUE, background = "#f0f0f0")  # Highlight the three models
Study Population Flow for Prediction Models
Cohort Stage Total (n) AF cases (n) Non-AF (n)
Starting cohort 353 96 257
Excluded: AF during ECG/Holter 15 15 0
Excluded: Missing clinical data 10 4 6
Model A: Genetic-Only 338 81 257
Model B: Clinical-Only 328 77 251
Excluded: Missing PRS data 0 0 0
Model C: Clinical + Genetic 328 77 251
# What data was missing
incomplete_only <- Athletes_modelling %>%
  filter(!complete.cases(select(., AF, Age, `AF centile`, Age_decades, High_normal_BP, 
                                LAVi_34, LAVi_60, P_wave_120, PTFV1_high, PAC_100)))

missing_summary <- data.frame(
  Variable = c("Age", "AF-PRS", "SBP ≥130 mmHg", "LAVi >34 mL/m²", 
               "LAVi >60 mL/m²", "P-wave >120 ms", "PTFV₁ ≥4 mV·ms", "PACs ≥100/24h"),
  Missing_n = c(
    sum(is.na(incomplete_only$Age)),
    sum(is.na(incomplete_only$`AF centile`)),
    sum(is.na(incomplete_only$High_normal_BP)),
    sum(is.na(incomplete_only$LAVi_34)),
    sum(is.na(incomplete_only$LAVi_60)),
    sum(is.na(incomplete_only$P_wave_120)),
    sum(is.na(incomplete_only$PTFV1_high)),
    sum(is.na(incomplete_only$PAC_100))
  )
)

missing_summary$Pct_Excluded <- round((missing_summary$Missing_n / nrow(incomplete_only)) * 100, 1)

kable(missing_summary, 
      col.names = c("Variable", "Missing (n)", "% of Excluded Cases"),
      align = c("l", "c", "c")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Variable Missing (n) % of Excluded Cases
Age 0 0
AF-PRS 0 0
SBP ≥130 mmHg 1 10
LAVi >34 mL/m² 0 0
LAVi >60 mL/m² 0 0
P-wave >120 ms 0 0
PTFV₁ ≥4 mV·ms 0 0
PACs ≥100/24h 9 90

4.1 Model A: Genetic-Only

model_A <- glm(AF ~ highest_decile, data = Athletes_analysis, family = binomial())
Athletes_analysis$predicted_prob_A <- predict(model_A, type = "response")

roc_A <- pROC::roc(Athletes_analysis$AF, Athletes_analysis$predicted_prob_A, quiet = TRUE)
auc_A <- as.numeric(pROC::auc(roc_A))
ci_A <- pROC::ci.auc(roc_A)

model_A_summary <- broom::tidy(model_A, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)")

cat("Model A:Genetic Risk (Highest Decile)\n\n")
## Model A:Genetic Risk (Highest Decile)
cat("AUC:", round(auc_A, 3), "(95% CI:", round(ci_A[1], 3), "-", round(ci_A[3], 3), ")\n")
## AUC: 0.561 (95% CI: 0.51 - 0.611 )
cat("OR:", round(model_A_summary$estimate, 2), 
    "(95% CI:", round(model_A_summary$conf.low, 2), "-", round(model_A_summary$conf.high, 2), ")\n")
## OR: 2.56 (95% CI: 1.28 - 5.03 )
cat("P-value:", format_p_value(model_A_summary$p.value), "\n")
## P-value: 0.007

4.2 Model B: Clinical-Only

# Fit model
model_B <- glm(AF ~ Age_decades + High_normal_BP + LAVi_34 + LAVi_60 +
                P_wave_120 + PTFV1_high + PAC_100,
              data = Athletes_analysis, family = binomial())

Athletes_analysis$predicted_prob_B <- predict(model_B, type = "response")

# ROC analysis
roc_B <- pROC::roc(Athletes_analysis$AF, Athletes_analysis$predicted_prob_B, quiet = TRUE)
auc_B <- as.numeric(pROC::auc(roc_B))
ci_B <- pROC::ci.auc(roc_B)

cat("Model B Performance\n\n")
## Model B Performance
cat("- AUC:", round(auc_B, 3), "(95% CI:", round(ci_B[1], 3), "-", round(ci_B[3], 3), ")\n\n")
## - AUC: 0.844 (95% CI: 0.791 - 0.897 )
# Derive point system
beta_values <- coef(model_B)[-1]
min_beta <- min(abs(beta_values))
derived_points <- round(abs(beta_values) / min_beta)

# Create coefficient table
model_B_coefs <- broom::tidy(model_B, conf.int = TRUE, exponentiate = FALSE)
model_B_or <- broom::tidy(model_B, conf.int = TRUE, exponentiate = TRUE)

model_B_summary <- model_B_coefs %>%
  filter(term != "(Intercept)") %>%
  mutate(
    Variable = case_when(
      term == "Age_decades" ~ "Age (per decade)",
      term == "High_normal_BP" ~ "SBP ≥130 mmHg",
      term == "LAVi_34" ~ "LAVi 35-60 mL/m²",
      term == "LAVi_60" ~ "LAVi >60 mL/m²",
      term == "P_wave_120" ~ "P-wave >120 ms",
      term == "PTFV1_high" ~ "PTFV₁ ≥4 mV·ms",
      term == "PAC_100" ~ "PACs ≥100/24h"
    ),
    OR = model_B_or$estimate[-1],
    OR_lower = model_B_or$conf.low[-1],
    OR_upper = model_B_or$conf.high[-1],
    Points = derived_points,
    Points_Display = if_else(term == "LAVi_60",
      paste0("+", derived_points[4], " (", derived_points[3] + derived_points[4], " total)"),
      as.character(derived_points))
  ) %>%
  select(Variable, `β` = estimate, SE = std.error, OR, 
         `OR CI lower` = OR_lower, `OR CI upper` = OR_upper, 
         `P value` = p.value, `Points Assigned` = Points_Display)

# Calculate risk scores
Athletes_analysis$clinical_score_B <- with(Athletes_analysis,
  Age_decades * derived_points[1] +
    High_normal_BP * derived_points[2] +
    if_else(LAVi_60 == 1, derived_points[3] + derived_points[4],
            if_else(LAVi_34 == 1, derived_points[3], 0)) +
    P_wave_120 * derived_points[5] +
    PTFV1_high * derived_points[6] +
    PAC_100 * derived_points[7]
)

# Model calibration
hl_test <- hoslem.test(Athletes_analysis$AF, Athletes_analysis$predicted_prob_B, g = 10)
cat("Calibration: Hosmer-Lemeshow χ² =", round(hl_test$statistic, 2), 
    ", p =", round(hl_test$p.value, 3), 
    if_else(hl_test$p.value > 0.05, "(good fit)\n\n", "(poor fit)\n\n"))
## Calibration: Hosmer-Lemeshow χ² = 6.56 , p = 0.584 (good fit)
# Optimal cutoff
coords_result <- coords(roc_B, "best", ret = c("threshold", "sensitivity", "specificity", "ppv", "npv"))
cat("Optimal Cutoff (Youden): Threshold =", round(coords_result$threshold, 3), "\n")
## Optimal Cutoff (Youden): Threshold = 0.22
cat("Sensitivity:", round(coords_result$sensitivity * 100, 1), 
    "% | Specificity:", round(coords_result$specificity * 100, 1), "%\n")
## Sensitivity: 80.5 % | Specificity: 77.3 %
cat("PPV:", round(coords_result$ppv * 100, 1), 
    "% | NPV:", round(coords_result$npv * 100, 1), "%\n\n")
## PPV: 52.1 % | NPV: 92.8 %
# Performance at clinical thresholds
thresholds <- c(0.10, 0.40, 0.70)
threshold_names <- c("Low-Moderate (10%)", "Moderate-High (40%)", "High-Very High (70%)")

perf_table <- map_dfr(1:length(thresholds), function(i) {
  predicted_class <- if_else(Athletes_analysis$predicted_prob_B >= thresholds[i], 1, 0)
  tp <- sum(predicted_class == 1 & Athletes_analysis$AF == 1)
  tn <- sum(predicted_class == 0 & Athletes_analysis$AF == 0)
  fp <- sum(predicted_class == 1 & Athletes_analysis$AF == 0)
  fn <- sum(predicted_class == 0 & Athletes_analysis$AF == 1)
  
  tibble(
    Threshold = threshold_names[i],
    Sensitivity = paste0(round(100 * tp/(tp+fn), 1), "%"),
    Specificity = paste0(round(100 * tn/(tn+fp), 1), "%"),
    PPV = paste0(round(100 * tp/(tp+fp), 1), "%"),
    NPV = paste0(round(100 * tn/(tn+fn), 1), "%")
  )
})

kable(perf_table, caption = "Model Performance at Clinical Thresholds") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Model Performance at Clinical Thresholds
Threshold Sensitivity Specificity PPV NPV
Low-Moderate (10%) 92.2% 55.4% 38.8% 95.9%
Moderate-High (40%) 58.4% 90.4% 65.2% 87.6%
High-Very High (70%) 29.9% 98.4% 85.2% 82.1%
# Risk stratification
prob_thresholds <- c(0.10, 0.40, 0.70)
cutoffs <- round(map_dbl(prob_thresholds, ~min(Athletes_analysis$clinical_score_B[Athletes_analysis$predicted_prob_B >= .x], na.rm = TRUE)))
score_breaks <- c(-Inf, cutoffs, Inf)

Athletes_analysis <- Athletes_analysis %>%
  mutate(
    clinical_score_B_rounded = round(clinical_score_B),
    risk_category = cut(clinical_score_B_rounded, breaks = score_breaks,
                        labels = c("Low", "Moderate", "High", "Very High"),
                        include.lowest = TRUE)
  )

risk_summary <- Athletes_analysis %>%
  group_by(risk_category) %>%
  summarise(
    `Score Range` = paste0(min(clinical_score_B_rounded), "–", max(clinical_score_B_rounded)),
    N = n(),
    `AF Cases` = sum(AF),
    `AF Prevalence (%)` = round(mean(AF) * 100, 1),
    `Mean Age (years)` = round(mean(Age), 1),
    `High BP (%)` = round(mean(High_normal_BP) * 100, 1),
    `LAVi >34 (%)` = round(mean(LAVi_34 | LAVi_60) * 100, 1),
    `P-wave >120ms (%)` = round(mean(P_wave_120) * 100, 1),
    `High PTFV1 (%)` = round(mean(PTFV1_high) * 100, 1),
    `PACs ≥100/24h (%)` = round(mean(PAC_100) * 100, 1),
    .groups = "drop"
  ) %>%
  rename(`Risk Category` = risk_category)

4.3 Model C: Clinical-Genetic

model_C <- glm(AF ~ Age_decades + High_normal_BP + LAVi_34 + LAVi_60 +
                P_wave_120 + PTFV1_high + PAC_100 + highest_decile,
              data = Athletes_analysis, family = binomial())

Athletes_analysis$predicted_prob_C <- predict(model_C, type = "response")
roc_C <- pROC::roc(Athletes_analysis$AF, Athletes_analysis$predicted_prob_C, quiet = TRUE)
auc_C <- as.numeric(pROC::auc(roc_C))
ci_C <- pROC::ci.auc(roc_C)
delong_BC <- pROC::roc.test(roc_B, roc_C, method = "delong")

cat("Model C: Clinical + Genetic\n\n")
## Model C: Clinical + Genetic
cat("- AUC:", round(auc_C, 3), "(95% CI:", round(ci_C[1], 3), "-", round(ci_C[3], 3), ")\n")
## - AUC: 0.85 (95% CI: 0.798 - 0.902 )
cat("- Comparison to Model B: p =", format(delong_BC$p.value, digits = 3), "\n")
## - Comparison to Model B: p = 0.324

5 Table 3

kable(model_B_summary, digits = 3,
      caption = paste0("Table 3. Multivariable Logistic Regression Model Predicting Atrial Fibrillation (n=", nrow(Athletes_analysis), ")")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Table 3. Multivariable Logistic Regression Model Predicting Atrial Fibrillation (n=328)
Variable β SE OR OR CI lower OR CI upper P value Points Assigned
Age (per decade) 0.243 0.104 1.276 1.044 1.570 0.019 1
SBP ≥130 mmHg 0.971 0.338 2.640 1.372 5.190 0.004 4
LAVi 35-60 mL/m² 0.858 0.468 2.359 0.989 6.290 0.066 4
LAVi >60 mL/m² 0.878 0.642 2.406 0.673 8.473 0.171 +4 (8 total)
P-wave >120 ms 0.993 0.348 2.699 1.371 5.392 0.004 4
PTFV₁ ≥4 mV·ms 0.861 0.446 2.367 0.978 5.666 0.053 4
PACs ≥100/24h 1.535 0.358 4.644 2.311 9.444 0.000 6

6 Table 4

kable(risk_summary, 
      caption = "Table 4. Clinical Risk Stratification Categories") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 10)
Table 4. Clinical Risk Stratification Categories
Risk Category Score Range N AF Cases AF Prevalence (%) Mean Age (years) High BP (%) LAVi >34 (%) P-wave >120ms (%) High PTFV1 (%) PACs ≥100/24h (%)
Low 2–10 150 6 4.0 27.3 20.0 65.3 10.0 4.7 0.7
Moderate 11–17 110 26 23.6 44.8 58.2 87.3 47.3 11.8 21.8
High 18–24 46 26 56.5 59.9 60.9 95.7 76.1 23.9 63.0
Very High 25–32 22 19 86.4 67.7 86.4 95.5 90.9 40.9 100.0

7 Central Figure:

AF Polygenic Risk Score Distribution

if ("AF PRS 6238158var PGS000016" %in% names(Athletes_NoCM)) {
  ggplot(Athletes_NoCM, aes(x = factor(AF), y = `AF PRS 6238158var PGS000016`, 
                            fill = factor(AF))) +
    geom_violin(trim = FALSE, alpha = 0.8, color = NA) +
    geom_boxplot(width = 0.1, color = "black", fill = "white") +
    stat_compare_means(method = "t.test", label = "p.format", 
                       comparisons = list(c("0", "1")),
                       bracket.size = 0.7, size = 5, tip.length = 0.02) +
    scale_fill_manual(values = c("0" = "#0072B2", "1" = "#990000")) +
    scale_x_discrete(labels = c("No AF", "AF")) +
    labs(x = "", y = "AF Polygenic Risk Score") +
    theme_classic(base_size = 20) +
    theme(legend.position = "none")
} else {
  cat("AF PRS variable not found in dataset\n")
}
Legend = AF Polygenic Risk Score Distribution by AF Status

Legend = AF Polygenic Risk Score Distribution by AF Status

8 Figure 1

Figure 1. Bar Plot showing the prevalence of abnormal atrial myopathy markers in athletes with (n=96) and without (n=257) Atrial Fibrillation

# Ensure AF_status exists
Athletes_NoCM$AF_status <- factor(Athletes_NoCM$AF, levels = c(0, 1), labels = c("No AF", "AF"))

# Panel A: Individual criterion prevalence
criteria_prevalence <- Athletes_NoCM %>%
  group_by(AF_status) %>%
  summarise(
    `SBP ≥130 mmHg` = mean(High_normal_BP, na.rm = TRUE) * 100,
    `LAVi 35-60 mL/m²` = mean(LAVi_34 & !LAVi_60, na.rm = TRUE) * 100,
    `LAVi >60 mL/m²` = mean(LAVi_60, na.rm = TRUE) * 100,
    `P-wave >120 ms` = mean(P_wave_120, na.rm = TRUE) * 100,
      `PTFV₁ ≥4 mV·ms` = mean(PTFV1_high, na.rm = TRUE) * 100,
    `PACs ≥100/24h` = mean(PAC_100, na.rm = TRUE) * 100,
    `AF-PRS (top decile)` = mean(highest_decile, na.rm = TRUE) * 100,
    .groups = "drop"
  ) %>%
  pivot_longer(cols = -AF_status, names_to = "Criterion", values_to = "Percentage")

fig1a <- ggplot(criteria_prevalence, aes(x = reorder(Criterion, -Percentage), 
                                          y = Percentage, fill = AF_status)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9), 
           color = "black", alpha = 0.8, width = 0.8) +
  scale_x_discrete(labels = function(x) {
    x <- gsub("PTFV₁", "PTFV1", x)  # Replace subscript with regular 1
    return(x)
  }) +
  scale_fill_manual(values = c("No AF" = "#0072B2", "AF" = "#990000")) +
  labs(x = "Atrial Myopathy Criterion", y = "Prevalence (%)", fill = "") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1),
        legend.position = "bottom")

# Panel B: Number of criteria met (overlap)
Athletes_NoCM$criteria_count <- rowSums(
  select(Athletes_NoCM, High_normal_BP, LAVi_34, LAVi_60, 
         P_wave_120, PTFV1_high, PAC_100, highest_decile),
  na.rm = TRUE
)

Athletes_NoCM$criteria_group <- factor(
  case_when(
    Athletes_NoCM$criteria_count == 0 ~ "0",
    Athletes_NoCM$criteria_count == 1 ~ "1",
    Athletes_NoCM$criteria_count == 2 ~ "2",
    Athletes_NoCM$criteria_count == 3 ~ "3",
    Athletes_NoCM$criteria_count == 4 ~ "4",
    Athletes_NoCM$criteria_count >= 5 ~ "≥5"
  ),
  levels = c("0", "1", "2", "3", "4", "≥5")
)

criteria_overlap <- Athletes_NoCM %>%
  count(AF_status, criteria_group) %>%
  group_by(AF_status) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  ungroup() %>%
  complete(AF_status, criteria_group, fill = list(n = 0, percentage = 0))

fig1b <- ggplot(criteria_overlap, aes(x = criteria_group, y = percentage, 
                                       fill = AF_status)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9), 
           color = "black", alpha = 0.8, width = 0.8) +
  geom_text(aes(label = paste0(round(percentage, 1), "%")),
            position = position_dodge(width = 0.9), vjust = -0.5, size = 3) +
  scale_fill_manual(values = c("No AF" = "#0072B2", "AF" = "#990000")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(x = "Number of Criteria Met", y = "Percentage (%)", fill = "") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

# Combine panels
ggarrange(fig1a, fig1b, ncol = 2, labels = c("A", "B"), 
          common.legend = TRUE, legend = "bottom")
Legend = Abnormal Atrial Myopathy Markers – AF-PRS = atrial fibrillation polygenic risk score; LAVi = left atrial volume index; PAC = premature atrial complex; PTFV1 = P wave terminal force velocity in lead V1; P-wave = P wave duration; SBP = systolic blood pressure.

Legend = Abnormal Atrial Myopathy Markers – AF-PRS = atrial fibrillation polygenic risk score; LAVi = left atrial volume index; PAC = premature atrial complex; PTFV1 = P wave terminal force velocity in lead V1; P-wave = P wave duration; SBP = systolic blood pressure.

9 Figure 2

Figure 2. Receiver-operating characteristic (ROC) curves for Model A (Genetic-Only) Model B (Clinical-Only), and Model C (Clinical-Genetic).

# Model A ROC
p2a <- ggplot(data.frame(fpr = 1 - roc_A$specificities,
                        tpr = roc_A$sensitivities),
             aes(x = fpr, y = tpr)) +
  geom_line(color = "#e9c46a", size = 1.2) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  annotate("text", x = 0.6, y = 0.2,
           label = paste0("AUC = ", round(auc_A, 3),
                         "\n95% CI: ", round(ci_A[1], 3), "–", 
                         round(ci_A[3], 3)),
           size = 4) +
  labs(title = "Model A: Genetic-Only",
       x = "1 - Specificity", y = "Sensitivity") +
  coord_equal() +
  theme_minimal(base_size = 14)

# Model B ROC
p2b <- ggplot(data.frame(fpr = 1 - roc_B$specificities,
                        tpr = roc_B$sensitivities),
             aes(x = fpr, y = tpr)) +
  geom_line(color = "#a7c957", size = 1.2) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  annotate("text", x = 0.6, y = 0.2,
           label = paste0("AUC = ", round(auc_B, 3),
                         "\n95% CI: ", round(ci_B[1], 3), "–", 
                         round(ci_B[3], 3)),
           size = 4) +
  labs(title = "Model B: Clinical-Only",
       x = "1 - Specificity", y = "Sensitivity") +
  coord_equal() +
  theme_minimal(base_size = 14)

# Model C ROC
p2c <- ggplot(data.frame(fpr = 1 - roc_C$specificities,
                        tpr = roc_C$sensitivities),
             aes(x = fpr, y = tpr)) +
  geom_line(color = "#6a994e", size = 1.2) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  annotate("text", x = 0.6, y = 0.2,
           label = paste0("AUC = ", round(auc_C, 3),
                         "\n95% CI: ", round(ci_C[1], 3), "–", 
                         round(ci_C[3], 3)),
           size = 4) +
  labs(title = "Model C: Clinical-Genetic",
       x = "1 - Specificity", y = "Sensitivity") +
  coord_equal() +
  theme_minimal(base_size = 14)

# Combine plots
gridExtra::grid.arrange(p2a, p2b, p2c, ncol = 3)
Legend = AUC values (with 95% CIs) are annotated for each model. “Genetic Only” refers to AF centile as a continuous percentile predictor. “Clinical-Only” utilises clinical markers (systolic blood pressure ≥ 130 mmHg; P wave duration > 120 ms; P terminal force in V1 ≥ 4.0 mV·ms; left atrial volume index > 34 mL/m² and > 60 mL/m²; and PACs ≥ 100/24 h. “Clinical Genetic” combines Clinical markers) with the AF-PRS dichotomised at the top decile (≥ 90th percentile).

Legend = AUC values (with 95% CIs) are annotated for each model. “Genetic Only” refers to AF centile as a continuous percentile predictor. “Clinical-Only” utilises clinical markers (systolic blood pressure ≥ 130 mmHg; P wave duration > 120 ms; P terminal force in V1 ≥ 4.0 mV·ms; left atrial volume index > 34 mL/m² and > 60 mL/m²; and PACs ≥ 100/24 h. “Clinical Genetic” combines Clinical markers) with the AF-PRS dichotomised at the top decile (≥ 90th percentile).

10 Supplementary Tables

controls <- filter(NoCM, EICR == "Control")
athletes <- filter(NoCM, EICR == "Athlete")

# Helper function for age-adjusted p-values
calculate_age_adjusted_p <- function(data, var, group_var, is_categorical = FALSE) {
  if (!all(c(var, group_var, "Age") %in% names(data))) return(NA)
  clean_data <- data %>% filter(!is.na(.data[[var]]) & !is.na(.data[[group_var]]) & !is.na(Age))
  if (nrow(clean_data) < 10 || var == "Age") return(NA)
  
  tryCatch({
    if (is_categorical) {
      model <- glm(as.factor(clean_data[[var]]) ~ clean_data[[group_var]] + Age, 
                  data = clean_data, family = binomial())
    } else {
      model <- lm(as.numeric(clean_data[[var]]) ~ clean_data[[group_var]] + Age, 
                 data = clean_data)
    }
    coef_summary <- summary(model)$coefficients
    group_row <- grep(group_var, rownames(coef_summary))[1]
    if (is.na(group_row)) group_row <- 2
    return(coef_summary[group_row, 4])
  }, error = function(e) NA)
}

# Create comparison table
supp_table_athletes_controls <- data.frame(Variable = character(), stringsAsFactors = FALSE)
combined_data <- rbind(athletes %>% mutate(group = "Athletes"), controls %>% mutate(group = "Controls"))

for (var in all_variables) {
  if (!var %in% names(athletes) || !var %in% names(controls)) next
  
  is_cat <- var %in% categorical_vars
  is_nonnorm <- var %in% nonnormal_vars
  
  supp_table_athletes_controls <- rbind(supp_table_athletes_controls, data.frame(
    Variable = var,
    Athletes = format_stats(athletes, var, is_cat, is_nonnorm),
    Controls = format_stats(controls, var, is_cat, is_nonnorm),
    P_value = format_p_value(calculate_p_value(combined_data, var, "group", is_cat, is_nonnorm)),
    P_age_adj = format_p_value(calculate_age_adjusted_p(combined_data, var, "group", is_cat)),
    stringsAsFactors = FALSE
  ))
}

supp_table_athletes_controls$Category <- sapply(supp_table_athletes_controls$Variable, assign_category)
supp_table_athletes_controls$Variable <- sapply(supp_table_athletes_controls$Variable, clean_variable_names)

# Define correct order within each category
variable_order <- c(
  # Demographics
  "Age, years", "Male sex, n (%)", "BMI, kg/m²", "BSA, m²", "Average HR, bpm", "Minimum HR, bpm", 
  "Bradycardia burden, % (24h)", "Hypertension, n (%)", "SBP, mmHg", "DBP, mmHg", 
  "Dyslipidaemia, n (%)", "Diabetes, n (%)", "History of smoking, n (%)", "Alcohol, drinks/week",
  # Exercise Capacity
  "VO₂peak, mL/kg/min", "VO₂peak, L/min", "% of predicted exercise capacity", "Respiratory exchange ratio",
  "Total weekly MET-hours", "Years of endurance training",
  # Left ventricular parameters
  "LVEDV, mL", "LVESV, mL", "LVEF, %", "LV strain, %",
  # Left atrial parameters
  "LAESV, mL", "LAVi, mL/m²", "Mildly abnormal >34 mL/m², n (%)", "Severely abnormal >48 mL/m², n (%)",
  "Extremely abnormal >60 mL/m², n (%)", "LA:LV", "Reservoir strain, %", "Conduit strain, %", "Contractile strain, %",
  # Right Ventricular Parameters
  "RVEDV, mL", "RVESV, mL", "RVEF, %",
  # Doppler Parameters
  "E velocity, m/s", "A velocity, m/s", "E/A ratio", "E′ velocity, cm/s", "E/E' ratio",
  # Electrocardiographic
  "P wave duration, ms", "P-wave duration >120 ms, n (%)", "P wave axis, °", "PTFV₁, mV·ms",
  "PTFV₁ ≥4 mV·ms, n (%)", "Advanced IAB, n (%)", "PACs per 24 h", "≥100 PACs per 24 h, n (%)",
  # Biochemical
  "BNP, pg/mL", "cTnI, ng/L"
)

# Order the table by the defined sequence
supp_table_athletes_controls$Variable <- factor(supp_table_athletes_controls$Variable, 
                                                 levels = variable_order)
supp_table_athletes_controls <- supp_table_athletes_controls %>% arrange(Variable)
supp_table_athletes_controls$Variable <- as.character(supp_table_athletes_controls$Variable)

# Split and display with correct ordering
table1_cats <- c("Demographics", "Exercise Capacity")

# For table 2, create nested structure for echocardiographic parameters
supp_1_cat <- supp_table_athletes_controls %>% 
  filter(Category %in% table1_cats) %>%
  mutate(Category = factor(Category, levels = table1_cats))

# Create hierarchical grouping for Table 2
supp_2_cat <- supp_table_athletes_controls %>% 
  filter(Category %in% c("Left ventricular parameters", "Left atrial parameters", 
                         "Right Ventricular Parameters", "Doppler Parameters", 
                         "Electrocardiographic", "Biochemical"))

# Add parent category for nested structure
supp_2_cat <- supp_2_cat %>%
  mutate(ParentCategory = case_when(
    Category %in% c("Left ventricular parameters", "Left atrial parameters") ~ "Echocardiographic",
    TRUE ~ Category
  ))

supp_1 <- supp_1_cat %>% select(-Category)
supp_2 <- supp_2_cat %>% select(-Category, -ParentCategory)

colnames(supp_1) <- c("Characteristic", paste0("Athletes (n=", nrow(athletes), ")"), 
                      paste0("Controls (n=", nrow(controls), ")"), "p-value", "Age-adjusted p value")
colnames(supp_2) <- colnames(supp_1)

cat("\n## Supplementary Table 1\n\n")

10.1 Supplementary Table 1

print(kable(supp_1, caption = "Supplementary Table 1. Baseline Demographics: Endurance Athletes versus Non-athletic Controls (n=450)",
      align = c("l", "c", "c", "c", "c")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = TRUE, font_size = 10) %>%
  pack_rows(index = table(supp_1_cat$Category)[table1_cats]) %>%
  add_footnote(c("Data are presented as mean ± standard deviation, median (interquartile range), or n (%). P values were calculated using unpaired t-tests for normally distributed continuous variables, Mann-Whitney U tests for non-normally distributed continuous variables, and chi-square tests for categorical variables. Age-adjusted p values were calculated using multivariable regression models with age as a covariate to account for age differences between groups.\nAbbreviations: BMI, body mass index; BSA, body surface area; DBP, diastolic blood pressure; HR, heart rate; SBP, systolic blood pressure; VO₂peak, peak oxygen uptake."),
               notation = "none"))
Supplementary Table 1. Baseline Demographics: Endurance Athletes versus Non-athletic Controls (n=450)
Characteristic Athletes (n=353) Controls (n=97) p-value Age-adjusted p value
Demographics
Age, years 41 [22-60] 34 [22-54] 0.315
Male sex, n (%) 262 (74.2%) 62 (63.9%) 0.061 0.071
BMI, kg/m² 23.5 [21.9-25.5] 24.5 [22.3-27] 0.023 <0.001
BSA, m² 1.9 [1.8-2.1] 1.9 [1.7-2] 0.123 0.122
Average HR, bpm 66.4 ± 9.1 73.6 ± 8 <0.001 <0.001
Minimum HR, bpm 44 [39-49] 51 [45-55] <0.001 <0.001
Bradycardia burden, % (24h) 6.4 [0.4-26.9] 0 [0-3] <0.001 <0.001
Hypertension, n (%) 31 (8.9%) 1 (1.6%) 0.067 0.847
SBP, mmHg 126 [117-135] 121 [115-129] 0.006 0.106
DBP, mmHg 69 [62-77.2] 69 [65-76] 0.656 0.045
Dyslipidaemia, n (%) 45 (12.9%) 0 (0%) <0.001 0.985
Diabetes, n (%) 2 (0.6%) 0 (0%) 1.000 0.998
History of smoking, n (%) 47 (13.5%) 10 (14.5%) 0.972 0.021
Alcohol, drinks/week 1 [0-6] 3 [3-3] 0.531 0.467
Exercise Capacity
VO₂peak, mL/kg/min 48.9 ± 14.3 35.8 ± 9.2 <0.001 <0.001
VO₂peak, L/min 3669.5 ± 997.4 2758.8 ± 865.9 <0.001 <0.001
% of predicted exercise capacity 126.4 ± 20.8 97.1 ± 22 <0.001 <0.001
Respiratory exchange ratio 1.3 ± 0.1 1.3 ± 0.1 <0.001 <0.001
Total weekly MET-hours 81 [49-104]
Years of endurance training 20 [10-30]
Data are presented as mean ± standard deviation, median (interquartile range), or n (%). P values were calculated using unpaired t-tests for normally distributed continuous variables, Mann-Whitney U tests for non-normally distributed continuous variables, and chi-square tests for categorical variables. Age-adjusted p values were calculated using multivariable regression models with age as a covariate to account for age differences between groups.
Abbreviations: BMI, body mass index; BSA, body surface area; DBP, diastolic blood pressure; HR, heart rate; SBP, systolic blood pressure; VO₂peak, peak oxygen uptake.
cat("\n## Supplementary Table 2\n\n")

10.2 Supplementary Table 2

# Create pack_rows structure for Table 2 - all categories at same level
# Use the same approach as Table 1 for consistency

# Define the order of categories for Table 2
table2_cat_order <- c("Left ventricular parameters", "Left atrial parameters", 
                      "Right Ventricular Parameters", "Doppler Parameters", 
                      "Electrocardiographic", "Biochemical")

# Ensure categories are in the right order
supp_2_cat <- supp_2_cat %>%
  mutate(Category = factor(Category, levels = table2_cat_order))

table2_output <- kable(supp_2, caption = "Supplementary Table 2. Cardiac Parameters: Athletes versus Controls (n=450)",
      align = c("l", "c", "c", "c", "c")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = TRUE, font_size = 10) %>%
  pack_rows(index = table(supp_2_cat$Category)[table2_cat_order])

print(table2_output %>%
  add_footnote(c("Data are presented as mean ± standard deviation, median (interquartile range), or n (%). Statistical comparisons as per Supplementary Table 1.\nAbbreviations: BNP, brain natriuretic peptide; cTnI, cardiac troponin I; E/A, mitral inflow early/late diastolic velocity ratio; E′, mitral annular early diastolic velocity; EDV, end-diastolic volume; ESV, end-systolic volume; IAB, interatrial block; LA, left atrium; LAVi, left atrial volume index; LA:LV, left atrial to left ventricular end-diastolic volume ratio; LV, left ventricle; LVEF, left ventricular ejection fraction; PAC, premature atrial contraction; PTFV₁, P-wave terminal force in lead V1; RV, right ventricle; RVEF, right ventricular ejection fraction."), 
               notation = "none"))
Supplementary Table 2. Cardiac Parameters: Athletes versus Controls (n=450)
Characteristic Athletes (n=353) Controls (n=97) p-value Age-adjusted p value
Left ventricular parameters
LVEDV, mL 166.9 ± 45.3 118.2 ± 36 <0.001 <0.001
LVESV, mL 68.3 ± 22.7 50.1 ± 17 <0.001 <0.001
LVEF, % 58.1 ± 5.5 57.4 ± 4.6 0.639 0.714
LV strain, % -18.9 ± 2.3 -19 ± 2 0.662 0.796
Left atrial parameters
LAESV, mL 83 [69-98.2] 55.4 [47.2-68.5] <0.001 <0.001
LAVi, mL/m² 42.9 [35.7-49.8] 29.7 [25.4-35.7] <0.001 <0.001
Mildly abnormal >34 mL/m², n (%) 283 (80.2%) 27 (28.7%) <0.001 <0.001
Severely abnormal >48 mL/m², n (%) 107 (30.3%) 4 (4.3%) <0.001 <0.001
Extremely abnormal >60 mL/m², n (%) 26 (7.4%) 1 (1.1%) 0.025 0.071
LA:LV 0.49 [0.42-0.62] 0.48 [0.41-0.58] 0.508 0.857
Reservoir strain, % 30.4 ± 7.4 34 ± 6.3 <0.001 <0.001
Conduit strain, % 19.3 ± 6.9 21 ± 5.7 0.021 0.385
Contractile strain, % 11.2 ± 4.7 13 ± 3.8 <0.001 <0.001
Right Ventricular Parameters
RVEDV, mL 150.2 ± 47.2 90.9 ± 35.7 <0.001 <0.001
RVESV, mL 76.7 ± 25.8 51.7 ± 20.9 <0.001 <0.001
RVEF, % 48.4 ± 8.9 42.6 ± 8.4 <0.001 <0.001
Doppler Parameters
E velocity, m/s 0.7 ± 0.2 0.7 ± 0.2 0.452 0.824
A velocity, m/s 0.4 ± 0.1 0.5 ± 0.1 0.032 <0.001
E/A ratio 1.8 ± 1 1.7 ± 0.7 0.089 0.010
E′ velocity, cm/s 10.9 ± 3.4 10.8 ± 3.4 0.962 0.021
E/E’ ratio 5.8 ± 1.7 6.3 ± 2.1 0.032 <0.001
Electrocardiographic
P wave duration, ms 114 [102.5-125] 107.5 [102-112.2] <0.001 <0.001
P-wave duration >120 ms, n (%) 125 (36.9%) 12 (12.6%) <0.001 <0.001
P wave axis, ° 50 ± 26.1 54.3 ± 31 0.237 0.178
PTFV₁, mV·ms 2 [1-3.2] 1.8 [0.8-3] 0.987 0.944
PTFV₁ ≥4 mV·ms, n (%) 44 (13%) 7 (8.6%) 0.376 0.344
Advanced IAB, n (%) 28 (8.4%) 0 (0%) 0.022 0.989
PACs per 24 h 20 [7-91.5] 6 [2-24.5] <0.001 0.345
≥100 PACs per 24 h, n (%) 76 (23%) 7 (8.3%) 0.004 0.047
Biochemical
BNP, pg/mL 15.7 [9.6-29.2] 10 [5.5-13.4] <0.001 0.465
cTnI, ng/L 3 [2-8] 2 [1-3] <0.001 0.266
Data are presented as mean ± standard deviation, median (interquartile range), or n (%). Statistical comparisons as per Supplementary Table 1.
Abbreviations: BNP, brain natriuretic peptide; cTnI, cardiac troponin I; E/A, mitral inflow early/late diastolic velocity ratio; E′, mitral annular early diastolic velocity; EDV, end-diastolic volume; ESV, end-systolic volume; IAB, interatrial block; LA, left atrium; LAVi, left atrial volume index; LA:LV, left atrial to left ventricular end-diastolic volume ratio; LV, left ventricle; LVEF, left ventricular ejection fraction; PAC, premature atrial contraction; PTFV₁, P-wave terminal force in lead V1; RV, right ventricle; RVEF, right ventricular ejection fraction.
# Define risk factors and their thresholds based on your document
risk_factors <- list(
  "Age" = list(var = "Age", threshold = 45, type = "continuous", cutoff_type = ">"),
  "SBP" = list(var = "rest_SBP", threshold = 130, type = "continuous", cutoff_type = "≥"),
  "LAVi_normal" = list(var = "LAVi", threshold = c(16, 34), type = "range", label = "Normal Range (16-34)"),
  "LAVi_severe" = list(var = "LAVi", threshold = 48, type = "continuous", cutoff_type = "≥", label = "Severely dilated (≥48)"),
  "LAVi_extreme" = list(var = "LAVi", threshold = 60, type = "continuous", cutoff_type = "≥", label = "Extremely dilated (≥60)"),
  "P_wave" = list(var = "P wave Duration", threshold = 120, type = "continuous", cutoff_type = ">"),
  "PTFV1" = list(var = "PTFV1 (ms*mV)", threshold = 4, type = "continuous", cutoff_type = "≥"),
  "PACs" = list(var = "ECTOPIES LS", threshold = 100, type = "continuous", cutoff_type = "≥"),
  "PRS_Q4" = list(var = "highest quartile", threshold = 1, type = "binary"),
  "PRS_D10" = list(var = "highest decile", threshold = 1, type = "binary"),
  "PRS_P98" = list(var = "98th percentile", threshold = 1, type = "binary")
)

# Function to calculate OR and CI
calculate_or_ci <- function(data, var_info) {
  clean_data <- data %>% filter(!is.na(AF) & !is.na(.data[[var_info$var]]))
  
  if (nrow(clean_data) < 10) return(NULL)
  
  tryCatch({
    # Create binary exposure variable based on threshold
    if (var_info$type == "range") {
      clean_data$exposure <- as.numeric(clean_data[[var_info$var]] >= var_info$threshold[1] & 
                                        clean_data[[var_info$var]] <= var_info$threshold[2])
    } else if (var_info$type == "binary") {
      clean_data$exposure <- as.numeric(clean_data[[var_info$var]] == var_info$threshold)
    } else {
      if (var_info$cutoff_type == "≥" | var_info$cutoff_type == ">") {
        clean_data$exposure <- as.numeric(clean_data[[var_info$var]] >= var_info$threshold)
      } else {
        clean_data$exposure <- as.numeric(clean_data[[var_info$var]] > var_info$threshold)
      }
    }
    
    # Fit logistic regression
    model <- glm(AF ~ exposure, data = clean_data, family = binomial())
    
    # Get results
    results <- broom::tidy(model, conf.int = TRUE, exponentiate = TRUE) %>%
      filter(term == "exposure")
    
    # Count cases
    af_exposed <- sum(clean_data$AF == 1 & clean_data$exposure == 1, na.rm = TRUE)
    noaf_exposed <- sum(clean_data$AF == 0 & clean_data$exposure == 1, na.rm = TRUE)
    
    return(list(
      n_af = af_exposed,
      pct_af = round(100 * af_exposed / sum(clean_data$AF == 1, na.rm = TRUE), 0),
      n_noaf = noaf_exposed,
      pct_noaf = round(100 * noaf_exposed / sum(clean_data$AF == 0, na.rm = TRUE), 0),
      or = results$estimate,
      ci_lower = results$conf.low,
      ci_upper = results$conf.high,
      p_value = results$p.value
    ))
  }, error = function(e) NULL)
}

# Build table
supp_table_3 <- data.frame(
  Criterion = character(),
  Threshold = character(),
  AF = character(),
  NoAF = character(),
  OR_CI = character(),
  P_value = character(),
  stringsAsFactors = FALSE
)

# Process each risk factor
for (rf_name in names(risk_factors)) {
  rf <- risk_factors[[rf_name]]
  
  if (!rf$var %in% names(Athletes_NoCM)) next
  
  result <- calculate_or_ci(Athletes_NoCM, rf)
  
  if (!is.null(result)) {
    # Format criterion name
    criterion <- case_when(
      rf_name == "Age" ~ "Age",
      rf_name == "SBP" ~ "Systolic Blood Pressure",
      rf_name == "LAVi_normal" ~ "Left Atrial Volume Index",
      rf_name == "LAVi_severe" ~ "Left Atrial Volume Index",
      rf_name == "LAVi_extreme" ~ "Left Atrial Volume Index",
      rf_name == "P_wave" ~ "P-wave duration",
      rf_name == "PTFV1" ~ "PTFV₁",
      rf_name == "PACs" ~ "Premature Atrial Contractions",
      rf_name == "PRS_Q4" ~ "AF-PRS",
      rf_name == "PRS_D10" ~ "AF-PRS",
      rf_name == "PRS_P98" ~ "AF-PRS",
      TRUE ~ rf_name
    )
    
    # Format threshold
    threshold <- if (!is.null(rf$label)) {
      rf$label
    } else if (rf$type == "range") {
      paste0(rf$threshold[1], "-", rf$threshold[2], " mL/m²")
    } else if (rf_name == "Age") {
      ">45 years"
    } else if (rf_name == "SBP") {
      "≥130 mmHg"
    } else if (rf_name == "P_wave") {
      ">120 ms"
    } else if (rf_name == "PTFV1") {
      "≥4 mV·ms"
    } else if (rf_name == "PACs") {
      "≥100/24h"
    } else if (rf_name == "PRS_Q4") {
      "Highest Quartile (Q4)"
    } else if (rf_name == "PRS_D10") {
      "Highest Decile (D10)"
    } else if (rf_name == "PRS_P98") {
      "98th Percentile"
    } else {
      paste0(rf$cutoff_type, rf$threshold)
    }
    
    supp_table_3 <- rbind(supp_table_3, data.frame(
      Criterion = criterion,
      Threshold = threshold,
      AF = paste0(result$n_af, " (", result$pct_af, "%)"),
      NoAF = paste0(result$n_noaf, " (", result$pct_noaf, "%)"),
      OR_CI = paste0(sprintf("%.1f", result$or), " (", 
                    sprintf("%.1f", result$ci_lower), "–",
                    sprintf("%.1f", result$ci_upper), ")"),
      P_value = format_p_value(result$p_value),
      stringsAsFactors = FALSE
    ))
  }
}

# Rename columns
colnames(supp_table_3) <- c(
  "Criterion",
  "Threshold",
  paste0("AF (n=", sum(Athletes_NoCM$AF == 1, na.rm = TRUE), ")"),
  paste0("No AF (n=", sum(Athletes_NoCM$AF == 0, na.rm = TRUE), ")"),
  "Odds Ratio (95% CI)",
  "p value"
)

cat("\n## Supplementary Table 3\n\n")

10.3 Supplementary Table 3

print(kable(supp_table_3,
      caption = "Supplementary Table 3. Univariable Risk Factors for Atrial Fibrillation Development",
      align = c("l", "l", "c", "c", "c", "c")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE, font_size = 10) %>%
  add_footnote(c("Legend = Data are presented as n (%). Odds ratios with 95% confidence intervals represent the increased odds of atrial fibrillation for athletes meeting each threshold criterion compared to those below threshold. P values were calculated using logistic regression. Abbreviations: AF, atrial fibrillation; CI, confidence interval; LAVi, left atrial volume index; OR, odds ratio; PAC, premature atrial contraction; PRS, polygenic risk score; PTFV₁, P-wave terminal force in lead V1; Q4, fourth quartile; D10, tenth decile."), notation = "none"))
Supplementary Table 3. Univariable Risk Factors for Atrial Fibrillation Development
Criterion Threshold AF (n=96) No AF (n=257) Odds Ratio (95% CI) p value
Age >45 years 74 (77%) 85 (33%) 6.8 (4.0–11.9) <0.001
Systolic Blood Pressure ≥130 mmHg 58 (60%) 95 (37%) 2.6 (1.6–4.2) <0.001
Left Atrial Volume Index Normal Range (16-34) 9 (9%) 62 (24%) 0.3 (0.1–0.7) 0.003
Left Atrial Volume Index Severely dilated (≥48) 41 (43%) 66 (26%) 2.2 (1.3–3.5) 0.002
Left Atrial Volume Index Extremely dilated (≥60) 18 (19%) 8 (3%) 7.2 (3.1–18.1) <0.001
P-wave duration >120 ms 51 (62%) 74 (29%) 4.1 (2.4–6.9) <0.001
PTFV₁ ≥4 mV·ms 21 (26%) 23 (9%) 3.5 (1.8–6.7) <0.001
Premature Atrial Contractions ≥100/24h 45 (56%) 33 (13%) 8.3 (4.7–14.9) <0.001
AF-PRS Highest Quartile (Q4) 34 (35%) 60 (23%) 1.8 (1.1–3.0) 0.023
AF-PRS Highest Decile (D10) 22 (23%) 26 (10%) 2.6 (1.4–4.9) 0.002
AF-PRS 98th Percentile 8 (9%) 6 (2%) 3.9 (1.3–12.0) 0.015
Legend = Data are presented as n (%). Odds ratios with 95% confidence intervals represent the increased odds of atrial fibrillation for athletes meeting each threshold criterion compared to those below threshold. P values were calculated using logistic regression. Abbreviations: AF, atrial fibrillation; CI, confidence interval; LAVi, left atrial volume index; OR, odds ratio; PAC, premature atrial contraction; PRS, polygenic risk score; PTFV₁, P-wave terminal force in lead V1; Q4, fourth quartile; D10, tenth decile.
# Use the already-calculated Model C dataset from the exclusions section
Athletes_complete <- Athletes_model_C

cat("Complete cases for multivariable model:", nrow(Athletes_complete), "\n")

Complete cases for multivariable model: 328

cat("AF cases:", sum(Athletes_complete$AF), "\n\n")

AF cases: 77

if (nrow(Athletes_complete) >= 50) {
  # Fit multivariable model
  mv_model <- glm(AF ~ Age_decades + High_normal_BP + LAVi_34 + LAVi_60 + 
                    P_wave_120 + PTFV1_high + PAC_100 + highest_decile,
                  data = Athletes_complete,
                  family = binomial())
  
  # Get coefficients
  mv_coefs <- broom::tidy(mv_model, conf.int = TRUE, exponentiate = FALSE) %>%
    filter(term != "(Intercept)")
  
  mv_or <- broom::tidy(mv_model, conf.int = TRUE, exponentiate = TRUE) %>%
    filter(term != "(Intercept)")
  
  # Calculate points (from beta coefficients)
  beta_values <- mv_coefs$estimate
  min_beta <- min(abs(beta_values))
  points <- round(abs(beta_values) / min_beta)
  
  # Create summary table
  supp_table_4 <- data.frame(
    Variable = c("Age (per 10 years)", "SBP ≥130 mmHg", "LAVi >34 mL/m²",
                 "LAVi >60 mL/m²", "P-wave >120 ms", "PTFV₁ ≥4 mV·ms",
                 "PACs ≥100/24h", "AF-PRS top decile"),
    Beta = sprintf("%.3f", mv_coefs$estimate),
    SE = sprintf("%.3f", mv_coefs$std.error),
    OR = sprintf("%.2f", mv_or$estimate),
    CI_lower = sprintf("%.2f", mv_or$conf.low),
    CI_upper = sprintf("%.2f", mv_or$conf.high),
    P_value = sapply(mv_coefs$p.value, format_p_value),
    Points = points,
    stringsAsFactors = FALSE
  )
  
  colnames(supp_table_4) <- c(
    "Variable",
    "β (coefficient)",
    "SE",
    "OR",
    "CI lower",
    "CI upper",
    "p value",
    "Points Assigned"
  )
  
  cat("\n## Supplementary Table 4\n\n")
  print(kable(supp_table_4,
        caption = paste0("Supplementary Table 4. Multivariable Logistic Regression Model Predicting Atrial Fibrillation in Endurance Athletes (n=", nrow(Athletes_complete), ")"),
        align = c("l", "c", "c", "c", "c", "c", "c", "c")) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                  full_width = TRUE, font_size = 10) %>%
    add_footnote(c("Extended model incorporating genetic risk assessment. Statistical presentation as per Table 3. Abbreviations: AF, atrial fibrillation; CI, confidence interval; LAVi, left atrial volume index; OR, odds ratio; PAC, premature atrial contraction; PRS, polygenic risk score; PTFV₁, P-wave terminal force in lead V1; SBP, systolic blood pressure."),
               notation = "none"))
  
  # Print model performance
  cat("\n**Model Performance:**\n")
  cat("- AIC:", round(AIC(mv_model), 1), "\n")
  cat("- Deviance:", round(deviance(mv_model), 1), "\n")
  
} else {
  cat("Insufficient complete cases for multivariable modelling\n")
}

10.4 Supplementary Table 4

Supplementary Table 4. Multivariable Logistic Regression Model Predicting Atrial Fibrillation in Endurance Athletes (n=328)
Variable β (coefficient) SE OR CI lower CI upper p value Points Assigned
Age (per 10 years) 0.247 0.105 1.28 1.05 1.58 0.018 1
SBP ≥130 mmHg 0.970 0.341 2.64 1.36 5.23 0.004 4
LAVi >34 mL/m² 0.857 0.465 2.36 0.99 6.24 0.065 3
LAVi >60 mL/m² 0.983 0.658 2.67 0.72 9.68 0.135 4
P-wave >120 ms 0.970 0.354 2.64 1.32 5.33 0.006 4
PTFV₁ ≥4 mV·ms 0.731 0.458 2.08 0.83 5.08 0.111 3
PACs ≥100/24h 1.555 0.365 4.74 2.33 9.78 <0.001 6
AF-PRS top decile 0.948 0.459 2.58 1.04 6.33 0.039 4
Extended model incorporating genetic risk assessment. Statistical presentation as per Table 3. Abbreviations: AF, atrial fibrillation; CI, confidence interval; LAVi, left atrial volume index; OR, odds ratio; PAC, premature atrial contraction; PRS, polygenic risk score; PTFV₁, P-wave terminal force in lead V1; SBP, systolic blood pressure.

Model Performance: - AIC: 266.9 - Deviance: 248.9

# Find matching data
matched_cases <- matched_controls <- data.frame()
for (dataset in list(Athletes_NoCM, NoCM, AM_paper)) {
  if ("Matching" %in% names(dataset)) {
    temp <- dataset %>% filter(!is.na(Matching) & Matching %in% c("Case", "Control"))
    if (nrow(temp) > 0) {
      matched_cases <- temp %>% filter(Matching == "Case")
      matched_controls <- temp %>% filter(Matching == "Control")
      break
    }
  }
}

if (nrow(matched_cases) > 0 && nrow(matched_controls) > 0) {
  supp_matched <- data.frame(Variable = character(), stringsAsFactors = FALSE)
  matched_combined <- rbind(matched_cases %>% mutate(group = "AF"), matched_controls %>% mutate(group = "No AF"))
  
  for (var in all_variables) {
    if (!var %in% names(matched_cases) || !var %in% names(matched_controls)) next
    is_cat <- var %in% categorical_vars
    is_nonnorm <- var %in% nonnormal_vars
    
    supp_matched <- rbind(supp_matched, data.frame(
      Variable = var,
      AF = format_stats(matched_cases, var, is_cat, is_nonnorm),
      NoAF = format_stats(matched_controls, var, is_cat, is_nonnorm),
      P = format_p_value(calculate_p_value(matched_combined, var, "group", is_cat, is_nonnorm)),
      stringsAsFactors = FALSE
    ))
  }
  
  supp_matched$Category <- sapply(supp_matched$Variable, assign_category)
  supp_matched$Variable <- sapply(supp_matched$Variable, clean_variable_names)
  
  # Order variables the same way as Tables 1 and 2
  supp_matched$Variable <- factor(supp_matched$Variable, levels = variable_order)
  supp_matched <- supp_matched %>% arrange(Variable)
  supp_matched$Variable <- as.character(supp_matched$Variable)
  
  # Table 5: Demographics and Exercise Capacity
  table5_cats <- c("Demographics", "Exercise Capacity")
  supp_5_cat <- supp_matched %>% 
    filter(Category %in% table5_cats) %>%
    mutate(Category = factor(Category, levels = table5_cats))
  
  # Table 6: Cardiac Parameters with same structure as Table 2
  table6_cat_order <- c("Left ventricular parameters", "Left atrial parameters", 
                        "Right Ventricular Parameters", "Doppler Parameters", 
                        "Electrocardiographic", "Biochemical")
  
  supp_6_cat <- supp_matched %>% 
    filter(Category %in% table6_cat_order) %>%
    mutate(Category = factor(Category, levels = table6_cat_order))
  
  supp_5 <- supp_5_cat %>% select(-Category)
  supp_6 <- supp_6_cat %>% select(-Category)
  
  colnames(supp_5) <- c("Characteristic", paste0("AF (n=", nrow(matched_cases), ")"), 
                        paste0("No AF (n=", nrow(matched_controls), ")"), "p value")
  colnames(supp_6) <- colnames(supp_5)
  
  cat("\n## Supplementary Table 5\n\n")
  print(kable(supp_5, caption = "Supplementary Table 5. Age- and Sex-Matched Analysis: Baseline Characteristics (n=160)",
        align = c("l", "c", "c", "c")) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = TRUE, font_size = 10) %>%
    pack_rows(index = table(supp_5_cat$Category)[table5_cats]) %>%
    add_footnote(c("Data presentation and statistical methods as per Supplementary Table 1. Cases and controls were matched 1:1 on age (±5 years) and sex.\nAbbreviations: As per Supplementary Table 1; AF, atrial fibrillation."), 
                 notation = "none"))
  
  cat("\n## Supplementary Table 6\n\n")
  print(kable(supp_6, caption = "Supplementary Table 6. Age- and Sex-Matched Analysis: Cardiac Parameters (n=160)",
        align = c("l", "c", "c", "c")) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = TRUE, font_size = 10) %>%
    pack_rows(index = table(supp_6_cat$Category)[table6_cat_order]) %>%
    add_footnote(c("Data presentation, statistical methods and abbreviations as per Supplementary Table 2."), 
                 notation = "none"))
} else {
  cat("Warning: No matched data available\n")
}

10.5 Supplementary Table 5

Supplementary Table 5. Age- and Sex-Matched Analysis: Baseline Characteristics (n=160)
Characteristic AF (n=80) No AF (n=80) p value
Demographics
Age, years 56.5 [43.8-64.2] 55 [43-65] 0.868
Male sex, n (%) 73 (91.2%) 73 (91.2%) 1.000
BMI, kg/m² 24.5 [23.2-25.7] 24.6 [22.9-26.7] 0.929
BSA, m² 2 [1.9-2.1] 2.1 [1.9-2.2] 0.328
Average HR, bpm 63.5 ± 6.8 65.9 ± 7.1 0.048
Minimum HR, bpm 45 [40-48.5] 46 [41-49] 0.322
Bradycardia burden, % (24h) 5.6 [0.4-23.5] 2.5 [0.3-21.8] 0.474
Hypertension, n (%) 9 (11.5%) 14 (17.5%) 0.403
SBP, mmHg 131 [124-140.5] 130 [122-137] 0.581
DBP, mmHg 74.5 [69-81] 74 [65-79.2] 0.384
Dyslipidaemia, n (%) 17 (21.8%) 20 (25%) 0.774
Diabetes, n (%) 2 (2.6%) 0 (0%) 0.242
History of smoking, n (%) 16 (20.5%) 13 (16.2%) 0.627
Alcohol, drinks/week 3 [0-7.8] 5.5 [1.8-12] 0.057
Exercise Capacity
VO₂peak, mL/kg/min 43.9 ± 10.2 44.1 ± 14.3 0.921
VO₂peak, L/min 3570.1 ± 888.2 3590.7 ± 961.9 0.888
% of predicted exercise capacity 124.3 ± 17.9 123.8 ± 23.2 0.875
Respiratory exchange ratio 1.3 ± 0.1 1.3 ± 0.1 0.179
Total weekly MET-hours 78 [56-94] 71 [44-94] 0.391
Years of endurance training 26 [16.5-36] 22 [15-32] 0.421
Data presentation and statistical methods as per Supplementary Table 1. Cases and controls were matched 1:1 on age (±5 years) and sex.
Abbreviations: As per Supplementary Table 1; AF, atrial fibrillation.

10.6 Supplementary Table 6

Supplementary Table 6. Age- and Sex-Matched Analysis: Cardiac Parameters (n=160)
Characteristic AF (n=80) No AF (n=80) p value
Left ventricular parameters
LVEDV, mL 162.7 ± 39.8 166.4 ± 44.1 0.585
LVESV, mL 65.8 ± 17.3 69 ± 21.7 0.303
LVEF, % 58.4 ± 4.8 57.4 ± 5.3 0.213
LV strain, % -18.5 ± 2.2 -18.8 ± 2.1 0.387
Left atrial parameters
LAESV, mL 89.5 [76.8-111.8] 84 [68.5-97] 0.010
LAVi, mL/m² 45.8 [38.1-54.6] 41.9 [32.7-47.9] 0.003
Mildly abnormal >34 mL/m², n (%) 72 (90%) 57 (71.2%) 0.005
Severely abnormal >48 mL/m², n (%) 32 (40%) 20 (25%) 0.063
Extremely abnormal >60 mL/m², n (%) 12 (15%) 1 (1.2%) 0.002
LA:LV 0.56 [0.48-0.71] 0.51 [0.41-0.62] 0.010
Reservoir strain, % 27.1 ± 8 29.5 ± 5.9 0.047
Conduit strain, % 16 ± 6.3 16.3 ± 6 0.763
Contractile strain, % 11.2 ± 4.7 13.4 ± 5.1 0.008
Right Ventricular Parameters
RVEDV, mL 147.7 ± 43.5 144 ± 44 0.595
RVESV, mL 80.3 ± 27.2 76.9 ± 25.9 0.433
RVEF, % 45.7 ± 8.5 46.3 ± 8.6 0.688
Doppler Parameters
E velocity, m/s 0.6 ± 0.2 0.6 ± 0.2 0.290
A velocity, m/s 0.4 ± 0.1 0.5 ± 0.1 0.194
E/A ratio 1.6 ± 1.4 1.4 ± 0.6 0.271
E′ velocity, cm/s 9.6 ± 2.8 8.7 ± 2.4 0.034
E/E’ ratio 5.9 ± 1.9 6.1 ± 1.7 0.680
Electrocardiographic
P wave duration, ms 124 [112-131] 120 [109.8-128] 0.246
P-wave duration >120 ms, n (%) 45 (60.8%) 41 (51.2%) 0.302
P wave axis, ° 50.3 ± 33.8 50 ± 22.9 0.953
PTFV₁, mV·ms 2.4 [1.2-4.1] 2 [0.9-3] 0.020
PTFV₁ ≥4 mV·ms, n (%) 21 (28.4%) 6 (7.5%) 0.001
Advanced IAB, n (%) 11 (15.3%) 13 (16.2%) 1.000
PACs per 24 h 107 [20-480] 25.5 [9-73.5] <0.001
≥100 PACs per 24 h, n (%) 38 (54.3%) 16 (20.5%) <0.001
Biochemical
BNP, pg/mL 22.5 [11.2-52.8] 15.8 [7.3-29.1] 0.005
cTnI, ng/L 4 [2-7] 3 [2-5] 0.012
Data presentation, statistical methods and abbreviations as per Supplementary Table 2.

For Tables 7 & 8 (Genetic Analysis):

# Filter extreme PRS groups
high_prs <- Athletes_NoCM %>% filter(!is.na(`AF decile`) & `AF decile` == 10)
low_prs <- Athletes_NoCM %>% filter(!is.na(`AF decile`) & `AF decile` == 1)

if (nrow(high_prs) > 0 && nrow(low_prs) > 0) {
  
  # Build comparison table
  supp_prs <- data.frame(Variable = character(), stringsAsFactors = FALSE)
  prs_combined <- rbind(
    high_prs %>% mutate(PRS_group = "High"), 
    low_prs %>% mutate(PRS_group = "Low")
  )
  
  for (var in all_variables) {
    if (!var %in% names(high_prs) || !var %in% names(low_prs)) next
    
    is_cat <- var %in% categorical_vars
    is_nonnorm <- var %in% nonnormal_vars
    
    supp_prs <- rbind(supp_prs, data.frame(
      Variable = var,
      High = format_stats(high_prs, var, is_cat, is_nonnorm),
      Low = format_stats(low_prs, var, is_cat, is_nonnorm),
      P = format_p_value(calculate_p_value(prs_combined, var, "PRS_group", is_cat, is_nonnorm)),
      stringsAsFactors = FALSE
    ))
  }
  
  # Assign categories and clean names
  supp_prs$Category <- sapply(supp_prs$Variable, assign_category)
  supp_prs$Variable <- sapply(supp_prs$Variable, clean_variable_names)
  
  # Order variables the same way as other tables
  supp_prs$Variable <- factor(supp_prs$Variable, levels = variable_order)
  supp_prs <- supp_prs %>% arrange(Variable)
  supp_prs$Variable <- as.character(supp_prs$Variable)
  
  # Table 7: Demographics and Exercise Capacity
  table7_cats <- c("Demographics", "Exercise Capacity")
  supp_7_cat <- supp_prs %>% 
    filter(Category %in% table7_cats) %>%
    mutate(Category = factor(Category, levels = table7_cats))
  
  # Table 8: Cardiac Parameters with same structure as Tables 2 and 6
  table8_cat_order <- c("Left ventricular parameters", "Left atrial parameters",
                        "Right Ventricular Parameters", "Doppler Parameters",
                        "Electrocardiographic", "Biochemical")
  
  supp_8_cat <- supp_prs %>% 
    filter(Category %in% table8_cat_order) %>%
    mutate(Category = factor(Category, levels = table8_cat_order))
  
  # Remove category column for display
  supp_7 <- supp_7_cat %>% select(-Category)
  supp_8 <- supp_8_cat %>% select(-Category)
  
  # Rename columns
  colnames(supp_7) <- c("Characteristic", 
                        paste0("AF-PRS ≥90th percentile (n=", nrow(high_prs), ")"),
                        paste0("AF-PRS <10th percentile (n=", nrow(low_prs), ")"), 
                        "p value")
  colnames(supp_8) <- colnames(supp_7)
  
  # Display tables
  cat("\n## Supplementary Table 7\n\n")
  print(kable(supp_7, 
              caption = "Supplementary Table 7. Extreme Genetic Risk Analysis: Baseline Characteristics",
              align = c("l", "c", "c", "c")) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                  full_width = TRUE, font_size = 10) %>%
    pack_rows(index = table(supp_7_cat$Category)[table7_cats]) %>%
    add_footnote(c("Comparison of athletes in highest (≥90th percentile) versus lowest (<10th percentile) deciles of AF-PRS. Data presentation and statistical methods as per Supplementary Table 1.\nAbbreviations: As per Supplementary Table 1; AF, atrial fibrillation; PRS, polygenic risk score."),
                 notation = "none"))
  
  cat("\n## Supplementary Table 8\n\n")
  print(kable(supp_8, 
              caption = "Supplementary Table 8. Extreme Genetic Risk Analysis: Cardiac Parameters",
              align = c("l", "c", "c", "c")) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                  full_width = TRUE, font_size = 10) %>%
    pack_rows(index = table(supp_8_cat$Category)[table8_cat_order]) %>%
    add_footnote(c("Data presentation and statistical methods as per Supplementary Table 2.\nAbbreviations: As per Supplementary Table 2."),
                 notation = "none"))
  
} else {
  cat("Insufficient PRS data for extreme risk analysis\n")
}

10.7 Supplementary Table 7

Supplementary Table 7. Extreme Genetic Risk Analysis: Baseline Characteristics
Characteristic AF-PRS ≥90th percentile (n=48) AF-PRS <10th percentile (n=29) p value
Demographics
Age, years 45.5 [22.2-65.2] 42 [22-59] 0.666
Male sex, n (%) 40 (83.3%) 24 (82.8%) 1.000
BMI, kg/m² 24.2 [22.4-26.4] 24 [22.3-26.3] 0.785
BSA, m² 2 [1.9-2.1] 2 [1.8-2.1] 0.829
Average HR, bpm 67.9 ± 10.4 64.8 ± 7.4 0.153
Minimum HR, bpm 45 [40.2-49] 45 [39.5-48.5] 0.575
Bradycardia burden, % (24h) 2.1 [0.4-14] 4.5 [0.3-26] 0.351
Hypertension, n (%) 6 (13.3%) 2 (7.1%) 0.702
SBP, mmHg 130 [120.5-135] 123 [118-131] 0.161
DBP, mmHg 74 [62.5-79] 71 [62-76] 0.401
Dyslipidaemia, n (%) 7 (15.6%) 5 (17.9%) 1.000
Diabetes, n (%) 1 (2.2%) 0 (0%) 1.000
History of smoking, n (%) 6 (13.3%) 3 (10.7%) 1.000
Alcohol, drinks/week 0 [0-3] 3 [0-8] 0.132
Exercise Capacity
VO₂peak, mL/kg/min 45.5 ± 15 47.1 ± 13.2 0.624
VO₂peak, L/min 3548.8 ± 1089.1 3633.6 ± 988.2 0.727
% of predicted exercise capacity 119.6 ± 19.9 121.2 ± 20.7 0.750
Respiratory exchange ratio 1.3 ± 0.1 1.3 ± 0.1 0.291
Total weekly MET-hours 77 [46-105.8] 84 [58-104] 0.874
Years of endurance training 20 [9.2-30] 19 [15-20] 0.994
Comparison of athletes in highest (≥90th percentile) versus lowest (<10th percentile) deciles of AF-PRS. Data presentation and statistical methods as per Supplementary Table 1.
Abbreviations: As per Supplementary Table 1; AF, atrial fibrillation; PRS, polygenic risk score.

10.8 Supplementary Table 8

Supplementary Table 8. Extreme Genetic Risk Analysis: Cardiac Parameters
Characteristic AF-PRS ≥90th percentile (n=48) AF-PRS <10th percentile (n=29) p value
Left ventricular parameters
LVEDV, mL 160.9 ± 45.4 164.2 ± 32.6 0.714
LVESV, mL 68 ± 21.8 65 ± 17.5 0.518
LVEF, % 56.6 ± 6.1 58.9 ± 5.4 0.108
LV strain, % -18.2 ± 2.6 -18.7 ± 2.1 0.412
Left atrial parameters
LAESV, mL 84.5 [71-98.2] 81 [72-95] 0.829
LAVi, mL/m² 43.6 [35.8-49.1] 42.7 [35.2-48.6] 0.946
Mildly abnormal >34 mL/m², n (%) 38 (79.2%) 22 (75.9%) 0.956
Severely abnormal >48 mL/m², n (%) 14 (29.2%) 8 (27.6%) 1.000
Extremely abnormal >60 mL/m², n (%) 4 (8.3%) 1 (3.4%) 0.645
LA:LV 0.53 [0.43-0.69] 0.49 [0.43-0.64] 0.444
Reservoir strain, % 28.8 ± 8.9 31 ± 7.3 0.262
Conduit strain, % 18.1 ± 6.7 19.5 ± 7.6 0.458
Contractile strain, % 10.7 ± 5.8 11.6 ± 4.8 0.496
Right Ventricular Parameters
RVEDV, mL 145.2 ± 40.9 131.6 ± 33.2 0.120
RVESV, mL 77.3 ± 20.9 68.7 ± 16.3 0.050
RVEF, % 45.8 ± 9.2 46.8 ± 9.8 0.666
Doppler Parameters
E velocity, m/s 0.7 ± 0.2 0.7 ± 0.2 0.321
A velocity, m/s 0.4 ± 0.1 0.4 ± 0.1 0.733
E/A ratio 1.7 ± 0.8 1.8 ± 1 0.813
E′ velocity, cm/s 10.2 ± 3.3 10 ± 3.2 0.783
E/E’ ratio 6 ± 1.6 5.7 ± 1.4 0.465
Electrocardiographic
P wave duration, ms 114 [104-129] 118 [110-125] 0.991
P-wave duration >120 ms, n (%) 20 (46.5%) 12 (41.4%) 0.851
P wave axis, ° 56.1 ± 21 49.7 ± 31.8 0.359
PTFV₁, mV·ms 2 [1.2-3.4] 1.4 [0-2.8] 0.107
PTFV₁ ≥4 mV·ms, n (%) 9 (20.9%) 4 (13.8%) 0.541
Advanced IAB, n (%) 5 (12.2%) 0 (0%) 0.075
PACs per 24 h 34.5 [11.8-109.8] 16.5 [6-67.5] 0.127
≥100 PACs per 24 h, n (%) 12 (27.3%) 6 (22.2%) 0.846
Biochemical
BNP, pg/mL 12.6 [9.4-34.7] 12.4 [5.7-30.1] 0.438
cTnI, ng/L 3 [2-9] 2 [2-4] 0.489
Data presentation and statistical methods as per Supplementary Table 2.
Abbreviations: As per Supplementary Table 2.

11 Supplementary Figures

11.1 Supplementary Figure 1

Supplementary Figure 1. Distribution of Clinical Risk Scores by AF Status

if (!exists("cutoffs")) {
  prob_thresholds <- c(0.10, 0.40, 0.70)
  cutoffs <- round(map_dbl(prob_thresholds, 
    ~min(Athletes_analysis$clinical_score_B[Athletes_analysis$predicted_prob_B >= .x], na.rm = TRUE)))
}

ggplot(Athletes_analysis, aes(x = clinical_score_B_rounded,
                              fill = factor(AF, labels = c("No AF", "AF")))) +
  geom_histogram(binwidth = 1, position = "dodge", 
                 alpha = 0.8, color = "black") +
  # Use dynamic cutoffs instead of hardcoded values
  geom_segment(aes(x = cutoffs[1], xend = cutoffs[1], y = 0, yend = Inf),
               linetype = "dashed", color = "black", linewidth = 0.8) +
  geom_segment(aes(x = cutoffs[2], xend = cutoffs[2], y = 0, yend = Inf),
               linetype = "dashed", color = "black", linewidth = 0.8) +
  geom_segment(aes(x = cutoffs[3], xend = cutoffs[3], y = 0, yend = Inf),
               linetype = "dashed", color = "black", linewidth = 0.8) +
  annotate("text", 
           x = c(cutoffs[1]/2, 
                 mean(c(cutoffs[1], cutoffs[2])), 
                 mean(c(cutoffs[2], cutoffs[3])), 
                 cutoffs[3] + 3), 
           y = -2, 
           label = c("Low", "Moderate", "High", "Very High"),
           size = 3.5, fontface = "italic") +
  
  scale_fill_manual(values = c("No AF" = "#0072B2", "AF" = "#990000")) +
  scale_y_continuous(expand = expansion(mult = c(0.08, 0.12))) +
  labs(x = "Clinical Risk Score",
       y = "Number of Athletes",
       fill = "AF Status") +
  coord_cartesian(clip = "off") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "bottom",
        plot.margin = margin(t = 10, r = 10, b = 25, l = 10))
Legend = Distribution of Clinical Risk Scores by AF Status

Legend = Distribution of Clinical Risk Scores by AF Status

11.2 Supplementary Figure 2

Supplementary Figure 2. AF prevalence by risk category

risk_prev <- Athletes_analysis %>%
  group_by(risk_category) %>%
  summarise(
    N = n(),
    AF_Cases = sum(AF),
    AF_Prevalence = mean(AF) * 100,
    CI_lower = binom.test(AF_Cases, N)$conf.int[1] * 100,
    CI_upper = binom.test(AF_Cases, N)$conf.int[2] * 100,
    .groups = "drop"
  )

ggplot(risk_prev, aes(x = risk_category, y = AF_Prevalence)) +
  geom_col(fill = "#6a994e", alpha = 0.8) +
  geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper),
                width = 0.2, size = 0.6) +
  geom_text(aes(y = CI_upper, label = paste0(round(AF_Prevalence, 1), "%")),
            vjust = -2.5, size = 4, color = "black") +
  geom_text(aes(y = CI_upper, label = paste0("(", AF_Cases, "/", N, ")")),
            vjust = -1.0, size = 4, color = "black") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(x = "Risk Category",
       y = "AF Prevalence (%)") +
  theme_minimal(base_size = 14) +
  theme(panel.grid.major.x = element_blank())

11.3 Supplementary Figure 3

Supplementary Figure 3. Profile of Atrial Remodelling Markers by Clinical Risk Category

if (!"risk_category" %in% names(Athletes_analysis)) {
  
  # Define the probability thresholds
  prob_thresholds <- c(0.10, 0.40, 0.70)
  
  # Calculate the corresponding clinical score cutoffs dynamically
  cutoffs <- round(purrr::map_dbl(prob_thresholds, 
      ~min(Athletes_analysis$clinical_score_B[Athletes_analysis$predicted_prob_B >= .x], na.rm = TRUE)))
      
  # Define the breaks for the cut function
  score_breaks <- c(-Inf, cutoffs, Inf)

  # Create the risk categories using the rounded integer score for consistency
  Athletes_analysis <- Athletes_analysis %>%
    mutate(
      clinical_score_B_rounded = round(clinical_score_B),
      risk_category = cut(clinical_score_B_rounded, breaks = score_breaks,
                          labels = c("Low", "Moderate", "High", "Very High"),
                          include.lowest = TRUE)
    )
}

# Prepare radar chart data with Age >45 as binary prevalence
radar_data_C <- Athletes_analysis %>%
  group_by(risk_category) %>%
  summarise(
    `Age >45 years (%)` = mean(Age > 45, na.rm = TRUE) * 100,
    `SBP ≥130 mmHg (%)` = mean(High_normal_BP, na.rm = TRUE) * 100,
    `LAVi 35-60 mL/m² (%)` = mean(LAVi_34, na.rm = TRUE) * 100,
    `LAVi >60 mL/m² (%)` = mean(LAVi_60, na.rm = TRUE) * 100,
    `P wave >120ms (%)` = mean(P_wave_120, na.rm = TRUE) * 100,
    `PTFV1 ≥4.0 mV·ms (%)` = mean(PTFV1_high, na.rm = TRUE) * 100,
    `PACs ≥100/24h (%)` = mean(PAC_100, na.rm = TRUE) * 100,
    .groups = "drop"
  )

# Convert to data frame with row names
radar_df <- as.data.frame(radar_data_C[, -1])
rownames(radar_df) <- radar_data_C$risk_category

# Add max and min rows for fmsb package (all variables now 0-100%)
max_row <- rep(100, ncol(radar_df))
min_row <- rep(0, ncol(radar_df))
radar_chart_data <- rbind(max_row, min_row, radar_df)

# Define colors for each risk category
colors_border <- c("#0072B2", "#67A9CF", "#F4A582", "#C00000")
colors_fill <- adjustcolor(colors_border, alpha.f = 0.3)

# Set up 2x2 plotting layout
par(mfrow = c(2, 2), mar = c(1, 1, 3, 1))

# Create radar chart for each risk category
for (i in 1:4) {
  radarchart(
    radar_chart_data[c(1, 2, i + 2), ],
    axistype = 1,
    pcol = colors_border[i],
    pfcol = colors_fill[i],
    plwd = 2,
    cglcol = "grey",
    cglty = 1,
    cglwd = 0.8,
    axislabcol = "grey",
    vlcex = 0.9,
    title = rownames(radar_chart_data)[i + 2],
    title.col = "black",
    cex.main = 1.5
  )
}

# Reset plotting parameters
par(mfrow = c(1, 1))

11.4 Supplementary Figure 4

Supplementary Figure 4. Clinical risk score cutoffs corresponding to predicted AF probability thresholds

# Define the probability thresholds used for risk stratification in Table 4
prob_thresholds <- c(0.10, 0.40, 0.70)

# Calculate the corresponding clinical score cutoffs dynamically
# This ensures the vertical lines in the plot match the table's risk categories
cutoffs <- round(purrr::map_dbl(prob_thresholds, 
    ~min(Athletes_analysis$clinical_score_B[Athletes_analysis$predicted_prob_B >= .x], na.rm = TRUE)))

# Generate the plot
ggplot(Athletes_analysis, aes(x = clinical_score_B, 
                             y = predicted_prob_B * 100)) +
  geom_point(aes(color = factor(AF, labels = c("No AF", "AF"))), 
             alpha = 0.6) +
  geom_smooth(method = "loess", se = TRUE, color = "black") +
  # Use the thresholds variable for the horizontal lines
  geom_hline(yintercept = prob_thresholds * 100, 
             linetype = "dashed", alpha = 0.5) +
  # Use the calculated cutoffs variable for the vertical lines
  geom_vline(xintercept = cutoffs, 
             linetype = "dashed", alpha = 0.5) +
  scale_color_manual(values = c("No AF" = "#0072B2", "AF" = "#990000")) +
  labs(x = "Clinical Risk Score",
       y = "Predicted AF Probability (%)",
       color = "AF Status") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "bottom")
Legend: Predicted AF probabilities were stratified into four probability thresholds: <10%, 10-40%, 40-70%, and >70%.

Legend: Predicted AF probabilities were stratified into four probability thresholds: <10%, 10-40%, 40-70%, and >70%.