1 Setup

2 1. Data Preparation

# Load data
data <- read_excel("data/patients.xlsx")

# Clean column names
data_clean <- data %>% janitor::clean_names()

# Filter out IEC and AUTO (keep only allo)
data_clean <- subset(data_clean, !(type_at_bmt %in% c("IEC", "AUTO")))

# Recode 1-year outcome
data_clean <- data_clean %>%
  mutate(alive1year_recoded = ifelse(alive1year == "Yes", 1,
                              ifelse(alive1year == "No", 0, NA)))

# Keep only evaluable patients
data_1yr <- subset(data_clean, !is.na(alive1year_recoded))

head(data_1yr)
## # A tibble: 6 × 44
##   tx_date             age_group muh_number full_name   date_of_birth       sex  
##   <dttm>              <chr>     <chr>      <chr>       <dttm>              <chr>
## 1 2022-01-12 00:00:00 Adult     7756425    Bellamy, J… 1949-05-14 00:00:00 Fema…
## 2 2022-01-14 00:00:00 Adult     7774482    Pitt, Brys… 1998-05-11 00:00:00 Male 
## 3 2022-01-21 00:00:00 Adult     953678     Klecha, Mi… 1951-11-02 00:00:00 Fema…
## 4 2022-01-25 00:00:00 Adult     7750707    Brentin, R… 1954-04-24 00:00:00 Male 
## 5 2022-01-28 00:00:00 Adult     5842909    Smith, Mat… 1959-12-11 00:00:00 Male 
## 6 2022-02-23 00:00:00 Adult     7451324    Gardner, J… 1951-05-08 00:00:00 Male 
## # ℹ 38 more variables: diagnosis <chr>, subdiagnosis <chr>, bmt_status <chr>,
## #   type_at_bmt <chr>, sub_type_at_bmt <chr>, date_of_dx <dttm>, outpt <lgl>,
## #   conditioning_at_bmt <chr>, protocol_at_bmt <chr>, status_at_tx <chr>,
## #   current_status_of_disease <chr>, anc_date <dttm>, anc_comment <chr>,
## #   platelet_date <dttm>, platelet_comment_50 <chr>, rfi_classification <chr>,
## #   hla <chr>, acute_gvhd <lgl>, acute_gvhd_peak <dbl>, chronic_gvhd <lgl>,
## #   chronic_gvhd_peak <chr>, cmv_pt <lgl>, cmv_donor <lgl>, …

3 2. Age Analysis

3.1 2.1 Histogram of Age

ggplot(data_clean, aes(x = age_at_tx)) +
  geom_histogram(binwidth = 5, fill = "orange", color = "black") +
  labs(title = "Age Distribution (Allo Patients)",
       x = "Age at Transplant", y = "Count")

3.2 2.2 Age vs Outcome at 1 Year

p_outcome_age <- ggplot(data_1yr,
                        aes(x=factor(alive1year_recoded),
                            y=age_at_tx,
                            fill=factor(alive1year_recoded))) +
  geom_boxplot() +
  stat_compare_means(method="wilcox.test",
                     label.y=max(data_1yr$age_at_tx,na.rm=TRUE)+2) +
  labs(title="Age vs Alive at 1 Year",
       x="1-Year Outcome (0=No, 1=Yes)",
       y="Age at Transplant")
p_outcome_age

wilcox.test(age_at_tx ~ alive1year_recoded, data=data_1yr)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  age_at_tx by alive1year_recoded
## W = 2114, p-value = 0.1801
## alternative hypothesis: true location shift is not equal to 0

4 3. CMI Analysis

4.1 3.1 Distribution of CMI

cmi_totals <- data_1yr %>%
  filter(!is.na(cmi)) %>%
  group_by(cmi) %>%
  summarise(total_n=n(),.groups="drop")

ggplot(cmi_totals,aes(x=factor(cmi),y=total_n))+
  geom_col(fill="steelblue")+
  geom_text(aes(label=paste0("n=",total_n)),vjust=-0.3)+
  labs(title="Patients by CMI",x="CMI Score",y="N")

4.2 3.2 1-Year Outcome by CMI

ggplot(subset(data_1yr,!is.na(cmi)),
       aes(x=factor(cmi),fill=factor(alive1year_recoded)))+
  geom_bar(position="fill")+
  geom_text(stat="count",aes(label=paste0("n=",..count..)),
            position=position_fill(vjust=0.5),color="white")+
  scale_fill_manual(values=c("0"="tomato","1"="seagreen"),
                    labels=c("0"="No","1"="Yes"))+
  labs(title="1-Year Outcome by CMI",
       x="CMI",y="Proportion",fill="Alive at 1y")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.


5 4. Combined Risk Score Analysis

data_1yr <- data_1yr %>%
  filter(!is.na(age_at_tx),!is.na(cmi)) %>%
  mutate(risk_score=age_at_tx+5*cmi)

roc_score <- roc(data_1yr$alive1year_recoded ~ data_1yr$risk_score)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
auc(roc_score)
## Area under the curve: 0.6099
plot(roc_score,col="purple",lwd=2,main="ROC: Combined Risk Score")
abline(a=0,b=1,lty=2,col="grey")

best_cut <- coords(roc_score,"best",
                   ret=c("threshold","sensitivity","specificity"))
best_cut
##   threshold sensitivity specificity
## 1  67.06108   0.4672897   0.8064516

6 5. Survival Analyses (Kaplan–Meier)

6.1 5.1 Prepare Data

# Prepare survival dataset
surv_data <- data_clean %>%
  filter(!is.na(tx_date), !is.na(age_at_tx)) %>%
  mutate(
    tx_date = as.Date(tx_date, format = "%m/%d/%Y"),
    death_date = as.Date(dod, format = "%m/%d/%Y"),
    
    # Event times (days and months, capped at 12 months)
    time_days = ifelse(!is.na(death_date),
                       as.numeric(difftime(death_date, tx_date, units = "days")),
                       365),
    time_months = pmin(time_days / 30.44, 12),
    
    # Event indicator
    event = ifelse(!is.na(death_date) & time_days <= 365, 1, 0),
    
    # Risk score
    risk_score = age_at_tx + 5 * cmi,
    
    # Risk groups
    risk67 = ifelse(risk_score >= 67, "High (≥67)", "Low (<67)"),
    risk80 = ifelse(risk_score >= 80, "High (≥80)", "Low (<80)")
  ) %>%
  mutate(
    # Age groups (3 bins)
    age_group3 = case_when(
      age_at_tx < 65 ~ "<65",
      age_at_tx >= 65 & age_at_tx < 75 ~ "65–74",
      age_at_tx >= 75 ~ "≥75"
    ),
    # CMI groups (3 bins)
    cmi_group3 = case_when(
      cmi < 3 ~ "<3",
      cmi >= 3 & cmi < 6 ~ "3–6",
      cmi >= 6 ~ "≥6"
    )
  )

# Factor ordering
surv_data$age_group3 <- factor(surv_data$age_group3,
                               levels = c("<65", "65–74", "≥75"))
surv_data$cmi_group3 <- factor(surv_data$cmi_group3,
                               levels = c("<3", "3–6", "≥6"))

# Survival object
surv_obj <- Surv(time = surv_data$time_months, event = surv_data$event)

6.2 5.2 Overall Survival

fit_overall <- survfit(surv_obj~1,data=surv_data)
summary(fit_overall,times=12)
## Call: survfit(formula = surv_obj ~ 1, data = surv_data)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12     14      33    0.835  0.0262        0.785        0.888
ggsurvplot(fit_overall,conf.int=TRUE,risk.table=TRUE,
           xlab="Months",ylab="Survival Probability",
           title="Overall Survival (1 Year)",
           break.time.by=3,xlim=c(0,12))

6.3 5.3 By Age

fit_age3 <- survfit(surv_obj ~ age_group3, data = surv_data)
summary(fit_age3, times = 12)
## Call: survfit(formula = surv_obj ~ age_group3, data = surv_data)
## 
##                 age_group3=<65 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##      12.0000       8.0000      14.0000       0.8772       0.0307       0.8190 
## upper 95% CI 
##       0.9396 
## 
##                 age_group3=65–74 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##      12.0000       4.0000      19.0000       0.7595       0.0481       0.6709 
## upper 95% CI 
##       0.8598 
## 
##                 age_group3=≥75 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##           12            2            0            1            0            1 
## upper 95% CI 
##            1
ggsurvplot(fit_age3,
           conf.int = FALSE, pval = TRUE, risk.table = TRUE,
           xlab = "Months since Transplant",
           ylab = "Survival Probability",
           title = "Survival by Age Groups (<65, 65–74, ≥75)",
           legend.title = "Age Group",
           break.time.by = 3, xlim = c(0,12))

6.4 5.4 By CMI

fit_cmi3 <- survfit(surv_obj ~ cmi_group3, data = surv_data)
summary(fit_cmi3, times = 12)
## Call: survfit(formula = surv_obj ~ cmi_group3, data = surv_data)
## 
## 11 observations deleted due to missingness 
##                 cmi_group3=<3 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##      12.0000       4.0000      15.0000       0.8718       0.0309       0.8133 
## upper 95% CI 
##       0.9345 
## 
##                 cmi_group3=3–6 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##      12.0000       6.0000      15.0000       0.7619       0.0537       0.6637 
## upper 95% CI 
##       0.8747 
## 
##                 cmi_group3=≥6 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##       12.000        3.000        1.000        0.889        0.105        0.706 
## upper 95% CI 
##        1.000
ggsurvplot(fit_cmi3,
           conf.int = FALSE, pval = TRUE, risk.table = TRUE,
           xlab = "Months since Transplant",
           ylab = "Survival Probability",
           title = "Survival by CMI Groups (<3, 3–6, ≥6)",
           legend.title = "CMI Group",
           break.time.by = 3, xlim = c(0,12))

6.5 5.5 By Risk Score ≥67

fit_risk67 <- survfit(surv_obj~risk67,data=surv_data)
summary(fit_risk67,times=12)
## Call: survfit(formula = surv_obj ~ risk67, data = surv_data)
## 
## 11 observations deleted due to missingness 
##                 risk67=High (≥67) 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##      12.0000      11.0000      25.0000       0.7706       0.0403       0.6956 
## upper 95% CI 
##       0.8538 
## 
##                 risk67=Low (<67) 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##      12.0000       2.0000       6.0000       0.9250       0.0294       0.8690 
## upper 95% CI 
##       0.9846
ggsurvplot(fit_risk67,conf.int=FALSE,pval=TRUE,risk.table=TRUE,
           xlab="Months",ylab="Survival",
           title="Survival by Risk Score ≥67",
           legend.title="Risk Group",
           break.time.by=3,xlim=c(0,12))

6.6 5.6 By Risk Score ≥80

fit_risk80 <- survfit(surv_obj~risk80,data=surv_data)
summary(fit_risk80,times=12)
## Call: survfit(formula = surv_obj ~ risk80, data = surv_data)
## 
## 11 observations deleted due to missingness 
##                 risk80=High (≥80) 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##      12.0000       8.0000      13.0000       0.7679       0.0564       0.6649 
## upper 95% CI 
##       0.8868 
## 
##                 risk80=Low (<80) 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##      12.0000       5.0000      18.0000       0.8647       0.0297       0.8084 
## upper 95% CI 
##       0.9248
ggsurvplot(fit_risk80,conf.int=FALSE,pval=TRUE,risk.table=TRUE,
           xlab="Months",ylab="Survival",
           title="Survival by Risk Score ≥80",
           legend.title="Risk Group",
           break.time.by=3,xlim=c(0,12))


7 6. Length of Stay (LOS) Analysis

7.1 6.1 Create LOS Dataset

data_los <- data_clean %>%
  filter(!is.na(tx_date), !is.na(discharge_date),
         !is.na(age_at_tx), !is.na(cmi)) %>%
  mutate(tx_date=as.Date(tx_date,format="%m/%d/%Y"),
         discharge_date=as.Date(discharge_date,format="%m/%d/%Y"),
         LOS_days=as.numeric(difftime(discharge_date,tx_date,units="days")),
         risk_score=age_at_tx+5*cmi,
         LongStay=ifelse(LOS_days>30,1,0))

summary(data_los$LOS_days)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.00   18.00   20.00   24.83   26.00  100.00
table(data_los$LongStay)
## 
##   0   1 
## 154  30

7.2 6.2 Distribution of LOS

ggplot(data_los, aes(x = LOS_days)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
  labs(title = "Distribution of LOS",
       x = "LOS (days)", y = "N")

7.3 6.3 Regression of LOS on Age + CMI

fit_los <- lm(LOS_days ~ age_at_tx + cmi, data=data_los)
summary(fit_los)
## 
## Call:
## lm(formula = LOS_days ~ age_at_tx + cmi, data = data_los)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.358  -6.517  -4.110   0.696  73.782 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22.661643   3.785091   5.987 1.12e-08 ***
## age_at_tx    0.008531   0.063175   0.135    0.893    
## cmi          0.812862   0.516829   1.573    0.118    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.04 on 181 degrees of freedom
## Multiple R-squared:  0.01392,    Adjusted R-squared:  0.003026 
## F-statistic: 1.278 on 2 and 181 DF,  p-value: 0.2812

7.4 6.4 LOS by Age Decade

data_los <- data_los %>%
  mutate(age_decade=cut(age_at_tx,breaks=seq(0,100,10),right=FALSE))

ggplot(data_los, aes(x=age_decade,y=LOS_days))+
  geom_boxplot(fill="skyblue",alpha=0.7)+
  labs(title="LOS by Age Decade",x="Age Group",y="LOS (days)")

7.5 6.5 ROC Curve for LOS >30 Days

roc_los <- roc(LongStay ~ risk_score, data=data_los)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_los)
## Area under the curve: 0.547
plot(roc_los,col="darkblue",lwd=2,
     main="ROC: Risk Score Predicting LOS >30 Days")
abline(a=0,b=1,lty=2,col="grey")

best_cut_los <- coords(roc_los,"best",
                       ret=c("threshold","sensitivity","specificity"))
best_cut_los
##   threshold sensitivity specificity
## 1  67.65443         0.7   0.4675325

7.6 6.6 Outcome by Fixed Cutoff (≥80)

data_los <- data_los %>%
  mutate(RiskGroup80=ifelse(risk_score>=80,"High (≥80)","Low (<80)"))

tbl_los80 <- table(data_los$RiskGroup80,data_los$LongStay)
tbl_los80
##             
##                0   1
##   High (≥80)  44   9
##   Low (<80)  110  21
if (nrow(tbl_los80)==2 && ncol(tbl_los80)==2) {
  fisher.test(tbl_los80)
}
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tbl_los80
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.3739103 2.5034470
## sample estimates:
## odds ratio 
##  0.9336884
ggplot(data_los, aes(x=RiskGroup80, fill=factor(LongStay)))+
  geom_bar(position="fill")+
  scale_fill_manual(values=c("0"="seagreen","1"="tomato"),
                    labels=c("0"="≤30 days","1"=">30 days"))+
  labs(title="LOS >30 Days by Risk Group (Cutoff=80)",
       x="Risk Group",y="Proportion",fill="LOS Outcome")+
  theme_minimal()

8 Identifying Patients for Geriatrics Evaluation (Capacity ≈15 per Year)

We worked backwards from a capacity of ~15 patients per year to determine a cutoff on the combined RiskScore = Age + 5×CMI.


8.1 Calculate RiskScore and Test Cutoffs

# Build scored dataset
data_scored <- data_clean %>%
  filter(!is.na(age_at_tx), !is.na(cmi)) %>%
  mutate(RiskScore = age_at_tx + 5 * cmi,
         Tx_Year = format(as.Date(tx_date), "%Y"))

# Candidate cutoffs from 60–120
score_cutoffs <- seq(60, 120, by = 5)

# Count how many patients per year exceed each cutoff
threshold_results <- lapply(score_cutoffs, function(cut) {
  counts <- data_scored %>%
    filter(RiskScore >= cut) %>%
    group_by(Tx_Year) %>%
    summarise(n_selected = n(), .groups = "drop") %>%
    summarise(mean_per_year = mean(n_selected))
  
  data.frame(Cutoff = cut, MeanPerYear = counts$mean_per_year)
})

threshold_results <- do.call(rbind, threshold_results)
threshold_results
##    Cutoff MeanPerYear
## 1      60       33.25
## 2      65       28.75
## 3      70       25.00
## 4      75       18.75
## 5      80       14.00
## 6      85        7.50
## 7      90        3.75
## 8      95        2.25
## 9     100        1.25
## 10    105        1.00
## 11    110        1.00
## 12    115         NaN
## 13    120         NaN

8.2 Plot: Average Patients per Year vs Cutoff

ggplot(threshold_results, aes(x = Cutoff, y = MeanPerYear)) +
  geom_line(color = "darkgreen") +
  geom_point(size = 3, color = "darkgreen") +
  geom_hline(yintercept = 15, linetype = "dashed", color = "red") +
  labs(title = "Average Number of Patients per Year Selected by RiskScore Cutoff",
       x = "RiskScore Cutoff",
       y = "Mean Patients per Year") +
  theme_minimal()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).


8.3 Scatterplot: Patients Selected (Cutoff Example ≥80)

# Apply cutoff = 80 for illustration
data_scatter <- data_scored %>%
  mutate(Selected = ifelse(RiskScore >= 80, "Selected", "Not Selected"))

# Scatterplot of Age vs CMI, highlighting selected patients
p_risk <- ggplot(data_scatter, aes(x = age_at_tx, y = cmi)) +
  geom_point(aes(color = Selected), alpha = 0.6) +
  scale_color_manual(values = c("Not Selected" = "grey70", "Selected" = "purple")) +
  labs(title = "Age vs CMI: Patients Selected for Geriatrics Evaluation (RiskScore ≥80)",
       x = "Age at Transplant",
       y = "Comorbidity Index (CMI)",
       color = "Selection Status") +
  theme_minimal()

p_risk

# Identifying Patients for Geriatrics Evaluation Using Age Only

We worked backwards from a capacity of ~15 patients per year to determine an age cutoff.


8.4 Calculate Age Cutoffs

# Build dataset with age + year
data_age <- data_clean %>%
  filter(!is.na(age_at_tx)) %>%
  mutate(Tx_Year = format(as.Date(tx_date), "%Y"))

# Candidate age cutoffs from 50–80
age_cutoffs <- seq(50, 80, by = 1)

# Count how many patients per year exceed each cutoff
threshold_age_results <- lapply(age_cutoffs, function(cut) {
  counts <- data_age %>%
    filter(age_at_tx >= cut) %>%
    group_by(Tx_Year) %>%
    summarise(n_selected = n(), .groups = "drop") %>%
    summarise(mean_per_year = mean(n_selected))
  
  data.frame(Cutoff = cut, MeanPerYear = counts$mean_per_year)
})

threshold_age_results <- do.call(rbind, threshold_age_results)
threshold_age_results
##    Cutoff MeanPerYear
## 1      50       35.00
## 2      51       34.50
## 3      52       34.25
## 4      53       33.50
## 5      54       33.00
## 6      55       32.50
## 7      56       31.50
## 8      57       30.75
## 9      58       30.25
## 10     59       29.75
## 11     60       29.25
## 12     61       27.25
## 13     62       25.50
## 14     63       24.50
## 15     64       23.00
## 16     65       21.50
## 17     66       20.00
## 18     67       18.25
## 19     68       16.50
## 20     69       14.75
## 21     70       11.00
## 22     71        7.50
## 23     72        5.75
## 24     73        4.25
## 25     74        3.25
## 26     75        1.75
## 27     76        1.50
## 28     77        1.25
## 29     78        1.00
## 30     79         NaN
## 31     80         NaN

8.5 Plot: Average Patients per Year vs Age Cutoff

ggplot(threshold_age_results, aes(x = Cutoff, y = MeanPerYear)) +
  geom_line(color = "blue") +
  geom_point(size = 3, color = "blue") +
  geom_hline(yintercept = 15, linetype = "dashed", color = "red") +
  labs(title = "Average Number of Patients per Year Selected by Age Cutoff",
       x = "Age Cutoff (years)",
       y = "Mean Patients per Year") +
  theme_minimal()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).


8.6 Scatterplot: Patients Selected (Cutoff Example ≥70)

# Apply cutoff = 70 for illustration
data_age_scatter <- data_age %>%
  mutate(Selected = ifelse(age_at_tx >= 70, "Selected", "Not Selected"))

# Scatterplot of Age vs CMI, highlighting patients ≥70
p_age <- ggplot(data_age_scatter, aes(x = age_at_tx, y = cmi)) +
  geom_point(aes(color = Selected), alpha = 0.6) +
  scale_color_manual(values = c("Not Selected" = "grey70", "Selected" = "blue")) +
  labs(title = "Patients Selected for Geriatrics Evaluation (Age ≥70)",
       x = "Age at Transplant",
       y = "Comorbidity Index (CMI)",
       color = "Selection Status") +
  theme_minimal()

p_age
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_point()`).

9 Identifying Patients for Geriatrics Evaluation (Age >70 OR CMI >5)

We tested a combined rule: select patients if age_at_tx > 70 OR cmi > 5.


9.1 Flag Selected Patients

data_combo <- data_clean %>%
  filter(!is.na(age_at_tx), !is.na(cmi)) %>%
  mutate(
    Tx_Year = format(as.Date(tx_date), "%Y"),
    Selected = ifelse(age_at_tx > 70 | cmi > 5, "Selected", "Not Selected")
  )

table(data_combo$Selected)
## 
## Not Selected     Selected 
##          141           48

9.2 Number of Selected Patients per Year

combo_per_year <- data_combo %>%
  group_by(Tx_Year, Selected) %>%
  summarise(n = n(), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = Selected, values_from = n, values_fill = 0)

combo_per_year
## # A tibble: 4 × 3
##   Tx_Year `Not Selected` Selected
##   <chr>            <int>    <int>
## 1 2022                34       15
## 2 2023                37       10
## 3 2024                45       15
## 4 2025                25        8

9.3 Plot: Number of Selected Patients per Year

ggplot(combo_per_year, aes(x = Tx_Year, y = Selected)) +
  geom_col(fill = "purple", alpha = 0.7) +
  geom_text(aes(label = Selected), vjust = -0.3, size = 4) +
  labs(title = "Number of Patients Selected (Age >70 OR CMI >5) per Year",
       x = "Transplant Year",
       y = "Number of Selected Patients") +
  theme_minimal()


9.4 Scatterplot: Age vs CMI (Selection Rule Highlighted)

p_combo <- ggplot(data_combo, aes(x = age_at_tx, y = cmi)) +
  geom_point(aes(color = Selected), alpha = 0.6) +
  scale_color_manual(values = c("Not Selected" = "grey70", "Selected" = "purple")) +
  labs(title = "Patients Selected for Geriatrics Evaluation (Age >70 OR CMI >5)",
       x = "Age at Transplant",
       y = "Comorbidity Index (CMI)",
       color = "Selection Status") +
  theme_minimal()

p_combo

10 Number of Patients Selected per Year (Different Rules)

We compared two rules for selecting patients for geriatrics evaluation:
1. Age > 70
2. RiskScore ≥ 80 (Age + 5×CMI)


10.1 Rule 1: Age > 70

selected_age70 <- data_clean %>%
  filter(!is.na(age_at_tx)) %>%
  mutate(
    Tx_Year = format(as.Date(tx_date), "%Y"),
    Selected = ifelse(age_at_tx > 70, "Selected", "Not Selected")
  ) %>%
  group_by(Tx_Year, Selected) %>%
  summarise(n = n(), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = Selected, values_from = n, values_fill = 0)

selected_age70
## # A tibble: 4 × 3
##   Tx_Year `Not Selected` Selected
##   <chr>            <int>    <int>
## 1 2022                40       14
## 2 2023                40        8
## 3 2024                47       13
## 4 2025                29        9

10.2 Rule 2: RiskScore ≥ 80

selected_score80 <- data_clean %>%
  filter(!is.na(age_at_tx), !is.na(cmi)) %>%
  mutate(
    RiskScore = age_at_tx + 5 * cmi,
    Tx_Year = format(as.Date(tx_date), "%Y"),
    Selected = ifelse(RiskScore >= 80, "Selected", "Not Selected")
  ) %>%
  group_by(Tx_Year, Selected) %>%
  summarise(n = n(), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = Selected, values_from = n, values_fill = 0)

selected_score80
## # A tibble: 4 × 3
##   Tx_Year `Not Selected` Selected
##   <chr>            <int>    <int>
## 1 2022                33       16
## 2 2023                33       14
## 3 2024                45       15
## 4 2025                22       11

11 Number of Patients Selected per Year (Different Rules)

We compared two rules for selecting patients for geriatrics evaluation:
1. Age > 70
2. RiskScore ≥ 80 (Age + 5×CMI)


11.1 Rule 1: Age > 70

selected_age70 <- data_clean %>%
  filter(!is.na(age_at_tx)) %>%
  mutate(
    Tx_Year = format(as.Date(tx_date), "%Y"),
    Selected = ifelse(age_at_tx > 70, "Selected", "Not Selected")
  ) %>%
  group_by(Tx_Year, Selected) %>%
  summarise(n = n(), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = Selected, values_from = n, values_fill = 0)

selected_age70
## # A tibble: 4 × 3
##   Tx_Year `Not Selected` Selected
##   <chr>            <int>    <int>
## 1 2022                40       14
## 2 2023                40        8
## 3 2024                47       13
## 4 2025                29        9

11.2 Rule 2: RiskScore ≥ 80

selected_score80 <- data_clean %>%
  filter(!is.na(age_at_tx), !is.na(cmi)) %>%
  mutate(
    RiskScore = age_at_tx + 5 * cmi,
    Tx_Year = format(as.Date(tx_date), "%Y"),
    Selected = ifelse(RiskScore >= 80, "Selected", "Not Selected")
  ) %>%
  group_by(Tx_Year, Selected) %>%
  summarise(n = n(), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = Selected, values_from = n, values_fill = 0)

selected_score80
## # A tibble: 4 × 3
##   Tx_Year `Not Selected` Selected
##   <chr>            <int>    <int>
## 1 2022                33       16
## 2 2023                33       14
## 3 2024                45       15
## 4 2025                22       11

11.3 Calculate Selected Patients per Year (All Rules)

# Rule 1: Age >70
selected_age70 <- data_clean %>%
  filter(!is.na(age_at_tx)) %>%
  mutate(
    Tx_Year = format(as.Date(tx_date), "%Y"),
    Rule = "Age >70",
    Selected = ifelse(age_at_tx > 70, 1, 0)
  ) %>%
  group_by(Tx_Year, Rule) %>%
  summarise(n_selected = sum(Selected), .groups = "drop")

# Rule 2: RiskScore ≥80
selected_score80 <- data_clean %>%
  filter(!is.na(age_at_tx), !is.na(cmi)) %>%
  mutate(
    RiskScore = age_at_tx + 5 * cmi,
    Tx_Year = format(as.Date(tx_date), "%Y"),
    Rule = "RiskScore ≥80",
    Selected = ifelse(RiskScore >= 80, 1, 0)
  ) %>%
  group_by(Tx_Year, Rule) %>%
  summarise(n_selected = sum(Selected), .groups = "drop")

# Rule 3: Age >70 OR CMI >5
selected_combo <- data_clean %>%
  filter(!is.na(age_at_tx), !is.na(cmi)) %>%
  mutate(
    Tx_Year = format(as.Date(tx_date), "%Y"),
    Rule = "Age >70 OR CMI >5",
    Selected = ifelse(age_at_tx > 70 | cmi > 5, 1, 0)
  ) %>%
  group_by(Tx_Year, Rule) %>%
  summarise(n_selected = sum(Selected), .groups = "drop")

# Combine all
selected_all <- bind_rows(selected_age70, selected_score80, selected_combo)
selected_all
## # A tibble: 12 × 3
##    Tx_Year Rule              n_selected
##    <chr>   <chr>                  <dbl>
##  1 2022    Age >70                   14
##  2 2023    Age >70                    8
##  3 2024    Age >70                   13
##  4 2025    Age >70                    9
##  5 2022    RiskScore ≥80             16
##  6 2023    RiskScore ≥80             14
##  7 2024    RiskScore ≥80             15
##  8 2025    RiskScore ≥80             11
##  9 2022    Age >70 OR CMI >5         15
## 10 2023    Age >70 OR CMI >5         10
## 11 2024    Age >70 OR CMI >5         15
## 12 2025    Age >70 OR CMI >5          8

11.4 Plot: Patients Selected per Year by Rule

ggplot(selected_all, aes(x = Tx_Year, y = n_selected, color = Rule, group = Rule)) +
  geom_line(size = 1.2) +
  geom_point(size = 3) +
  geom_hline(yintercept = 15, linetype = "dashed", color = "red") +
  labs(title = "Number of Patients Selected per Year by Rule",
       x = "Transplant Year",
       y = "Number of Patients Selected",
       color = "Selection Rule") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Average Number of Patients per Year (Summary)

summary_table <- selected_all %>%
  group_by(Rule) %>%
  summarise(
    Avg_per_year = round(mean(n_selected), 1),
    Min_per_year = min(n_selected),
    Max_per_year = max(n_selected),
    .groups = "drop"
  )

summary_table
## # A tibble: 3 × 4
##   Rule              Avg_per_year Min_per_year Max_per_year
##   <chr>                    <dbl>        <dbl>        <dbl>
## 1 Age >70                     11            8           14
## 2 Age >70 OR CMI >5           12            8           15
## 3 RiskScore ≥80               14           11           16

12 1. Survival by Age >70

surv_data_age70 <- surv_data %>%
  mutate(Group_Age70 = ifelse(age_at_tx > 70, "Age >70", "Age ≤70"))
fit_age70 <- survfit(surv_obj ~ Group_Age70, data = surv_data_age70)

ggsurvplot(fit_age70,
           conf.int = FALSE, pval = TRUE, risk.table = TRUE,
           xlab = "Months since Transplant", ylab = "Survival Probability",
           title = "Survival by Age >70",
           legend.title = "Group",
           break.time.by = 3, xlim = c(0,12))

13 2. Survival by RiskScore ≥80

surv_data_score80 <- surv_data %>%
  mutate(Group_Score80 = ifelse(risk_score >= 80, "RiskScore ≥80", "RiskScore <80"))
fit_score80 <- survfit(surv_obj ~ Group_Score80, data = surv_data_score80)

ggsurvplot(fit_score80,
           conf.int = FALSE, pval = TRUE, risk.table = TRUE,
           xlab = "Months since Transplant", ylab = "Survival Probability",
           title = "Survival by RiskScore ≥80",
           legend.title = "Group",
           break.time.by = 3, xlim = c(0,12))

14 3. Survival by Age >70 OR CMI >5

surv_data_combo <- surv_data %>%
  mutate(Group_Combo = ifelse(age_at_tx > 70 | cmi > 5,
                              "Age >70 OR CMI >5", "Neither"))
fit_combo <- survfit(surv_obj ~ Group_Combo, data = surv_data_combo)

ggsurvplot(fit_combo,
           conf.int = FALSE, pval = TRUE, risk.table = TRUE,
           xlab = "Months since Transplant", ylab = "Survival Probability",
           title = "Survival by Age >70 OR CMI >5",
           legend.title = "Group",
           break.time.by = 3, xlim = c(0,12))