Survival_Analysis

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(survival)

## Warning: package 'survival' was built under R version 4.4.3

library(ggsurvfit)

## Warning: package 'ggsurvfit' was built under R version 4.4.3

## Loading required package: ggplot2

library(knitr)
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tibble)
library(RColorBrewer) 
library(survminer)

## Warning: package 'survminer' was built under R version 4.4.3

## Loading required package: ggpubr

## 
## Attaching package: 'survminer'

## The following object is masked from 'package:survival':
## 
##     myeloma

library(gtsummary)

## Warning: package 'gtsummary' was built under R version 4.4.3

data(nafld, package = "survival")
nafld1 <- nafld1 %>%
  mutate(sex = factor(male, labels = c("Female","Male")))

# Summary of key variables
glimpse(nafld1)

## Rows: 17,549
## Columns: 10
## $ id      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ age     <int> 57, 67, 53, 56, 68, 39, 49, 30, 47, 79, 47, 59, 54, 41, 51, 73…
## $ male    <int> 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ weight  <dbl> 60.0, 70.4, 105.8, 109.3, NA, 63.9, 66.2, NA, 110.8, 56.6, NA,…
## $ height  <int> 163, 168, 186, 170, NA, 155, 161, 180, 188, 155, NA, 182, 167,…
## $ bmi     <dbl> 22.69094, 24.88403, 30.45354, 37.83010, NA, 26.61559, 25.51934…
## $ case.id <int> 10630, 14817, 3, 6628, 1871, 7144, 7507, 13417, 9, 13518, 9700…
## $ futime  <int> 6261, 624, 1783, 3143, 1836, 1581, 3109, 1339, 1671, 2239, 489…
## $ status  <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sex     <fct> Female, Female, Male, Male, Male, Female, Female, Male, Male, …

table(nafld1$status)

## 
##     0     1 
## 16185  1364

table(nafld1$sex)

## 
## Female   Male 
##   9348   8201

summary(nafld1[, c("age","weight","height","bmi")])

##       age            weight           height           bmi        
##  Min.   :18.00   Min.   : 33.40   Min.   :123.0   Min.   : 9.207  
##  1st Qu.:42.00   1st Qu.: 70.00   1st Qu.:162.0   1st Qu.:25.136  
##  Median :53.00   Median : 83.90   Median :169.0   Median :28.876  
##  Mean   :52.66   Mean   : 86.35   Mean   :169.4   Mean   :30.074  
##  3rd Qu.:63.00   3rd Qu.: 99.20   3rd Qu.:177.0   3rd Qu.:33.710  
##  Max.   :98.00   Max.   :181.70   Max.   :215.0   Max.   :84.396  
##                  NA's   :4786     NA's   :3168    NA's   :4961

1. Plot the overall survival curve, estimate median survival time, estimate survival at 1.5 year

surv_fit <- survfit2(Surv(futime, status) ~ 1, data = nafld1)

surv_fit %>%
  ggsurvfit() +
  labs(
    x = "Days",
    y = "Overall survival probability",
    title = "Kaplan–Meier Survival Curve with 95% CI"
  ) +
  add_confidence_interval(fill = "lightblue", alpha = 0.5) + 
  add_risktable()

### Estimating 1.5-year survival

summary(survfit(Surv(futime, status) ~ 1, data = nafld1), times = 1.5*365.25)

## Call: survfit(formula = Surv(futime, status) ~ 1, data = nafld1)
## 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##   548  16305     246    0.986 0.000909        0.984        0.987

library(gtsummary)
survfit(Surv(futime, status) ~ 1, data = nafld1) %>% 
  tbl_survfit(
    times = 1.5*365.25,
    label_header = "**1.5 - year survival (95% CI)**"
  )

Characteristic	1.5 - year survival (95% CI)
Overall	99% (98%, 99%)

survfit(Surv(futime, status) ~ 1, data = nafld1)

## Call: survfit(formula = Surv(futime, status) ~ 1, data = nafld1)
## 
##          n events median 0.95LCL 0.95UCL
## [1,] 17549   1364     NA      NA      NA

Interpretation:

The median survival time is NA because fewer than 50% of the population experienced the event (status = 1). As a result, the survival curve never dropped below 50%.

This is expected, since:

-   Only 1,364 out of 17,549 participants (~7.8%) had the event.

-   Over 92% were censored — likely still alive at end of follow-up.

nafld1 %>% 
  filter(status == 1) %>% 
  summarize(median_surv = median(futime))

##   median_surv
## 1      1686.5

2. Plot the survival curves of males and females, test the difference between the curves

library(survminer)

# Survival fit by sex (male: 1, female: 0)
fit_sex <- survfit(Surv(futime, status) ~ as.factor(male), data = nafld1)

# Plot
ggsurvplot(
  fit_sex,
  data = nafld1,
  pval = TRUE,           # Log-rank test p-value
  conf.int = TRUE,
  legend.labs = c("Female", "Male"),
  legend.title = "Sex",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  ggtheme = theme_minimal(),
  palette = c("#E69F00", "#56B4E9")
)

survdiff(Surv(futime, status) ~ male, data = nafld1)

## Call:
## survdiff(formula = Surv(futime, status) ~ male, data = nafld1)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## male=0 9348      674      736      5.20      11.3
## male=1 8201      690      628      6.09      11.3
## 
##  Chisq= 11.3  on 1 degrees of freedom, p= 8e-04

Interpretation: A significant p-value here indicates different survival between males and females. Males have slightly worse survival than females.

3. Examine each of other covariates (age, weight, height, bmi) and check if any covariate significantly affect survival

surv_object <- Surv(nafld1$futime, nafld1$status)
covariates <- c("age", "weight", "height", "bmi")

for (v in covariates) {
  model <- coxph(as.formula(paste0("surv_object ~ ", v)), data = nafld1)
  cat("\n\n-----", v, "-----\n")
  print(summary(model))
}

## 
## 
## ----- age -----
## Call:
## coxph(formula = as.formula(paste0("surv_object ~ ", v)), data = nafld1)
## 
##   n= 17549, number of events= 1364 
## 
##         coef exp(coef) se(coef)     z Pr(>|z|)    
## age 0.098273  1.103264 0.002229 44.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## age     1.103     0.9064     1.098     1.108
## 
## Concordance= 0.819  (se = 0.007 )
## Likelihood ratio test= 2252  on 1 df,   p=<2e-16
## Wald test            = 1943  on 1 df,   p=<2e-16
## Score (logrank) test = 2190  on 1 df,   p=<2e-16
## 
## 
## 
## ----- weight -----
## Call:
## coxph(formula = as.formula(paste0("surv_object ~ ", v)), data = nafld1)
## 
##   n= 12763, number of events= 1046 
##    (4786 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z Pr(>|z|)    
## weight -0.005457  0.994558  0.001474 -3.702 0.000214 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## weight    0.9946      1.005    0.9917    0.9974
## 
## Concordance= 0.544  (se = 0.01 )
## Likelihood ratio test= 14.1  on 1 df,   p=2e-04
## Wald test            = 13.7  on 1 df,   p=2e-04
## Score (logrank) test = 13.71  on 1 df,   p=2e-04
## 
## 
## 
## ----- height -----
## Call:
## coxph(formula = as.formula(paste0("surv_object ~ ", v)), data = nafld1)
## 
##   n= 14381, number of events= 1129 
##    (3168 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z Pr(>|z|)    
## height -0.022698  0.977558  0.002941 -7.718 1.18e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## height    0.9776      1.023    0.9719    0.9832
## 
## Concordance= 0.559  (se = 0.01 )
## Likelihood ratio test= 59.8  on 1 df,   p=1e-14
## Wald test            = 59.56  on 1 df,   p=1e-14
## Score (logrank) test = 59.56  on 1 df,   p=1e-14
## 
## 
## 
## ----- bmi -----
## Call:
## coxph(formula = as.formula(paste0("surv_object ~ ", v)), data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##          coef exp(coef)  se(coef)     z Pr(>|z|)
## bmi 0.0005176 1.0005177 0.0044921 0.115    0.908
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## bmi     1.001     0.9995    0.9917     1.009
## 
## Concordance= 0.485  (se = 0.011 )
## Likelihood ratio test= 0.01  on 1 df,   p=0.9
## Wald test            = 0.01  on 1 df,   p=0.9
## Score (logrank) test = 0.01  on 1 df,   p=0.9

Inferences:

Age: Strong, significant predictor. Each year increase in age raises hazard by 10.3% (HR = 1.103, p < 2e-16).
Weight: Small but significant inverse relationship (HR = 0.995, p = 0.0002).
Height: Taller individuals had lower hazard (HR = 0.978, p < 0.0001).
BMI: Not statistically significant (p = 0.91), and does not affect survival in this population

4. Report and provide rationale for the best multivariate Cox model.

fit_final <- coxph(Surv(futime, status) ~ age + male + height + weight, data = nafld1)
summary(fit_final)

## Call:
## coxph(formula = Surv(futime, status) ~ age + male + height + 
##     weight, data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z Pr(>|z|)    
## age     0.098361  1.103361  0.002769 35.521  < 2e-16 ***
## male    0.545900  1.726162  0.090392  6.039 1.55e-09 ***
## height -0.017951  0.982209  0.004597 -3.905 9.43e-05 ***
## weight  0.005686  1.005702  0.001832  3.104  0.00191 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## age       1.1034     0.9063    1.0974    1.1094
## male      1.7262     0.5793    1.4459    2.0607
## height    0.9822     1.0181    0.9734    0.9911
## weight    1.0057     0.9943    1.0021    1.0093
## 
## Concordance= 0.828  (se = 0.007 )
## Likelihood ratio test= 1758  on 4 df,   p=<2e-16
## Wald test            = 1490  on 4 df,   p=<2e-16
## Score (logrank) test = 1729  on 4 df,   p=<2e-16

cox.zph(fit_final)

##          chisq df      p
## age     2.3751  1 0.1233
## male    0.0274  1 0.8684
## height  1.4784  1 0.2240
## weight  4.3252  1 0.0376
## GLOBAL 13.4690  4 0.0092

AIC(fit_final, fit_simple = coxph(Surv(futime, status) ~ age + male, data = nafld1))

##                                                         df      AIC
## fit_final                                                4 15977.05
## coxph(Surv(futime, status) ~ age + male, data = nafld1)  2 22167.86

Rationale for Final Model

Age & Sex: Both highly significant (p < 0.001) with strong effect sizes and zero missingness—core predictors.
Height & Weight: Statistically significant (p < 0.001 and p = 0.002) and clinically relevant body‐size measures; including both retains a large sample (n ≈ 12,600).
Exclude BMI: Non‐significant and would further reduce sample size due to missing data.
Model Performance: Achieves highest concordance (0.83) and lowest AIC.
Assumptions: Minor PH violation for weight is acceptable; can be addressed in sensitivity analyses (e.g., time‐varying effect).

Survival_Analysis

Sreeja Yalamaddi

2025-05-12

1. Plot the overall survival curve, estimate median survival time, estimate survival at 1.5 year

2. Plot the survival curves of males and females, test the difference between the curves

3. Examine each of other covariates (age, weight, height, bmi) and check if any covariate significantly affect survival

4. Report and provide rationale for the best multivariate Cox model.