knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(survival)
## Warning: package 'survival' was built under R version 4.4.3
library(ggsurvfit)
## Warning: package 'ggsurvfit' was built under R version 4.4.3
## Loading required package: ggplot2
library(knitr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tibble)
library(RColorBrewer) 
library(survminer)
## Warning: package 'survminer' was built under R version 4.4.3
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.4.3
data(nafld, package = "survival")
nafld1 <- nafld1 %>%
  mutate(sex = factor(male, labels = c("Female","Male")))

# Summary of key variables
glimpse(nafld1)
## Rows: 17,549
## Columns: 10
## $ id      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ age     <int> 57, 67, 53, 56, 68, 39, 49, 30, 47, 79, 47, 59, 54, 41, 51, 73…
## $ male    <int> 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ weight  <dbl> 60.0, 70.4, 105.8, 109.3, NA, 63.9, 66.2, NA, 110.8, 56.6, NA,…
## $ height  <int> 163, 168, 186, 170, NA, 155, 161, 180, 188, 155, NA, 182, 167,…
## $ bmi     <dbl> 22.69094, 24.88403, 30.45354, 37.83010, NA, 26.61559, 25.51934…
## $ case.id <int> 10630, 14817, 3, 6628, 1871, 7144, 7507, 13417, 9, 13518, 9700…
## $ futime  <int> 6261, 624, 1783, 3143, 1836, 1581, 3109, 1339, 1671, 2239, 489…
## $ status  <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sex     <fct> Female, Female, Male, Male, Male, Female, Female, Male, Male, …
table(nafld1$status)
## 
##     0     1 
## 16185  1364
table(nafld1$sex)
## 
## Female   Male 
##   9348   8201
summary(nafld1[, c("age","weight","height","bmi")])
##       age            weight           height           bmi        
##  Min.   :18.00   Min.   : 33.40   Min.   :123.0   Min.   : 9.207  
##  1st Qu.:42.00   1st Qu.: 70.00   1st Qu.:162.0   1st Qu.:25.136  
##  Median :53.00   Median : 83.90   Median :169.0   Median :28.876  
##  Mean   :52.66   Mean   : 86.35   Mean   :169.4   Mean   :30.074  
##  3rd Qu.:63.00   3rd Qu.: 99.20   3rd Qu.:177.0   3rd Qu.:33.710  
##  Max.   :98.00   Max.   :181.70   Max.   :215.0   Max.   :84.396  
##                  NA's   :4786     NA's   :3168    NA's   :4961

1. Plot the overall survival curve, estimate median survival time, estimate survival at 1.5 year

surv_fit <- survfit2(Surv(futime, status) ~ 1, data = nafld1)

surv_fit %>%
  ggsurvfit() +
  labs(
    x = "Days",
    y = "Overall survival probability",
    title = "Kaplan–Meier Survival Curve with 95% CI"
  ) +
  add_confidence_interval(fill = "lightblue", alpha = 0.5) + 
  add_risktable()

### Estimating 1.5-year survival

summary(survfit(Surv(futime, status) ~ 1, data = nafld1), times = 1.5*365.25)
## Call: survfit(formula = Surv(futime, status) ~ 1, data = nafld1)
## 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##   548  16305     246    0.986 0.000909        0.984        0.987
library(gtsummary)
survfit(Surv(futime, status) ~ 1, data = nafld1) %>% 
  tbl_survfit(
    times = 1.5*365.25,
    label_header = "**1.5 - year survival (95% CI)**"
  )
Characteristic 1.5 - year survival (95% CI)
Overall 99% (98%, 99%)
survfit(Surv(futime, status) ~ 1, data = nafld1)
## Call: survfit(formula = Surv(futime, status) ~ 1, data = nafld1)
## 
##          n events median 0.95LCL 0.95UCL
## [1,] 17549   1364     NA      NA      NA

Interpretation:

This is expected, since:

-   Only 1,364 out of 17,549 participants (~7.8%) had the event.

-   Over 92% were censored — likely still alive at end of follow-up.
nafld1 %>% 
  filter(status == 1) %>% 
  summarize(median_surv = median(futime))
##   median_surv
## 1      1686.5

2. Plot the survival curves of males and females, test the difference between the curves

library(survminer)

# Survival fit by sex (male: 1, female: 0)
fit_sex <- survfit(Surv(futime, status) ~ as.factor(male), data = nafld1)

# Plot
ggsurvplot(
  fit_sex,
  data = nafld1,
  pval = TRUE,           # Log-rank test p-value
  conf.int = TRUE,
  legend.labs = c("Female", "Male"),
  legend.title = "Sex",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  ggtheme = theme_minimal(),
  palette = c("#E69F00", "#56B4E9")
)

survdiff(Surv(futime, status) ~ male, data = nafld1)
## Call:
## survdiff(formula = Surv(futime, status) ~ male, data = nafld1)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## male=0 9348      674      736      5.20      11.3
## male=1 8201      690      628      6.09      11.3
## 
##  Chisq= 11.3  on 1 degrees of freedom, p= 8e-04

Interpretation: A significant p-value here indicates different survival between males and females. Males have slightly worse survival than females.

3. Examine each of other covariates (age, weight, height, bmi) and check if any covariate significantly affect survival

surv_object <- Surv(nafld1$futime, nafld1$status)
covariates <- c("age", "weight", "height", "bmi")

for (v in covariates) {
  model <- coxph(as.formula(paste0("surv_object ~ ", v)), data = nafld1)
  cat("\n\n-----", v, "-----\n")
  print(summary(model))
}
## 
## 
## ----- age -----
## Call:
## coxph(formula = as.formula(paste0("surv_object ~ ", v)), data = nafld1)
## 
##   n= 17549, number of events= 1364 
## 
##         coef exp(coef) se(coef)     z Pr(>|z|)    
## age 0.098273  1.103264 0.002229 44.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## age     1.103     0.9064     1.098     1.108
## 
## Concordance= 0.819  (se = 0.007 )
## Likelihood ratio test= 2252  on 1 df,   p=<2e-16
## Wald test            = 1943  on 1 df,   p=<2e-16
## Score (logrank) test = 2190  on 1 df,   p=<2e-16
## 
## 
## 
## ----- weight -----
## Call:
## coxph(formula = as.formula(paste0("surv_object ~ ", v)), data = nafld1)
## 
##   n= 12763, number of events= 1046 
##    (4786 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z Pr(>|z|)    
## weight -0.005457  0.994558  0.001474 -3.702 0.000214 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## weight    0.9946      1.005    0.9917    0.9974
## 
## Concordance= 0.544  (se = 0.01 )
## Likelihood ratio test= 14.1  on 1 df,   p=2e-04
## Wald test            = 13.7  on 1 df,   p=2e-04
## Score (logrank) test = 13.71  on 1 df,   p=2e-04
## 
## 
## 
## ----- height -----
## Call:
## coxph(formula = as.formula(paste0("surv_object ~ ", v)), data = nafld1)
## 
##   n= 14381, number of events= 1129 
##    (3168 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z Pr(>|z|)    
## height -0.022698  0.977558  0.002941 -7.718 1.18e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## height    0.9776      1.023    0.9719    0.9832
## 
## Concordance= 0.559  (se = 0.01 )
## Likelihood ratio test= 59.8  on 1 df,   p=1e-14
## Wald test            = 59.56  on 1 df,   p=1e-14
## Score (logrank) test = 59.56  on 1 df,   p=1e-14
## 
## 
## 
## ----- bmi -----
## Call:
## coxph(formula = as.formula(paste0("surv_object ~ ", v)), data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##          coef exp(coef)  se(coef)     z Pr(>|z|)
## bmi 0.0005176 1.0005177 0.0044921 0.115    0.908
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## bmi     1.001     0.9995    0.9917     1.009
## 
## Concordance= 0.485  (se = 0.011 )
## Likelihood ratio test= 0.01  on 1 df,   p=0.9
## Wald test            = 0.01  on 1 df,   p=0.9
## Score (logrank) test = 0.01  on 1 df,   p=0.9

Inferences:

4. Report and provide rationale for the best multivariate Cox model.

fit_final <- coxph(Surv(futime, status) ~ age + male + height + weight, data = nafld1)
summary(fit_final)
## Call:
## coxph(formula = Surv(futime, status) ~ age + male + height + 
##     weight, data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z Pr(>|z|)    
## age     0.098361  1.103361  0.002769 35.521  < 2e-16 ***
## male    0.545900  1.726162  0.090392  6.039 1.55e-09 ***
## height -0.017951  0.982209  0.004597 -3.905 9.43e-05 ***
## weight  0.005686  1.005702  0.001832  3.104  0.00191 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## age       1.1034     0.9063    1.0974    1.1094
## male      1.7262     0.5793    1.4459    2.0607
## height    0.9822     1.0181    0.9734    0.9911
## weight    1.0057     0.9943    1.0021    1.0093
## 
## Concordance= 0.828  (se = 0.007 )
## Likelihood ratio test= 1758  on 4 df,   p=<2e-16
## Wald test            = 1490  on 4 df,   p=<2e-16
## Score (logrank) test = 1729  on 4 df,   p=<2e-16
cox.zph(fit_final)
##          chisq df      p
## age     2.3751  1 0.1233
## male    0.0274  1 0.8684
## height  1.4784  1 0.2240
## weight  4.3252  1 0.0376
## GLOBAL 13.4690  4 0.0092
AIC(fit_final, fit_simple = coxph(Surv(futime, status) ~ age + male, data = nafld1))
##                                                         df      AIC
## fit_final                                                4 15977.05
## coxph(Surv(futime, status) ~ age + male, data = nafld1)  2 22167.86

Rationale for Final Model