knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(survival)
## Warning: package 'survival' was built under R version 4.4.3
library(ggsurvfit)
## Warning: package 'ggsurvfit' was built under R version 4.4.3
## Loading required package: ggplot2
library(knitr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tibble)
library(RColorBrewer)
library(survminer)
## Warning: package 'survminer' was built under R version 4.4.3
## Loading required package: ggpubr
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.4.3
data(nafld, package = "survival")
nafld1 <- nafld1 %>%
mutate(sex = factor(male, labels = c("Female","Male")))
# Summary of key variables
glimpse(nafld1)
## Rows: 17,549
## Columns: 10
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ age <int> 57, 67, 53, 56, 68, 39, 49, 30, 47, 79, 47, 59, 54, 41, 51, 73…
## $ male <int> 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ weight <dbl> 60.0, 70.4, 105.8, 109.3, NA, 63.9, 66.2, NA, 110.8, 56.6, NA,…
## $ height <int> 163, 168, 186, 170, NA, 155, 161, 180, 188, 155, NA, 182, 167,…
## $ bmi <dbl> 22.69094, 24.88403, 30.45354, 37.83010, NA, 26.61559, 25.51934…
## $ case.id <int> 10630, 14817, 3, 6628, 1871, 7144, 7507, 13417, 9, 13518, 9700…
## $ futime <int> 6261, 624, 1783, 3143, 1836, 1581, 3109, 1339, 1671, 2239, 489…
## $ status <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sex <fct> Female, Female, Male, Male, Male, Female, Female, Male, Male, …
table(nafld1$status)
##
## 0 1
## 16185 1364
table(nafld1$sex)
##
## Female Male
## 9348 8201
summary(nafld1[, c("age","weight","height","bmi")])
## age weight height bmi
## Min. :18.00 Min. : 33.40 Min. :123.0 Min. : 9.207
## 1st Qu.:42.00 1st Qu.: 70.00 1st Qu.:162.0 1st Qu.:25.136
## Median :53.00 Median : 83.90 Median :169.0 Median :28.876
## Mean :52.66 Mean : 86.35 Mean :169.4 Mean :30.074
## 3rd Qu.:63.00 3rd Qu.: 99.20 3rd Qu.:177.0 3rd Qu.:33.710
## Max. :98.00 Max. :181.70 Max. :215.0 Max. :84.396
## NA's :4786 NA's :3168 NA's :4961
surv_fit <- survfit2(Surv(futime, status) ~ 1, data = nafld1)
surv_fit %>%
ggsurvfit() +
labs(
x = "Days",
y = "Overall survival probability",
title = "Kaplan–Meier Survival Curve with 95% CI"
) +
add_confidence_interval(fill = "lightblue", alpha = 0.5) +
add_risktable()
### Estimating 1.5-year survival
summary(survfit(Surv(futime, status) ~ 1, data = nafld1), times = 1.5*365.25)
## Call: survfit(formula = Surv(futime, status) ~ 1, data = nafld1)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 548 16305 246 0.986 0.000909 0.984 0.987
library(gtsummary)
survfit(Surv(futime, status) ~ 1, data = nafld1) %>%
tbl_survfit(
times = 1.5*365.25,
label_header = "**1.5 - year survival (95% CI)**"
)
| Characteristic | 1.5 - year survival (95% CI) |
|---|---|
| Overall | 99% (98%, 99%) |
survfit(Surv(futime, status) ~ 1, data = nafld1)
## Call: survfit(formula = Surv(futime, status) ~ 1, data = nafld1)
##
## n events median 0.95LCL 0.95UCL
## [1,] 17549 1364 NA NA NA
Interpretation:
This is expected, since:
- Only 1,364 out of 17,549 participants (~7.8%) had the event.
- Over 92% were censored — likely still alive at end of follow-up.
nafld1 %>%
filter(status == 1) %>%
summarize(median_surv = median(futime))
## median_surv
## 1 1686.5
library(survminer)
# Survival fit by sex (male: 1, female: 0)
fit_sex <- survfit(Surv(futime, status) ~ as.factor(male), data = nafld1)
# Plot
ggsurvplot(
fit_sex,
data = nafld1,
pval = TRUE, # Log-rank test p-value
conf.int = TRUE,
legend.labs = c("Female", "Male"),
legend.title = "Sex",
xlab = "Time (days)",
ylab = "Survival Probability",
ggtheme = theme_minimal(),
palette = c("#E69F00", "#56B4E9")
)
survdiff(Surv(futime, status) ~ male, data = nafld1)
## Call:
## survdiff(formula = Surv(futime, status) ~ male, data = nafld1)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## male=0 9348 674 736 5.20 11.3
## male=1 8201 690 628 6.09 11.3
##
## Chisq= 11.3 on 1 degrees of freedom, p= 8e-04
Interpretation: A significant p-value here indicates different survival between males and females. Males have slightly worse survival than females.
surv_object <- Surv(nafld1$futime, nafld1$status)
covariates <- c("age", "weight", "height", "bmi")
for (v in covariates) {
model <- coxph(as.formula(paste0("surv_object ~ ", v)), data = nafld1)
cat("\n\n-----", v, "-----\n")
print(summary(model))
}
##
##
## ----- age -----
## Call:
## coxph(formula = as.formula(paste0("surv_object ~ ", v)), data = nafld1)
##
## n= 17549, number of events= 1364
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.098273 1.103264 0.002229 44.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.103 0.9064 1.098 1.108
##
## Concordance= 0.819 (se = 0.007 )
## Likelihood ratio test= 2252 on 1 df, p=<2e-16
## Wald test = 1943 on 1 df, p=<2e-16
## Score (logrank) test = 2190 on 1 df, p=<2e-16
##
##
##
## ----- weight -----
## Call:
## coxph(formula = as.formula(paste0("surv_object ~ ", v)), data = nafld1)
##
## n= 12763, number of events= 1046
## (4786 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## weight -0.005457 0.994558 0.001474 -3.702 0.000214 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## weight 0.9946 1.005 0.9917 0.9974
##
## Concordance= 0.544 (se = 0.01 )
## Likelihood ratio test= 14.1 on 1 df, p=2e-04
## Wald test = 13.7 on 1 df, p=2e-04
## Score (logrank) test = 13.71 on 1 df, p=2e-04
##
##
##
## ----- height -----
## Call:
## coxph(formula = as.formula(paste0("surv_object ~ ", v)), data = nafld1)
##
## n= 14381, number of events= 1129
## (3168 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## height -0.022698 0.977558 0.002941 -7.718 1.18e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## height 0.9776 1.023 0.9719 0.9832
##
## Concordance= 0.559 (se = 0.01 )
## Likelihood ratio test= 59.8 on 1 df, p=1e-14
## Wald test = 59.56 on 1 df, p=1e-14
## Score (logrank) test = 59.56 on 1 df, p=1e-14
##
##
##
## ----- bmi -----
## Call:
## coxph(formula = as.formula(paste0("surv_object ~ ", v)), data = nafld1)
##
## n= 12588, number of events= 1018
## (4961 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## bmi 0.0005176 1.0005177 0.0044921 0.115 0.908
##
## exp(coef) exp(-coef) lower .95 upper .95
## bmi 1.001 0.9995 0.9917 1.009
##
## Concordance= 0.485 (se = 0.011 )
## Likelihood ratio test= 0.01 on 1 df, p=0.9
## Wald test = 0.01 on 1 df, p=0.9
## Score (logrank) test = 0.01 on 1 df, p=0.9
Inferences:
Age: Strong, significant predictor. Each year increase in age raises hazard by 10.3% (HR = 1.103, p < 2e-16).
Weight: Small but significant inverse relationship (HR = 0.995, p = 0.0002).
Height: Taller individuals had lower hazard (HR = 0.978, p < 0.0001).
BMI: Not statistically significant (p = 0.91), and does not affect survival in this population
fit_final <- coxph(Surv(futime, status) ~ age + male + height + weight, data = nafld1)
summary(fit_final)
## Call:
## coxph(formula = Surv(futime, status) ~ age + male + height +
## weight, data = nafld1)
##
## n= 12588, number of events= 1018
## (4961 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.098361 1.103361 0.002769 35.521 < 2e-16 ***
## male 0.545900 1.726162 0.090392 6.039 1.55e-09 ***
## height -0.017951 0.982209 0.004597 -3.905 9.43e-05 ***
## weight 0.005686 1.005702 0.001832 3.104 0.00191 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.1034 0.9063 1.0974 1.1094
## male 1.7262 0.5793 1.4459 2.0607
## height 0.9822 1.0181 0.9734 0.9911
## weight 1.0057 0.9943 1.0021 1.0093
##
## Concordance= 0.828 (se = 0.007 )
## Likelihood ratio test= 1758 on 4 df, p=<2e-16
## Wald test = 1490 on 4 df, p=<2e-16
## Score (logrank) test = 1729 on 4 df, p=<2e-16
cox.zph(fit_final)
## chisq df p
## age 2.3751 1 0.1233
## male 0.0274 1 0.8684
## height 1.4784 1 0.2240
## weight 4.3252 1 0.0376
## GLOBAL 13.4690 4 0.0092
AIC(fit_final, fit_simple = coxph(Surv(futime, status) ~ age + male, data = nafld1))
## df AIC
## fit_final 4 15977.05
## coxph(Surv(futime, status) ~ age + male, data = nafld1) 2 22167.86
Rationale for Final Model