library(survival)
library(survminer)
## Warning: package 'survminer' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: ggpubr
## Warning: package 'ggpubr' was built under R version 4.4.3
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(ggsurvfit)
## Warning: package 'ggsurvfit' was built under R version 4.4.3
data(nafld, package = 'survival')
head(nafld1)

create a survival object

surv_object <- Surv(nafld1$futime, nafld1$status)

1. Plot the overall survival curve, estimate median survival time, estimate survival at 1.5 year

survfit(Surv(futime, status) ~ 1, data = nafld1) |>
  ggsurvfit() +
  labs(x = 'Days', y = 'Overall survival probability') + 
  add_confidence_interval()

survfit(surv_object ~ 1, data = nafld1)
## Call: survfit(formula = surv_object ~ 1, data = nafld1)
## 
##          n events median 0.95LCL 0.95UCL
## [1,] 17549   1364     NA      NA      NA
summary(survfit(surv_object ~ 1, data = nafld1), times = (547.5))
## Call: survfit(formula = surv_object ~ 1, data = nafld1)
## 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##   548  16305     246    0.986 0.000909        0.984        0.987

At 1.5 years (548 days), about 98.6% of people survived.

2. Plot the survival curves of males and females, test the difference between the curves

survfit(Surv(futime, status) ~ male, data = nafld1) |>
  ggsurvfit() +
  add_confidence_interval() +
  add_pvalue()
## ! `add_pvalue()` works with objects created with `survfit2()` or
##   `tidycmprsk::cuminc()`.
## ℹ `add_pvalue()` has been ignored.

survdiff(Surv(futime, status) ~ as.factor(male), data = nafld1)
## Call:
## survdiff(formula = Surv(futime, status) ~ as.factor(male), data = nafld1)
## 
##                      N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(male)=0 9348      674      736      5.20      11.3
## as.factor(male)=1 8201      690      628      6.09      11.3
## 
##  Chisq= 11.3  on 1 degrees of freedom, p= 8e-04

There’s a significant difference in survival between males and females (p = 0.0008), meaning sex affects survival in this dataset.

3. Examine each of other covariates (age, weight, height, bmi) and check if any covariate significantly affect survival

covariates <- c("age", "weight", "height", "bmi")
univariate_results <- list()

for (i in 1:length(covariates)) {
  model <- coxph(as.formula(paste("Surv(futime, status) ~", covariates[i])), data = nafld1)
  univariate_results[[i]] <- model
  cat(covariates[i], "p-value:",
      summary(model)$coefficients[,"Pr(>|z|)"], "\n")
}
## age p-value: 0 
## weight p-value: 0.0002140244 
## height p-value: 1.184007e-14 
## bmi p-value: 0.9082656

Age, weight, and height significantly affect survival, but BMI does not.

4. Report and provide rationale for the best multivariate Cox model.

full_model <- coxph(surv_object ~ age + weight + height + bmi + as.factor(male), data = nafld1)

summary(full_model)
## Call:
## coxph(formula = surv_object ~ age + weight + height + bmi + as.factor(male), 
##     data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##                       coef exp(coef)  se(coef)      z Pr(>|z|)    
## age               0.098330  1.103327  0.002771 35.483  < 2e-16 ***
## weight            0.009734  1.009781  0.012206  0.797    0.425    
## height           -0.022267  0.977979  0.013629 -1.634    0.102    
## bmi              -0.011134  0.988927  0.033234 -0.335    0.738    
## as.factor(male)1  0.550169  1.733547  0.091513  6.012 1.83e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                  exp(coef) exp(-coef) lower .95 upper .95
## age                 1.1033     0.9063    1.0974     1.109
## weight              1.0098     0.9903    0.9859     1.034
## height              0.9780     1.0225    0.9522     1.004
## bmi                 0.9889     1.0112    0.9266     1.055
## as.factor(male)1    1.7335     0.5769    1.4489     2.074
## 
## Concordance= 0.828  (se = 0.007 )
## Likelihood ratio test= 1758  on 5 df,   p=<2e-16
## Wald test            = 1489  on 5 df,   p=<2e-16
## Score (logrank) test = 1729  on 5 df,   p=<2e-16

In the full model, age and male significantly increase the risk of death, while weight, height, and BMI are not significant. The model fits well with a high concordance of 0.83.

# Quick clean and model
stepwise_model <- step(coxph(Surv(futime, status) ~ age + weight + height + bmi + as.factor(male), 
                             data = na.omit(nafld1)), 
                       direction = "both")
## Start:  AIC=15875.52
## Surv(futime, status) ~ age + weight + height + bmi + as.factor(male)
## 
##                   Df   AIC
## - bmi              1 15874
## - weight           1 15874
## <none>               15876
## - height           1 15877
## - as.factor(male)  1 15911
## - age              1 17416
## 
## Step:  AIC=15873.72
## Surv(futime, status) ~ age + weight + height + as.factor(male)
## 
##                   Df   AIC
## <none>               15874
## + bmi              1 15876
## - weight           1 15881
## - height           1 15888
## - as.factor(male)  1 15910
## - age              1 17419
summary(stepwise_model)
## Call:
## coxph(formula = Surv(futime, status) ~ age + weight + height + 
##     as.factor(male), data = na.omit(nafld1))
## 
##   n= 12562, number of events= 1012 
## 
##                       coef exp(coef)  se(coef)      z Pr(>|z|)    
## age               0.098329  1.103326  0.002774 35.452  < 2e-16 ***
## weight            0.005647  1.005663  0.001837  3.074  0.00211 ** 
## height           -0.018697  0.981477  0.004608 -4.058 4.95e-05 ***
## as.factor(male)1  0.564604  1.758752  0.090652  6.228 4.72e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                  exp(coef) exp(-coef) lower .95 upper .95
## age                 1.1033     0.9064    1.0973    1.1093
## weight              1.0057     0.9944    1.0020    1.0093
## height              0.9815     1.0189    0.9727    0.9904
## as.factor(male)1    1.7588     0.5686    1.4725    2.1007
## 
## Concordance= 0.828  (se = 0.007 )
## Likelihood ratio test= 1753  on 4 df,   p=<2e-16
## Wald test            = 1486  on 4 df,   p=<2e-16
## Score (logrank) test = 1726  on 4 df,   p=<2e-16

The final model shows that age, weight, height, and being male all affect survival. Age and being male have the biggest impact. BMI was dropped because it didn’t help the model. The model is strong with a concordance of 0.83.