library(survival)
library(survminer)
## Warning: package 'survminer' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: ggpubr
## Warning: package 'ggpubr' was built under R version 4.4.3
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(ggsurvfit)
## Warning: package 'ggsurvfit' was built under R version 4.4.3
data(nafld1, package = "survival")
## Warning in data(nafld1, package = "survival"): data set 'nafld1' not found
head(nafld1)
##       id age male weight height      bmi case.id futime status
## 3631   1  57    0   60.0    163 22.69094   10630   6261      0
## 8458   2  67    0   70.4    168 24.88403   14817    624      0
## 6298   3  53    1  105.8    186 30.45354       3   1783      0
## 15398  4  56    1  109.3    170 37.83010    6628   3143      0
## 13261  5  68    1     NA     NA       NA    1871   1836      1
## 14423  6  39    0   63.9    155 26.61559    7144   1581      0
surv_obj = Surv(nafld1$futime, nafld1$status)
fit = survfit(surv_obj ~ 1)

1.Plot the overall survival curve, estimate median survival time, estimate survival at 1.5 year

survfit2(surv_obj ~ 1, data = nafld1) |>
  ggsurvfit() 

median = summary(fit)$table["median"]
cat("Median Survival Time (days):", median, "\n")
## Median Survival Time (days): NA
time = 1.5 * 365.25
surv = summary(fit, times = time)$surv
cat("Estimated survival probability at 1.5 years:", round(surv, 4), "\n")
## Estimated survival probability at 1.5 years: 0.9856

At no point does the survival probability reach 50% therefore we cannot calculate the median survival time.

2. Plot the survival curves of males and females, test the difference between the curves

survfit2(surv_obj ~ male, data = nafld1) |>
  ggsurvfit() +
  labs(
    x = "Time (days)",
    y = "Survival Probability",
    color = "Sex (0 = Female, 1 = Male)"
  )

sex_diff = survdiff(surv_obj ~ male, data = nafld1)
print(sex_diff)
## Call:
## survdiff(formula = surv_obj ~ male, data = nafld1)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## male=0 9348      674      736      5.20      11.3
## male=1 8201      690      628      6.09      11.3
## 
##  Chisq= 11.3  on 1 degrees of freedom, p= 8e-04

The study started with 1147 more females than males, the females had fewer observed events than expected where males had more observed events than expected and sex is a significant statistic.

3. Examine each of other covariates (age, weight, height, bmi) and check if any covariate significantly affect survival

cox_age    = coxph(surv_obj ~ age, data = nafld1)
cox_weight = coxph(surv_obj ~ weight, data = nafld1)
cox_height = coxph(surv_obj ~ height, data = nafld1)
cox_bmi    = coxph(surv_obj ~ bmi, data = nafld1)

summary(cox_age)
## Call:
## coxph(formula = surv_obj ~ age, data = nafld1)
## 
##   n= 17549, number of events= 1364 
## 
##         coef exp(coef) se(coef)     z Pr(>|z|)    
## age 0.098273  1.103264 0.002229 44.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## age     1.103     0.9064     1.098     1.108
## 
## Concordance= 0.819  (se = 0.007 )
## Likelihood ratio test= 2252  on 1 df,   p=<2e-16
## Wald test            = 1943  on 1 df,   p=<2e-16
## Score (logrank) test = 2190  on 1 df,   p=<2e-16
summary(cox_weight)
## Call:
## coxph(formula = surv_obj ~ weight, data = nafld1)
## 
##   n= 12763, number of events= 1046 
##    (4786 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z Pr(>|z|)    
## weight -0.005457  0.994558  0.001474 -3.702 0.000214 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## weight    0.9946      1.005    0.9917    0.9974
## 
## Concordance= 0.544  (se = 0.01 )
## Likelihood ratio test= 14.1  on 1 df,   p=2e-04
## Wald test            = 13.7  on 1 df,   p=2e-04
## Score (logrank) test = 13.71  on 1 df,   p=2e-04
summary(cox_height)
## Call:
## coxph(formula = surv_obj ~ height, data = nafld1)
## 
##   n= 14381, number of events= 1129 
##    (3168 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z Pr(>|z|)    
## height -0.022698  0.977558  0.002941 -7.718 1.18e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## height    0.9776      1.023    0.9719    0.9832
## 
## Concordance= 0.559  (se = 0.01 )
## Likelihood ratio test= 59.8  on 1 df,   p=1e-14
## Wald test            = 59.56  on 1 df,   p=1e-14
## Score (logrank) test = 59.56  on 1 df,   p=1e-14
summary(cox_bmi)
## Call:
## coxph(formula = surv_obj ~ bmi, data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##          coef exp(coef)  se(coef)     z Pr(>|z|)
## bmi 0.0005176 1.0005177 0.0044921 0.115    0.908
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## bmi     1.001     0.9995    0.9917     1.009
## 
## Concordance= 0.485  (se = 0.011 )
## Likelihood ratio test= 0.01  on 1 df,   p=0.9
## Wald test            = 0.01  on 1 df,   p=0.9
## Score (logrank) test = 0.01  on 1 df,   p=0.9

Report and provide rationale for the best multivariate Cox model.

cox_final = coxph(surv_obj ~ male + age + bmi, data = nafld1)
summary(cox_final)
## Call:
## coxph(formula = surv_obj ~ male + age + bmi, data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##          coef exp(coef) se(coef)      z Pr(>|z|)    
## male 0.366140  1.442158 0.062849  5.826 5.69e-09 ***
## age  0.100601  1.105835 0.002650 37.967  < 2e-16 ***
## bmi  0.017022  1.017168 0.004945  3.443 0.000576 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## male     1.442     0.6934     1.275     1.631
## age      1.106     0.9043     1.100     1.112
## bmi      1.017     0.9831     1.007     1.027
## 
## Concordance= 0.827  (se = 0.007 )
## Likelihood ratio test= 1750  on 3 df,   p=<2e-16
## Wald test            = 1472  on 3 df,   p=<2e-16
## Score (logrank) test = 1699  on 3 df,   p=<2e-16

I found to use Male, Age, Height, Weight, and BMI, that the first three were significant. Removing BMI made all four significant with weight just squeezing under the cutoff. But to remove Height and Weight and use BMI with Age and Male made all three more significant that any other their numbers on the other models.