library(survival)
library(survminer)
## Warning: package 'survminer' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: ggpubr
## Warning: package 'ggpubr' was built under R version 4.4.3
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
library(ggsurvfit)
## Warning: package 'ggsurvfit' was built under R version 4.4.3
data(nafld1, package = "survival")
## Warning in data(nafld1, package = "survival"): data set 'nafld1' not found
head(nafld1)
## id age male weight height bmi case.id futime status
## 3631 1 57 0 60.0 163 22.69094 10630 6261 0
## 8458 2 67 0 70.4 168 24.88403 14817 624 0
## 6298 3 53 1 105.8 186 30.45354 3 1783 0
## 15398 4 56 1 109.3 170 37.83010 6628 3143 0
## 13261 5 68 1 NA NA NA 1871 1836 1
## 14423 6 39 0 63.9 155 26.61559 7144 1581 0
surv_obj = Surv(nafld1$futime, nafld1$status)
fit = survfit(surv_obj ~ 1)
1.Plot the overall survival curve, estimate median survival time, estimate survival at 1.5 year
survfit2(surv_obj ~ 1, data = nafld1) |>
ggsurvfit()
median = summary(fit)$table["median"]
cat("Median Survival Time (days):", median, "\n")
## Median Survival Time (days): NA
time = 1.5 * 365.25
surv = summary(fit, times = time)$surv
cat("Estimated survival probability at 1.5 years:", round(surv, 4), "\n")
## Estimated survival probability at 1.5 years: 0.9856
At no point does the survival probability reach 50% therefore we cannot calculate the median survival time.
2. Plot the survival curves of males and females, test the difference between the curves
survfit2(surv_obj ~ male, data = nafld1) |>
ggsurvfit() +
labs(
x = "Time (days)",
y = "Survival Probability",
color = "Sex (0 = Female, 1 = Male)"
)
sex_diff = survdiff(surv_obj ~ male, data = nafld1)
print(sex_diff)
## Call:
## survdiff(formula = surv_obj ~ male, data = nafld1)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## male=0 9348 674 736 5.20 11.3
## male=1 8201 690 628 6.09 11.3
##
## Chisq= 11.3 on 1 degrees of freedom, p= 8e-04
The study started with 1147 more females than males, the females had fewer observed events than expected where males had more observed events than expected and sex is a significant statistic.
3. Examine each of other covariates (age, weight, height, bmi) and check if any covariate significantly affect survival
cox_age = coxph(surv_obj ~ age, data = nafld1)
cox_weight = coxph(surv_obj ~ weight, data = nafld1)
cox_height = coxph(surv_obj ~ height, data = nafld1)
cox_bmi = coxph(surv_obj ~ bmi, data = nafld1)
summary(cox_age)
## Call:
## coxph(formula = surv_obj ~ age, data = nafld1)
##
## n= 17549, number of events= 1364
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.098273 1.103264 0.002229 44.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.103 0.9064 1.098 1.108
##
## Concordance= 0.819 (se = 0.007 )
## Likelihood ratio test= 2252 on 1 df, p=<2e-16
## Wald test = 1943 on 1 df, p=<2e-16
## Score (logrank) test = 2190 on 1 df, p=<2e-16
summary(cox_weight)
## Call:
## coxph(formula = surv_obj ~ weight, data = nafld1)
##
## n= 12763, number of events= 1046
## (4786 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## weight -0.005457 0.994558 0.001474 -3.702 0.000214 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## weight 0.9946 1.005 0.9917 0.9974
##
## Concordance= 0.544 (se = 0.01 )
## Likelihood ratio test= 14.1 on 1 df, p=2e-04
## Wald test = 13.7 on 1 df, p=2e-04
## Score (logrank) test = 13.71 on 1 df, p=2e-04
summary(cox_height)
## Call:
## coxph(formula = surv_obj ~ height, data = nafld1)
##
## n= 14381, number of events= 1129
## (3168 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## height -0.022698 0.977558 0.002941 -7.718 1.18e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## height 0.9776 1.023 0.9719 0.9832
##
## Concordance= 0.559 (se = 0.01 )
## Likelihood ratio test= 59.8 on 1 df, p=1e-14
## Wald test = 59.56 on 1 df, p=1e-14
## Score (logrank) test = 59.56 on 1 df, p=1e-14
summary(cox_bmi)
## Call:
## coxph(formula = surv_obj ~ bmi, data = nafld1)
##
## n= 12588, number of events= 1018
## (4961 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## bmi 0.0005176 1.0005177 0.0044921 0.115 0.908
##
## exp(coef) exp(-coef) lower .95 upper .95
## bmi 1.001 0.9995 0.9917 1.009
##
## Concordance= 0.485 (se = 0.011 )
## Likelihood ratio test= 0.01 on 1 df, p=0.9
## Wald test = 0.01 on 1 df, p=0.9
## Score (logrank) test = 0.01 on 1 df, p=0.9
Report and provide rationale for the best multivariate Cox model.
cox_final = coxph(surv_obj ~ male + age + bmi, data = nafld1)
summary(cox_final)
## Call:
## coxph(formula = surv_obj ~ male + age + bmi, data = nafld1)
##
## n= 12588, number of events= 1018
## (4961 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## male 0.366140 1.442158 0.062849 5.826 5.69e-09 ***
## age 0.100601 1.105835 0.002650 37.967 < 2e-16 ***
## bmi 0.017022 1.017168 0.004945 3.443 0.000576 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## male 1.442 0.6934 1.275 1.631
## age 1.106 0.9043 1.100 1.112
## bmi 1.017 0.9831 1.007 1.027
##
## Concordance= 0.827 (se = 0.007 )
## Likelihood ratio test= 1750 on 3 df, p=<2e-16
## Wald test = 1472 on 3 df, p=<2e-16
## Score (logrank) test = 1699 on 3 df, p=<2e-16
I found to use Male, Age, Height, Weight, and BMI, that the first three were significant. Removing BMI made all four significant with weight just squeezing under the cutoff. But to remove Height and Weight and use BMI with Age and Male made all three more significant that any other their numbers on the other models.