Libraries
library(survival)
library(ggsurvfit)
## Loading required package: ggplot2
library(gtsummary)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
Load the data
data(nafld, package = "survival")
head(nafld1)
Create a survival object
surv_object = Surv(nafld1$futime, nafld1$status)
survfit(surv_object ~ 1, data = nafld1) %>%
ggsurvfit() +
labs(
x = "Days",
y = "Overall survival probability") +
add_confidence_interval()
Estimating median survival time
survfit(surv_object ~ 1, data = nafld1)
## Call: survfit(formula = surv_object ~ 1, data = nafld1)
##
## n events median 0.95LCL 0.95UCL
## [1,] 17549 1364 NA NA NA
We cannot estimate the median survival time because at no point is there a survival probability of 0.5. At it’s lowest, the survival probability barely drops below 0.7
The estimated survival at 1.5 years is 98.6%
summary(survfit(surv_object ~ 1, data = nafld1), times = (365.25*1.5))
## Call: survfit(formula = surv_object ~ 1, data = nafld1)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 548 16305 246 0.986 0.000909 0.984 0.987
survfit(surv_object ~ as.factor(male), data = nafld1) %>%
ggsurvfit() +
labs(
x = "Days",
y = "Overall survival probability"
) +
add_confidence_interval()
survdiff(surv_object ~ as.factor(male), data = nafld1)
## Call:
## survdiff(formula = surv_object ~ as.factor(male), data = nafld1)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(male)=0 9348 674 736 5.20 11.3
## as.factor(male)=1 8201 690 628 6.09 11.3
##
## Chisq= 11.3 on 1 degrees of freedom, p= 8e-04
We see that there was a significant difference in overall survival according to sex in the nafld1 data, with a p-value of p = 0.0008.
age_fit = survdiff(surv_object ~ age, data = nafld1)
age_fit$pvalue
## [1] 0
weight_fit = survdiff(surv_object ~ weight, data = nafld1)
weight_fit$pvalue
## [1] 0
height_fit = survdiff(surv_object ~ height, data = nafld1)
height_fit$pvalue
## [1] 1.631926e-12
bmi_fit = survdiff(surv_object ~ round(bmi, 1), data = nafld1)
bmi_fit$pvalue
## [1] 2.70792e-54
All four of these covariates significantly affect survival.
coxph(surv_object ~ male + age + height + weight, data = nafld1) %>%
tbl_regression(exp=TRUE)
Characteristic | HR | 95% CI | p-value |
---|---|---|---|
male | 1.73 | 1.45, 2.06 | <0.001 |
age | 1.10 | 1.10, 1.11 | <0.001 |
height | 0.98 | 0.97, 0.99 | <0.001 |
weight | 1.01 | 1.00, 1.01 | 0.002 |
Abbreviations: CI = Confidence Interval, HR = Hazard Ratio |
The best cox model is
surv_object ~ male + age + height + weight
because it has
the lowest AIC and BIC.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:gtsummary':
##
## select
full_cox = coxph(surv_object ~ male + age + bmi + height + weight, data = nafld1)
full_bic = BIC(full_cox)
aic_full = AIC(full_cox)
cox4 = coxph(surv_object ~ male + age + height + weight, data = nafld1)
bic_4 = BIC(cox4)
aic_4 = AIC(cox4)
cox3 = coxph(surv_object ~ male + age + bmi, data = nafld1)
bic_3 = BIC(cox3)
aic_3 = AIC(cox3)
cox2 = coxph(surv_object ~ male + age, data = nafld1)
bic_2 = BIC(cox2)
aic_2 = AIC(cox2)
cox1 = coxph(surv_object ~ male, data = nafld1)
bic_1 = BIC(cox1)
aic_1 = AIC(cox1)