install.packages("survminer")
## Installing package into 'C:/Users/lsunb/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'survminer' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\lsunb\AppData\Local\Temp\RtmpCqo4x0\downloaded_packages
install.packages("ggsurvfit")
## Installing package into 'C:/Users/lsunb/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'ggsurvfit' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\lsunb\AppData\Local\Temp\RtmpCqo4x0\downloaded_packages
library(survival)
library(survminer)
## Warning: package 'survminer' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: ggpubr
## Warning: package 'ggpubr' was built under R version 4.4.3
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
library(ggsurvfit)
## Warning: package 'ggsurvfit' was built under R version 4.4.3
data(nafld, package = "survival")
head(nafld1)
## id age male weight height bmi case.id futime status
## 3631 1 57 0 60.0 163 22.69094 10630 6261 0
## 8458 2 67 0 70.4 168 24.88403 14817 624 0
## 6298 3 53 1 105.8 186 30.45354 3 1783 0
## 15398 4 56 1 109.3 170 37.83010 6628 3143 0
## 13261 5 68 1 NA NA NA 1871 1836 1
## 14423 6 39 0 63.9 155 26.61559 7144 1581 0
str(nafld1)
## 'data.frame': 17549 obs. of 9 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 57 67 53 56 68 39 49 30 47 79 ...
## $ male : int 0 0 1 1 1 0 0 1 1 0 ...
## $ weight : num 60 70.4 105.8 109.3 NA ...
## $ height : int 163 168 186 170 NA 155 161 180 188 155 ...
## $ bmi : num 22.7 24.9 30.5 37.8 NA ...
## $ case.id: int 10630 14817 3 6628 1871 7144 7507 13417 9 13518 ...
## $ futime : int 6261 624 1783 3143 1836 1581 3109 1339 1671 2239 ...
## $ status : int 0 0 0 0 1 0 0 0 0 1 ...
#Creating survival object
surv_object <- Surv(time = nafld1$futime, event = nafld1$status)
#1) Plot the overall survival curve, estimate median survival time, estimate survival at 1.5 year
survfit(surv_object ~ 1, data = nafld1) %>%
ggsurvfit() +
labs(
x = "Days",
y = "Overall survival probability") +
add_confidence_interval()

#Estimate median survival time
survfit(surv_object ~ 1, data = nafld1)
## Call: survfit(formula = surv_object ~ 1, data = nafld1)
##
## n events median 0.95LCL 0.95UCL
## [1,] 17549 1364 NA NA NA
#Median of the survival time is NA because there is never a point where the survival probability is 0.5. It is always above approximately 0.7.
#Estimate survival at 1.5 year
summary(survfit(surv_object ~ 1, data = nafld1),
times = (365.25*1.5))
## Call: survfit(formula = surv_object ~ 1, data = nafld1)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 548 16305 246 0.986 0.000909 0.984 0.987
#The surival probability at 1.5 years is approximately 98.6%.
#2) Plot the survival curves of males and females, test the difference between the curves
survfit(surv_object ~ as.factor(male), data = nafld1) %>%
ggsurvfit() +
labs(
x = "Time in Days",
y = "Survival probability"
)

#Test the difference between the curves
sex_diff_surv = survdiff(surv_object ~ as.factor(male), data = nafld1)
print(sex_diff_surv)
## Call:
## survdiff(formula = surv_object ~ as.factor(male), data = nafld1)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(male)=0 9348 674 736 5.20 11.3
## as.factor(male)=1 8201 690 628 6.09 11.3
##
## Chisq= 11.3 on 1 degrees of freedom, p= 8e-04
#There are 1147 more females to begin with. Also, there are 16 less females observed compared to the males. We see that the p value is 8e-04 which shows that there is a significance difference in survival depending on the sex.
#3) Examine each of other covariates (age, weight, height, bmi) and check if any covariate significantly affect survival
age = survdiff(surv_object ~ age, data = nafld1)
age$pvalue
## [1] 0
weight = survdiff(surv_object ~ weight, data = nafld1)
weight$pvalue
## [1] 0
height = survdiff(surv_object ~ height, data = nafld1)
height$pvalue
## [1] 1.631926e-12
bmi = survdiff(surv_object ~ round(bmi, 1), data = nafld1)
bmi$pvalue
## [1] 2.70792e-54
# All the p-values for age, weight, height, bmi are les than 0.05. This means that thes covariates had significant impact on survival.
#4) Report and provide rationale for the best multivariate Cox model.
cox5 = coxph(surv_object ~ male + age + bmi + height + weight, data = nafld1)
bic_5 = BIC(cox5)
aic_5 = AIC(cox5)
cox4 = coxph(surv_object ~ male + age + height + weight, data = nafld1)
bic_4 = BIC(cox4)
aic_4 = AIC(cox4)
cox3 = coxph(surv_object ~ male + age + bmi, data = nafld1)
bic_3 = BIC(cox3)
aic_3 = AIC(cox3)
cox2 = coxph(surv_object ~ male + age, data = nafld1)
bic_2 = BIC(cox2)
aic_2 = AIC(cox2)
cox1 = coxph(surv_object ~ male, data = nafld1)
bic_1 = BIC(cox1)
aic_1 = AIC(cox1)
cat("AIC values:\n")
## AIC values:
cat("cox1 (male):", aic_1, "\n")
## cox1 (male): 24453.86
cat("cox2 (male + age):", aic_2, "\n")
## cox2 (male + age): 22167.86
cat("cox3 (male + age + bmi):", aic_3, "\n")
## cox3 (male + age + bmi): 15982.66
cat("cox4 (male + age + height + weight):", aic_4, "\n")
## cox4 (male + age + height + weight): 15977.05
cat("cox5 (male + age + bmi + height + weight):", aic_5, "\n")
## cox5 (male + age + bmi + height + weight): 15978.94
# Print BICs
cat("\nBIC values:\n")
##
## BIC values:
cat("cox1 (male):", bic_1, "\n")
## cox1 (male): 24459.08
cat("cox2 (male + age):", bic_2, "\n")
## cox2 (male + age): 22178.3
cat("cox3 (male + age + bmi):", bic_3, "\n")
## cox3 (male + age + bmi): 15997.44
cat("cox4 (male + age + height + weight):", bic_4, "\n")
## cox4 (male + age + height + weight): 15996.75
cat("cox5 (male + age + bmi + height + weight):", bic_5, "\n")
## cox5 (male + age + bmi + height + weight): 16003.56
#When I tested the multivariate Cox Model for the AIC and BIC, "cox4" had the lowest AIC and BIC. This models takes into consideration male, age, height, and weight. Therefore, the best multivariate Cox Model is "cox4" which inludes the variables: male, age, height and weight.