install.packages("survminer")
## Installing package into 'C:/Users/lsunb/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'survminer' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\lsunb\AppData\Local\Temp\RtmpCqo4x0\downloaded_packages
install.packages("ggsurvfit")
## Installing package into 'C:/Users/lsunb/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'ggsurvfit' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\lsunb\AppData\Local\Temp\RtmpCqo4x0\downloaded_packages
library(survival)
library(survminer)
## Warning: package 'survminer' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: ggpubr
## Warning: package 'ggpubr' was built under R version 4.4.3
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(ggsurvfit)
## Warning: package 'ggsurvfit' was built under R version 4.4.3
data(nafld, package = "survival")
head(nafld1)
##       id age male weight height      bmi case.id futime status
## 3631   1  57    0   60.0    163 22.69094   10630   6261      0
## 8458   2  67    0   70.4    168 24.88403   14817    624      0
## 6298   3  53    1  105.8    186 30.45354       3   1783      0
## 15398  4  56    1  109.3    170 37.83010    6628   3143      0
## 13261  5  68    1     NA     NA       NA    1871   1836      1
## 14423  6  39    0   63.9    155 26.61559    7144   1581      0
str(nafld1)
## 'data.frame':    17549 obs. of  9 variables:
##  $ id     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age    : int  57 67 53 56 68 39 49 30 47 79 ...
##  $ male   : int  0 0 1 1 1 0 0 1 1 0 ...
##  $ weight : num  60 70.4 105.8 109.3 NA ...
##  $ height : int  163 168 186 170 NA 155 161 180 188 155 ...
##  $ bmi    : num  22.7 24.9 30.5 37.8 NA ...
##  $ case.id: int  10630 14817 3 6628 1871 7144 7507 13417 9 13518 ...
##  $ futime : int  6261 624 1783 3143 1836 1581 3109 1339 1671 2239 ...
##  $ status : int  0 0 0 0 1 0 0 0 0 1 ...
#Creating survival object
surv_object <- Surv(time = nafld1$futime, event = nafld1$status)
#1) Plot the overall survival curve, estimate median survival time, estimate survival at 1.5 year
survfit(surv_object ~ 1, data = nafld1) %>% 
  ggsurvfit() +
  labs(
    x = "Days",
    y = "Overall survival probability") +
  add_confidence_interval()

#Estimate median survival time
survfit(surv_object ~ 1, data = nafld1)
## Call: survfit(formula = surv_object ~ 1, data = nafld1)
## 
##          n events median 0.95LCL 0.95UCL
## [1,] 17549   1364     NA      NA      NA
#Median of the survival time is NA because there is never a point where the survival probability is 0.5. It is always above approximately 0.7.
#Estimate survival at 1.5 year
summary(survfit(surv_object ~ 1, data = nafld1), 
        times = (365.25*1.5))
## Call: survfit(formula = surv_object ~ 1, data = nafld1)
## 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##   548  16305     246    0.986 0.000909        0.984        0.987
#The surival probability at 1.5 years is approximately 98.6%.
#2) Plot the survival curves of males and females, test the difference between the curves
survfit(surv_object ~ as.factor(male), data = nafld1) %>% 
  ggsurvfit() +
  labs(
    x = "Time in Days",
    y = "Survival probability"
  )

#Test the difference between the curves
sex_diff_surv = survdiff(surv_object ~ as.factor(male), data = nafld1)
print(sex_diff_surv)
## Call:
## survdiff(formula = surv_object ~ as.factor(male), data = nafld1)
## 
##                      N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(male)=0 9348      674      736      5.20      11.3
## as.factor(male)=1 8201      690      628      6.09      11.3
## 
##  Chisq= 11.3  on 1 degrees of freedom, p= 8e-04
#There are 1147 more females to begin with. Also, there are 16 less females observed compared to the males. We see that the p value is 8e-04 which shows that there is a significance  difference in survival depending on the sex.
#3) Examine each of other covariates (age, weight, height, bmi) and check if any covariate significantly affect survival
age = survdiff(surv_object ~ age, data = nafld1)
age$pvalue
## [1] 0
weight = survdiff(surv_object ~ weight, data = nafld1)
weight$pvalue
## [1] 0
height = survdiff(surv_object ~ height, data = nafld1)
height$pvalue
## [1] 1.631926e-12
bmi = survdiff(surv_object ~ round(bmi, 1), data = nafld1)
bmi$pvalue
## [1] 2.70792e-54
# All the p-values for age, weight, height, bmi are les than 0.05. This means that thes covariates had significant impact on survival. 
#4) Report and provide rationale for the best multivariate Cox model.
cox5 = coxph(surv_object ~ male + age + bmi + height + weight, data = nafld1)
bic_5 = BIC(cox5)
aic_5 = AIC(cox5)

cox4 = coxph(surv_object ~ male + age + height + weight, data = nafld1)
bic_4 = BIC(cox4)
aic_4 = AIC(cox4)

cox3 = coxph(surv_object ~ male + age + bmi, data = nafld1)
bic_3 = BIC(cox3)
aic_3 = AIC(cox3)

cox2 = coxph(surv_object ~ male + age, data = nafld1)
bic_2 = BIC(cox2)
aic_2 = AIC(cox2)

cox1 = coxph(surv_object ~ male, data = nafld1)
bic_1 = BIC(cox1)
aic_1 = AIC(cox1)

cat("AIC values:\n")
## AIC values:
cat("cox1 (male):", aic_1, "\n")
## cox1 (male): 24453.86
cat("cox2 (male + age):", aic_2, "\n")
## cox2 (male + age): 22167.86
cat("cox3 (male + age + bmi):", aic_3, "\n")
## cox3 (male + age + bmi): 15982.66
cat("cox4 (male + age + height + weight):", aic_4, "\n")
## cox4 (male + age + height + weight): 15977.05
cat("cox5 (male + age + bmi + height + weight):", aic_5, "\n")
## cox5 (male + age + bmi + height + weight): 15978.94
# Print BICs
cat("\nBIC values:\n")
## 
## BIC values:
cat("cox1 (male):", bic_1, "\n")
## cox1 (male): 24459.08
cat("cox2 (male + age):", bic_2, "\n")
## cox2 (male + age): 22178.3
cat("cox3 (male + age + bmi):", bic_3, "\n")
## cox3 (male + age + bmi): 15997.44
cat("cox4 (male + age + height + weight):", bic_4, "\n")
## cox4 (male + age + height + weight): 15996.75
cat("cox5 (male + age + bmi + height + weight):", bic_5, "\n")
## cox5 (male + age + bmi + height + weight): 16003.56
#When I tested the multivariate Cox Model for the AIC and BIC, "cox4" had the lowest AIC and BIC. This models takes into consideration male, age, height, and weight. Therefore, the best multivariate Cox Model is "cox4" which inludes the variables: male, age, height and weight.