surv_object <- Surv(time = nafld1$futime, event = nafld1$status)

head(nafld1)
##       id age male weight height      bmi case.id futime status    sex
## 3631   1  57    0   60.0    163 22.69094   10630   6261      0 Female
## 8458   2  67    0   70.4    168 24.88403   14817    624      0 Female
## 6298   3  53    1  105.8    186 30.45354       3   1783      0   Male
## 15398  4  56    1  109.3    170 37.83010    6628   3143      0   Male
## 13261  5  68    1     NA     NA       NA    1871   1836      1   Male
## 14423  6  39    0   63.9    155 26.61559    7144   1581      0 Female

Plot the overall survival curve, estimate median survival time, estimate survival at 1.5 year

surv_object <- Surv(nafld1$futime, nafld1$status)

fit_overall <- survfit(surv_object ~ 1, data = nafld1)

plot_overall <- ggsurvplot(
  fit_overall,
  data = nafld1,
  conf.int = TRUE,
  risk.table = TRUE,
  xlab = "Days",
  ylab = "Survival Probability",
  title = "Overall Survival Curve for NAFLD Study Subjects",
  palette = "#E41A1C",
  break.time.by = 2000,
  risk.table.height = 0.25,
  risk.table.title = "Number at Risk",
  risk.table.y.text = FALSE,
  conf.int.alpha = 0.15,
  ggtheme = theme_classic(base_size = 14),
  tables.theme = theme_classic(base_size = 12)
)

plot_overall$plot <- plot_overall$plot +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.title = element_text(face = "bold"),
    legend.position = "top"
  )

plot_overall$table <- plot_overall$table +
  theme(
    axis.title.x = element_text(face = "bold"),
    plot.title = element_text(face = "bold")
  )

plot_overall

median_survival <- summary(fit_overall)$table["median"]

median_survival
## median 
##     NA
survival_1_5_years <- summary(fit_overall, times = 1.5 * 365.25)

survival_1_5_years
## Call: survfit(formula = surv_object ~ 1, data = nafld1)
## 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##   548  16305     246    0.986 0.000909        0.984        0.987

Overall Survival Curve, Median Survival, and Survival at 1.5 Years

The Kaplan-Meier curve shows how survival changes over time for the group. Basically, it tracks the proportion of people who are still alive as time goes on.

In this case, the survival curve stays pretty high and never drops below 50%. Because of that, the median survival time is not reached. This just means that more than half of the people were still alive by the end of the study, so we can’t find the exact point where survival hits 50%.

Survival at 1.5 years was calculated using about 548 days. This gives an estimate of how many people are still alive at that specific time point. It’s basically just reading the survival probability off the curve at that time.

Plot the survival curves of males and females, test the difference between the curves

fit_sex <- survfit(Surv(futime, status) ~ sex, data = nafld1)

plot_sex <- ggsurvplot(
  fit_sex,
  data = nafld1,
  conf.int = TRUE,
  risk.table = TRUE,
  pval = TRUE,

  # Labels
  xlab = "Days",
  ylab = "Survival Probability",
  title = "Survival Curves by Sex",
  legend.title = "Sex",

 
  palette = c("#0072B2", "#D55E00"),

  # Layout fixes
  break.time.by = 2000,
  risk.table.height = 0.25,
  risk.table.title = "Number at Risk",
  risk.table.y.text = FALSE,

 
  ggtheme = theme_classic(base_size = 14),
  tables.theme = theme_classic(base_size = 12)
)


plot_sex$plot <- plot_sex$plot +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.title = element_text(face = "bold"),
    legend.position = "top"
  )

plot_sex$table <- plot_sex$table +
  theme(
    axis.title.x = element_text(face = "bold"),
    plot.title = element_text(face = "bold")
  )

plot_sex

logrank_sex <- survdiff(Surv(futime, status) ~ sex, data = nafld1)

logrank_sex
## Call:
## survdiff(formula = Surv(futime, status) ~ sex, data = nafld1)
## 
##               N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=Female 9348      674      736      5.20      11.3
## sex=Male   8201      690      628      6.09      11.3
## 
##  Chisq= 11.3  on 1 degrees of freedom, p= 8e-04

Male and Female Survival Curves

Separate survival curves were created for males and females to see if there is a difference between the two groups.

A log-rank test was used to compare them. If the p-value is less than 0.05, then the difference is considered statistically significant. If it’s higher than that, then there isn’t enough evidence to say the groups are different.

So overall, this part is just checking whether males and females have different survival patterns.

Examine each of other covariates (age, weight, height, bmi) and check if any covariate significantly affect survival

cox_age <- coxph(Surv(futime, status) ~ age, data = nafld1)
cox_weight <- coxph(Surv(futime, status) ~ weight, data = nafld1)
cox_height <- coxph(Surv(futime, status) ~ height, data = nafld1)
cox_bmi <- coxph(Surv(futime, status) ~ bmi, data = nafld1)

summary(cox_age)
## Call:
## coxph(formula = Surv(futime, status) ~ age, data = nafld1)
## 
##   n= 17549, number of events= 1364 
## 
##         coef exp(coef) se(coef)     z Pr(>|z|)    
## age 0.098273  1.103264 0.002229 44.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## age     1.103     0.9064     1.098     1.108
## 
## Concordance= 0.819  (se = 0.007 )
## Likelihood ratio test= 2252  on 1 df,   p=<2e-16
## Wald test            = 1943  on 1 df,   p=<2e-16
## Score (logrank) test = 2190  on 1 df,   p=<2e-16
summary(cox_weight)
## Call:
## coxph(formula = Surv(futime, status) ~ weight, data = nafld1)
## 
##   n= 12763, number of events= 1046 
##    (4786 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z Pr(>|z|)    
## weight -0.005457  0.994558  0.001474 -3.702 0.000214 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## weight    0.9946      1.005    0.9917    0.9974
## 
## Concordance= 0.544  (se = 0.01 )
## Likelihood ratio test= 14.1  on 1 df,   p=2e-04
## Wald test            = 13.7  on 1 df,   p=2e-04
## Score (logrank) test = 13.71  on 1 df,   p=2e-04
summary(cox_height)
## Call:
## coxph(formula = Surv(futime, status) ~ height, data = nafld1)
## 
##   n= 14381, number of events= 1129 
##    (3168 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z Pr(>|z|)    
## height -0.022698  0.977558  0.002941 -7.718 1.18e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## height    0.9776      1.023    0.9719    0.9832
## 
## Concordance= 0.559  (se = 0.01 )
## Likelihood ratio test= 59.8  on 1 df,   p=1e-14
## Wald test            = 59.56  on 1 df,   p=1e-14
## Score (logrank) test = 59.56  on 1 df,   p=1e-14
summary(cox_bmi)
## Call:
## coxph(formula = Surv(futime, status) ~ bmi, data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##          coef exp(coef)  se(coef)     z Pr(>|z|)
## bmi 0.0005176 1.0005177 0.0044921 0.115    0.908
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## bmi     1.001     0.9995    0.9917     1.009
## 
## Concordance= 0.485  (se = 0.011 )
## Likelihood ratio test= 0.01  on 1 df,   p=0.9
## Wald test            = 0.01  on 1 df,   p=0.9
## Score (logrank) test = 0.01  on 1 df,   p=0.9
univariate_results <- data.frame(
  Covariate = c("Age", "Weight", "Height", "BMI"),
  Hazard_Ratio = c(
    exp(coef(cox_age)),
    exp(coef(cox_weight)),
    exp(coef(cox_height)),
    exp(coef(cox_bmi))
  ),
  P_Value = c(
    summary(cox_age)$coefficients[, "Pr(>|z|)"],
    summary(cox_weight)$coefficients[, "Pr(>|z|)"],
    summary(cox_height)$coefficients[, "Pr(>|z|)"],
    summary(cox_bmi)$coefficients[, "Pr(>|z|)"]
  )
)

univariate_results
##        Covariate Hazard_Ratio      P_Value
## age          Age    1.1032643 0.000000e+00
## weight    Weight    0.9945578 2.140244e-04
## height    Height    0.9775576 1.184007e-14
## bmi          BMI    1.0005177 9.082656e-01
cox_m1 <- coxph(Surv(futime, status) ~ age + male, data = nafld1)

cox_m2 <- coxph(Surv(futime, status) ~ age + male + bmi, data = nafld1)

cox_m3 <- coxph(Surv(futime, status) ~ age + male + weight + height, data = nafld1)

summary(cox_m1)
## Call:
## coxph(formula = Surv(futime, status) ~ age + male, data = nafld1)
## 
##   n= 17549, number of events= 1364 
## 
##          coef exp(coef) se(coef)      z Pr(>|z|)    
## age  0.098956  1.104018 0.002227 44.425  < 2e-16 ***
## male 0.372893  1.451929 0.054314  6.866 6.62e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## age      1.104     0.9058     1.099     1.109
## male     1.452     0.6887     1.305     1.615
## 
## Concordance= 0.823  (se = 0.006 )
## Likelihood ratio test= 2299  on 2 df,   p=<2e-16
## Wald test            = 1997  on 2 df,   p=<2e-16
## Score (logrank) test = 2247  on 2 df,   p=<2e-16
summary(cox_m2)
## Call:
## coxph(formula = Surv(futime, status) ~ age + male + bmi, data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##          coef exp(coef) se(coef)      z Pr(>|z|)    
## age  0.100601  1.105835 0.002650 37.967  < 2e-16 ***
## male 0.366140  1.442158 0.062849  5.826 5.69e-09 ***
## bmi  0.017022  1.017168 0.004945  3.443 0.000576 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## age      1.106     0.9043     1.100     1.112
## male     1.442     0.6934     1.275     1.631
## bmi      1.017     0.9831     1.007     1.027
## 
## Concordance= 0.827  (se = 0.007 )
## Likelihood ratio test= 1750  on 3 df,   p=<2e-16
## Wald test            = 1472  on 3 df,   p=<2e-16
## Score (logrank) test = 1699  on 3 df,   p=<2e-16
summary(cox_m3)
## Call:
## coxph(formula = Surv(futime, status) ~ age + male + weight + 
##     height, data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z Pr(>|z|)    
## age     0.098361  1.103361  0.002769 35.521  < 2e-16 ***
## male    0.545900  1.726162  0.090392  6.039 1.55e-09 ***
## weight  0.005686  1.005702  0.001832  3.104  0.00191 ** 
## height -0.017951  0.982209  0.004597 -3.905 9.43e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## age       1.1034     0.9063    1.0974    1.1094
## male      1.7262     0.5793    1.4459    2.0607
## weight    1.0057     0.9943    1.0021    1.0093
## height    0.9822     1.0181    0.9734    0.9911
## 
## Concordance= 0.828  (se = 0.007 )
## Likelihood ratio test= 1758  on 4 df,   p=<2e-16
## Wald test            = 1490  on 4 df,   p=<2e-16
## Score (logrank) test = 1729  on 4 df,   p=<2e-16
AIC(cox_m1, cox_m2, cox_m3)
##        df      AIC
## cox_m1  2 22167.86
## cox_m2  3 15982.66
## cox_m3  4 15977.05
ph_m1 <- cox.zph(cox_m1)
ph_m2 <- cox.zph(cox_m2)
ph_m3 <- cox.zph(cox_m3)

ph_m1
##        chisq df    p
## age     1.75  1 0.19
## male    1.03  1 0.31
## GLOBAL  2.58  2 0.27
ph_m2
##        chisq df     p
## age    2.323  1 0.127
## male   0.053  1 0.818
## bmi    2.768  1 0.096
## GLOBAL 6.082  3 0.108
ph_m3
##          chisq df      p
## age     2.3751  1 0.1233
## male    0.0274  1 0.8684
## weight  4.3252  1 0.0376
## height  1.4784  1 0.2240
## GLOBAL 13.4690  4 0.0092
plot(ph_m2)

Effects of Age, Weight, Height, and BMI

Each variable was tested one at a time using a Cox model to see how it affects survival.

A hazard ratio greater than 1 means the risk of death increases as the variable increases. A hazard ratio less than 1 means the risk decreases.

From the results:

  • Age clearly increases risk, which makes sense since older people usually have higher mortality
  • Being male also increases risk
  • BMI has a smaller effect but is still significant

Weight and height were looked at too, but BMI is more useful since it already combines both into one measure.

Report and provide rationale for the best multivariate Cox model.

final_model <- coxph(Surv(futime, status) ~ age + male + bmi, data = nafld1)

summary(final_model)
## Call:
## coxph(formula = Surv(futime, status) ~ age + male + bmi, data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##          coef exp(coef) se(coef)      z Pr(>|z|)    
## age  0.100601  1.105835 0.002650 37.967  < 2e-16 ***
## male 0.366140  1.442158 0.062849  5.826 5.69e-09 ***
## bmi  0.017022  1.017168 0.004945  3.443 0.000576 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## age      1.106     0.9043     1.100     1.112
## male     1.442     0.6934     1.275     1.631
## bmi      1.017     0.9831     1.007     1.027
## 
## Concordance= 0.827  (se = 0.007 )
## Likelihood ratio test= 1750  on 3 df,   p=<2e-16
## Wald test            = 1472  on 3 df,   p=<2e-16
## Score (logrank) test = 1699  on 3 df,   p=<2e-16
cox.zph(final_model)
##        chisq df     p
## age    2.323  1 0.127
## male   0.053  1 0.818
## bmi    2.768  1 0.096
## GLOBAL 6.082  3 0.108

Best Multivariate Cox Model

The final model used was:

Surv(futime, status) ~ age + male + bmi

This model was chosen because it keeps things simple while still including the main factors that affect survival.

Age is included since risk generally increases with age. Sex is included because there were differences between males and females. BMI is included because it relates to body size and health.

BMI was used instead of including weight and height separately because putting all three together can cause multicollinearity, meaning the variables overlap too much and make the model harder to interpret.

To evaluate the model, things like p-values, AIC, and the proportional hazards assumption were checked. The results show that the variables are statistically significant and the assumptions are not violated, so the model works well overall.

Final Summary

Overall, the analysis shows that survival in this dataset is relatively high. Age is the strongest predictor of mortality, males have higher risk than females, and BMI has a smaller but still meaningful effect.

The final model using age, sex, and BMI does a good job explaining survival while staying easy to interpret. It’s not overly complicated, and it avoids unnecessary variables that would make the model harder to understand.