Overview

This analysis applies survival analysis methods to the nafld1 dataset from the survival package. The dataset contains baseline data from a population study of non-alcoholic fatty liver disease (NAFLD), with one observation per subject. The outcome of interest is time-to-death (futime, in days), with status indicating whether the event occurred (1) or was censored (0).

data(nafld, package = "survival")

# Create survival object
surv_object <- Surv(nafld1$futime, nafld1$status)

# Quick look at the data
head(nafld1) |> kable(caption = "First 6 rows of nafld1")

First 6 rows of nafld1

	id	age	male	weight	height	bmi	case.id	futime	status
3631	1	57	0	60.0	163	22.69094	10630	6261	0
8458	2	67	0	70.4	168	24.88403	14817	624	0
6298	3	53	1	105.8	186	30.45354	3	1783	0
15398	4	56	1	109.3	170	37.83010	6628	3143	0
13261	5	68	1	NA	NA	NA	1871	1836	1
14423	6	39	0	63.9	155	26.61559	7144	1581	0

1. Overall Survival Curve

km_overall <- survfit(surv_object ~ 1, data = nafld1)

ggsurvplot(
  km_overall,
  data = nafld1,
  conf.int = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.28,
  xlab = "Follow-up Time (days)",
  ylab = "Survival Probability",
  title = "Overall Kaplan-Meier Survival Curve",
  ggtheme = theme_minimal()
)

Median Survival Time

km_table <- summary(km_overall)$table
median_surv <- km_table["median"]
cat("Median survival time:", median_surv, "days (", round(median_surv / 365.25, 2), "years )\n")

## Median survival time: NA days ( NA years )

11 The median survival time is the point at which the survival probability crosses 0.50 – i.e., half of subjects have experienced the event by this time.

Survival at 1.5 Years

surv_1.5yr <- summary(km_overall, times = 1.5 * 365.25)

cat(
  "Estimated survival at 1.5 years:\n",
  "  S(t):", round(surv_1.5yr$surv, 4), "\n",
  "  95% CI: [", round(surv_1.5yr$lower, 4), ",", round(surv_1.5yr$upper, 4), "]\n"
)

## Estimated survival at 1.5 years:
##    S(t): 0.9856 
##    95% CI: [ 0.9839 , 0.9874 ]

At 1.5 years (~548 days), the estimated survival probability is 0.986 (95% CI: [0.984, 0.987]).

2. Survival by Sex

km_sex <- survfit(surv_object ~ male, data = nafld1)

ggsurvplot(
  km_sex,
  data = nafld1,
  conf.int = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.28,
  legend.labs = c("Female", "Male"),
  xlab = "Follow-up Time (days)",
  ylab = "Survival Probability",
  title = "Survival by Sex",
  palette = c("#E07B5A", "#5A7BE0"),
  ggtheme = theme_minimal()
)

Log-Rank Test

logrank <- survdiff(surv_object ~ male, data = nafld1)
print(logrank)

## Call:
## survdiff(formula = surv_object ~ male, data = nafld1)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## male=0 9348      674      736      5.20      11.3
## male=1 8201      690      628      6.09      11.3
## 
##  Chisq= 11.3  on 1 degrees of freedom, p= 8e-04

# Extract p-value
p_val <- 1 - pchisq(logrank$chisq, df = 1)
cat("\nLog-rank p-value:", round(p_val, 4))

## 
## Log-rank p-value: 8e-04

The log-rank test yields \(\chi^2\) = 11.295 with p = 8^{-4}. This is statistically significant (p < 0.05), indicating a meaningful difference in survival between males and females.

3. Univariate Cox Models for Continuous Covariates

Each continuous covariate – age, weight, height, and BMI – is tested individually using a Cox proportional hazards model to assess its relationship with survival.

covariates <- c("age", "weight", "height", "bmi")

uni_results <- lapply(covariates, function(var) {
  formula <- as.formula(paste("surv_object ~", var))
  fit <- coxph(formula, data = nafld1)
  s <- summary(fit)
  coef_row <- s$coefficients[1, ]
  conf <- s$conf.int[1, ]
  data.frame(
    Covariate = var,
    HR = round(conf["exp(coef)"], 4),
    Lower_95 = round(conf["lower .95"], 4),
    Upper_95 = round(conf["upper .95"], 4),
    z = round(coef_row["z"], 3),
    p_value = round(coef_row["Pr(>|z|)"], 4)
  )
})

uni_table <- do.call(rbind, uni_results)
rownames(uni_table) <- NULL

kable(
  uni_table,
  col.names = c("Covariate", "Hazard Ratio", "95% CI Lower", "95% CI Upper", "z", "p-value"),
  caption = "Univariate Cox Model Results"
)

Univariate Cox Model Results

Covariate	Hazard Ratio	95% CI Lower	95% CI Upper	z	p-value
age	1.1033	1.0985	1.1081	44.080	0.0000
weight	0.9946	0.9917	0.9974	-3.702	0.0002
height	0.9776	0.9719	0.9832	-7.718	0.0000
bmi	1.0005	0.9917	1.0094	0.115	0.9083

22 A hazard ratio (HR) > 1 indicates higher values of the covariate are associated with increased risk of death; HR < 1 indicates a protective association. Significance assessed at α = 0.05.

Covariates with p < 0.05 are considered statistically significant predictors of survival at the univariate level and are candidates for inclusion in the multivariate model.

4. Multivariate Cox Model

Rationale

Because BMI is mathematically derived from weight and height (\(\text{BMI} = \text{weight} / \text{height}^2\)), including all three in the same model introduces multicollinearity. To address this, two candidate models are compared:

Model A: male + age + bmi (uses the composite measure)
Model B: male + age + weight + height (uses the raw components)

cox_bmi <- coxph(surv_object ~ male + age + bmi, data = nafld1)
cox_wh  <- coxph(surv_object ~ male + age + weight + height, data = nafld1)

cat("=== Model A: male + age + bmi ===\n")

## === Model A: male + age + bmi ===

summary(cox_bmi)

## Call:
## coxph(formula = surv_object ~ male + age + bmi, data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##          coef exp(coef) se(coef)      z Pr(>|z|)    
## male 0.366140  1.442158 0.062849  5.826 5.69e-09 ***
## age  0.100601  1.105835 0.002650 37.967  < 2e-16 ***
## bmi  0.017022  1.017168 0.004945  3.443 0.000576 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## male     1.442     0.6934     1.275     1.631
## age      1.106     0.9043     1.100     1.112
## bmi      1.017     0.9831     1.007     1.027
## 
## Concordance= 0.827  (se = 0.007 )
## Likelihood ratio test= 1750  on 3 df,   p=<2e-16
## Wald test            = 1472  on 3 df,   p=<2e-16
## Score (logrank) test = 1699  on 3 df,   p=<2e-16

cat("\n=== Model B: male + age + weight + height ===\n")

## 
## === Model B: male + age + weight + height ===

summary(cox_wh)

## Call:
## coxph(formula = surv_object ~ male + age + weight + height, data = nafld1)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z Pr(>|z|)    
## male    0.545900  1.726162  0.090392  6.039 1.55e-09 ***
## age     0.098361  1.103361  0.002769 35.521  < 2e-16 ***
## weight  0.005686  1.005702  0.001832  3.104  0.00191 ** 
## height -0.017951  0.982209  0.004597 -3.905 9.43e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## male      1.7262     0.5793    1.4459    2.0607
## age       1.1034     0.9063    1.0974    1.1094
## weight    1.0057     0.9943    1.0021    1.0093
## height    0.9822     1.0181    0.9734    0.9911
## 
## Concordance= 0.828  (se = 0.007 )
## Likelihood ratio test= 1758  on 4 df,   p=<2e-16
## Wald test            = 1490  on 4 df,   p=<2e-16
## Score (logrank) test = 1729  on 4 df,   p=<2e-16

AIC Comparison

aic_table <- AIC(cox_bmi, cox_wh)
rownames(aic_table) <- c("Model A (bmi)", "Model B (weight + height)")
kable(aic_table, caption = "AIC Comparison of Candidate Models")

AIC Comparison of Candidate Models

	df	AIC
Model A (bmi)	3	15982.66
Model B (weight + height)	4	15977.05

The model with the lower AIC is preferred as the best balance of fit and parsimony.

Proportional Hazards Assumption

# Test on the best model (update to whichever wins AIC)
best_model <- if (AIC(cox_bmi) <= AIC(cox_wh)) cox_bmi else cox_wh

ph_test <- cox.zph(best_model)
print(ph_test)

##          chisq df      p
## male    0.0274  1 0.8684
## age     2.3751  1 0.1233
## weight  4.3252  1 0.0376
## height  1.4784  1 0.2240
## GLOBAL 13.4690  4 0.0092

33 The proportional hazards (PH) assumption requires that the hazard ratio between groups remains constant over time. A significant p-value (< 0.05) in the Schoenfeld residuals test indicates a violation for that covariate.

Best Model Summary

s <- summary(best_model)
final_table <- data.frame(
  Covariate = rownames(s$coefficients),
  HR = round(s$conf.int[, "exp(coef)"], 4),
  Lower_95 = round(s$conf.int[, "lower .95"], 4),
  Upper_95 = round(s$conf.int[, "upper .95"], 4),
  p_value = round(s$coefficients[, "Pr(>|z|)"], 4)
)
rownames(final_table) <- NULL

kable(
  final_table,
  col.names = c("Covariate", "Hazard Ratio", "95% CI Lower", "95% CI Upper", "p-value"),
  caption = "Final Multivariate Cox Model -- Hazard Ratios"
)

Final Multivariate Cox Model – Hazard Ratios

Covariate	Hazard Ratio	95% CI Lower	95% CI Upper	p-value
male	1.7262	1.4459	2.0607	0.0000
age	1.1034	1.0974	1.1094	0.0000
weight	1.0057	1.0021	1.0093	0.0019
height	0.9822	0.9734	0.9911	0.0001

The final model was selected based on:

AIC – lower AIC favors the more parsimonious model
Multicollinearity avoidance – BMI vs. raw weight/height compared directly
Interpretability – hazard ratios should be clinically or practically meaningful
PH assumption – covariates violating the proportional hazards assumption are flagged

Analysis conducted in R using the survival and survminer packages.

Survival Analysis

Non-Alcoholic Fatty Liver Disease (NAFLD)

Olivia Shipley

2026-04-28