1 Introduction

This document provides a comprehensive survival analysis workflow using the veteran dataset from the survival package. The dataset contains data from a randomized trial of two treatment regimens for lung cancer.

2 Load Required Packages

# Install packages if needed
required_packages <- c(
  "survival", "survminer", "ggplot2", "dplyr", 
  "tidyr", "gtsummary", "ggsurvfit", "gridExtra",
  "survMisc", "flexsurv", "knitr", "kableExtra"
)

# Install missing packages
installed <- installed.packages()[, "Package"]
to_install <- required_packages[!required_packages %in% installed]
if(length(to_install) > 0) {
  install.packages(to_install)
}

# Load packages
library(survival)
library(survminer)
library(ggplot2)
library(dplyr)
library(tidyr)
library(gtsummary)
library(ggsurvfit)
library(gridExtra)
library(survMisc)
library(flexsurv)
library(knitr)
library(kableExtra)

3 Data Loading and Exploration

3.1 Load the Veteran Dataset

# Load veteran dataset
data(veteran)

# Create a working copy
df <- veteran

# Display structure
str(df)

## 'data.frame':    137 obs. of  8 variables:
##  $ trt     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ celltype: Factor w/ 4 levels "squamous","smallcell",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time    : num  72 411 228 126 118 10 82 110 314 100 ...
##  $ status  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ karno   : num  60 70 60 60 70 20 40 80 50 70 ...
##  $ diagtime: num  7 5 3 9 11 5 10 29 18 6 ...
##  $ age     : num  69 64 38 63 65 49 69 68 43 70 ...
##  $ prior   : num  0 10 0 10 10 0 10 0 0 0 ...

# View first few rows
head(df) %>%
  kable(caption = "First 6 rows of veteran dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

First 6 rows of veteran dataset
trt	celltype	time	status	karno	diagtime	age	prior
1	squamous	72	1	60	7	69	0
1	squamous	411	1	70	5	64	10
1	squamous	228	1	60	3	38	0
1	squamous	126	1	60	9	63	10
1	squamous	118	1	70	11	65	10
1	squamous	10	1	20	5	49	0

3.2 Data Description

The veteran dataset contains the following variables:

trt: Treatment (1=standard, 2=test)
celltype: Cell type (squamous, smallcell, adeno, large)
time: Survival time in days
status: Status (0=censored, 1=dead)
karno: Karnofsky performance score (0-100)
diagtime: Months from diagnosis to randomization
age: Age in years
prior: Prior therapy (0=no, 10=yes)

3.3 Data Preprocessing

# Convert variables to factors
df <- df %>%
  mutate(
    trt = factor(trt, levels = c(1, 2), 
                 labels = c("Standard", "Test")),
    celltype = factor(celltype),
    prior = factor(prior, levels = c(0, 10), 
                   labels = c("No", "Yes"))
  )

# Summary statistics
summary(df)

##        trt          celltype       time           status           karno      
##  Standard:69   squamous :35   Min.   :  1.0   Min.   :0.0000   Min.   :10.00  
##  Test    :68   smallcell:48   1st Qu.: 25.0   1st Qu.:1.0000   1st Qu.:40.00  
##                adeno    :27   Median : 80.0   Median :1.0000   Median :60.00  
##                large    :27   Mean   :121.6   Mean   :0.9343   Mean   :58.57  
##                               3rd Qu.:144.0   3rd Qu.:1.0000   3rd Qu.:75.00  
##                               Max.   :999.0   Max.   :1.0000   Max.   :99.00  
##     diagtime           age        prior   
##  Min.   : 1.000   Min.   :34.00   No :97  
##  1st Qu.: 3.000   1st Qu.:51.00   Yes:40  
##  Median : 5.000   Median :62.00           
##  Mean   : 8.774   Mean   :58.31           
##  3rd Qu.:11.000   3rd Qu.:66.00           
##  Max.   :87.000   Max.   :81.00

3.4 Descriptive Statistics

# Create summary table
df %>%
  select(time, status, age, karno, diagtime) %>%
  tbl_summary(
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    label = list(
      time ~ "Survival Time (days)",
      status ~ "Death",
      age ~ "Age (years)",
      karno ~ "Karnofsky Score",
      diagtime ~ "Diagnosis Time (months)"
    )
  ) %>%
  add_n() %>%
  bold_labels()

Characteristic	N	N = 137¹
Survival Time (days)	137	122 (158)
Death	137	128 (93%)
Age (years)	137	58 (11)
Karnofsky Score	137	59 (20)
Diagnosis Time (months)	137	9 (11)
¹ Mean (SD); n (%)

4 Kaplan-Meier Survival Analysis

4.1 Overall Survival Curve

# Fit overall survival
fit_overall <- survfit(Surv(time, status) ~ 1, data = df)

# Summary
summary(fit_overall, times = c(30, 60, 90, 180, 365))

## Call: survfit(formula = Surv(time, status) ~ 1, data = df)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    30     97      41    0.700  0.0392       0.6277        0.782
##    60     73      22    0.538  0.0427       0.4607        0.629
##    90     62      10    0.464  0.0428       0.3873        0.556
##   180     27      30    0.222  0.0369       0.1607        0.308
##   365     10      15    0.090  0.0265       0.0506        0.160

# Plot using survminer
ggsurvplot(
  fit_overall,
  data = df,
  conf.int = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  ggtheme = theme_minimal(),
  palette = "blue",
  title = "Overall Kaplan-Meier Survival Curve",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  break.time.by = 100,
  legend = "none"
)

4.2 Median Survival Time

# Calculate median survival
median_surv <- survfit(Surv(time, status) ~ 1, data = df)
print(median_surv)

## Call: survfit(formula = Surv(time, status) ~ 1, data = df)
## 
##        n events median 0.95LCL 0.95UCL
## [1,] 137    128     80      52     105

# Use gtsummary for clean table
survfit(Surv(time, status) ~ 1, data = df) %>%
  tbl_survfit(
    probs = 0.5,
    label_header = "**Median Survival (95% CI)**"
  )

Characteristic	Median Survival (95% CI)
Overall	80 (52, 105)

4.3 Survival by Treatment

# Fit survival by treatment
fit_trt <- survfit(Surv(time, status) ~ trt, data = df)

# Plot
ggsurvplot(
  fit_trt,
  data = df,
  pval = TRUE,
  conf.int = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.3,
  ggtheme = theme_minimal(),
  palette = c("#E7B800", "#2E9FDF"),
  title = "Survival by Treatment Group",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  break.time.by = 100,
  legend.title = "Treatment",
  legend.labs = c("Standard", "Test")
)

4.4 Survival by Cell Type

# Fit survival by cell type
fit_cell <- survfit(Surv(time, status) ~ celltype, data = df)

# Plot
ggsurvplot(
  fit_cell,
  data = df,
  pval = TRUE,
  conf.int = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.3,
  ggtheme = theme_minimal(),
  title = "Survival by Cell Type",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  break.time.by = 100,
  legend.title = "Cell Type"
)

4.5 Log-Rank Test

# Log-rank test for treatment
logrank_trt <- survdiff(Surv(time, status) ~ trt, data = df)
print(logrank_trt)

## Call:
## survdiff(formula = Surv(time, status) ~ trt, data = df)
## 
##               N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=Standard 69       64     64.5   0.00388   0.00823
## trt=Test     68       64     63.5   0.00394   0.00823
## 
##  Chisq= 0  on 1 degrees of freedom, p= 0.9

# Log-rank test for cell type
logrank_cell <- survdiff(Surv(time, status) ~ celltype, data = df)
print(logrank_cell)

## Call:
## survdiff(formula = Surv(time, status) ~ celltype, data = df)
## 
##                     N Observed Expected (O-E)^2/E (O-E)^2/V
## celltype=squamous  35       31     47.7      5.82     10.53
## celltype=smallcell 48       45     30.1      7.37     10.20
## celltype=adeno     27       26     15.7      6.77      8.19
## celltype=large     27       26     34.5      2.12      3.02
## 
##  Chisq= 25.4  on 3 degrees of freedom, p= 1e-05

# Log-rank test for prior therapy
logrank_prior <- survdiff(Surv(time, status) ~ prior, data = df)
print(logrank_prior)

## Call:
## survdiff(formula = Surv(time, status) ~ prior, data = df)
## 
##            N Observed Expected (O-E)^2/E (O-E)^2/V
## prior=No  97       91     87.4     0.150     0.501
## prior=Yes 40       37     40.6     0.323     0.501
## 
##  Chisq= 0.5  on 1 degrees of freedom, p= 0.5

5 Cumulative Hazard Function

5.1 Overall Cumulative Hazard

# Calculate cumulative hazard
fit_overall <- survfit(Surv(time, status) ~ 1, data = df)

# Plot cumulative hazard
ggsurvplot(
  fit_overall,
  data = df,
  fun = "cumhaz",
  conf.int = TRUE,
  ggtheme = theme_minimal(),
  palette = "red",
  title = "Cumulative Hazard Function",
  xlab = "Time (days)",
  ylab = "Cumulative Hazard",
  break.time.by = 100
)

5.2 Cumulative Hazard by Treatment

# Plot cumulative hazard by treatment
ggsurvplot(
  fit_trt,
  data = df,
  fun = "cumhaz",
  conf.int = TRUE,
  ggtheme = theme_minimal(),
  palette = c("#E7B800", "#2E9FDF"),
  title = "Cumulative Hazard by Treatment",
  xlab = "Time (days)",
  ylab = "Cumulative Hazard",
  break.time.by = 100,
  legend.title = "Treatment"
)

5.3 Cumulative Hazard by Cell Type

# Plot cumulative hazard by cell type
ggsurvplot(
  fit_cell,
  data = df,
  fun = "cumhaz",
  conf.int = TRUE,
  ggtheme = theme_minimal(),
  title = "Cumulative Hazard by Cell Type",
  xlab = "Time (days)",
  ylab = "Cumulative Hazard",
  break.time.by = 100,
  legend.title = "Cell Type"
)

6 Cox Proportional Hazards Regression

6.1 Univariate Cox Regression

# Univariate Cox models for each variable
variables <- c("trt", "celltype", "karno", "diagtime", "age", "prior")

# Create table of univariate results
tbl_uvregression(
  df %>% select(time, status, all_of(variables)),
  method = coxph,
  y = Surv(time, status),
  exponentiate = TRUE,
  pvalue_fun = function(x) style_pvalue(x, digits = 3)
) %>%
  bold_p() %>%
  bold_labels()

Characteristic	N	HR	95% CI	p-value
trt	137
Standard		—	—
Test		1.02	0.71, 1.45	0.922
celltype	137
squamous		—	—
smallcell		2.72	1.66, 4.47	<0.001
adeno		3.15	1.77, 5.59	<0.001
large		1.26	0.73, 2.17	0.407
karno	137	0.97	0.96, 0.98	<0.001
diagtime	137	1.01	0.99, 1.03	0.311
age	137	1.01	0.99, 1.03	0.433
prior	137
No		—	—
Yes		0.87	0.59, 1.28	0.476
Abbreviations: CI = Confidence Interval, HR = Hazard Ratio

6.2 Multivariate Cox Regression

6.2.1 Model 1: All Variables

# Fit multivariate Cox model
cox_mv <- coxph(
  Surv(time, status) ~ trt + celltype + karno + diagtime + age + prior,
  data = df
)

# Summary
summary(cox_mv)

## Call:
## coxph(formula = Surv(time, status) ~ trt + celltype + karno + 
##     diagtime + age + prior, data = df)
## 
##   n= 137, number of events= 128 
## 
##                         coef  exp(coef)   se(coef)      z Pr(>|z|)    
## trtTest            2.946e-01  1.343e+00  2.075e-01  1.419  0.15577    
## celltypesmallcell  8.616e-01  2.367e+00  2.753e-01  3.130  0.00175 ** 
## celltypeadeno      1.196e+00  3.307e+00  3.009e-01  3.975 7.05e-05 ***
## celltypelarge      4.013e-01  1.494e+00  2.827e-01  1.420  0.15574    
## karno             -3.282e-02  9.677e-01  5.508e-03 -5.958 2.55e-09 ***
## diagtime           8.132e-05  1.000e+00  9.136e-03  0.009  0.99290    
## age               -8.706e-03  9.913e-01  9.300e-03 -0.936  0.34920    
## priorYes           7.159e-02  1.074e+00  2.323e-01  0.308  0.75794    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## trtTest              1.3426     0.7448    0.8939    2.0166
## celltypesmallcell    2.3669     0.4225    1.3799    4.0597
## celltypeadeno        3.3071     0.3024    1.8336    5.9647
## celltypelarge        1.4938     0.6695    0.8583    2.5996
## karno                0.9677     1.0334    0.9573    0.9782
## diagtime             1.0001     0.9999    0.9823    1.0182
## age                  0.9913     1.0087    0.9734    1.0096
## priorYes             1.0742     0.9309    0.6813    1.6937
## 
## Concordance= 0.736  (se = 0.021 )
## Likelihood ratio test= 62.1  on 8 df,   p=2e-10
## Wald test            = 62.37  on 8 df,   p=2e-10
## Score (logrank) test = 66.74  on 8 df,   p=2e-11

# Create formatted table
tbl_regression(
  cox_mv,
  exponentiate = TRUE,
  pvalue_fun = function(x) style_pvalue(x, digits = 3)
) %>%
  bold_p() %>%
  bold_labels()

Characteristic	HR	95% CI	p-value
trt
Standard	—	—
Test	1.34	0.89, 2.02	0.156
celltype
squamous	—	—
smallcell	2.37	1.38, 4.06	0.002
adeno	3.31	1.83, 5.96	<0.001
large	1.49	0.86, 2.60	0.156
karno	0.97	0.96, 0.98	<0.001
diagtime	1.00	0.98, 1.02	0.993
age	0.99	0.97, 1.01	0.349
prior
No	—	—
Yes	1.07	0.68, 1.69	0.758
Abbreviations: CI = Confidence Interval, HR = Hazard Ratio

6.2.2 Model Comparison with AIC

# Fit different models
cox_m1 <- coxph(Surv(time, status) ~ trt, data = df)
cox_m2 <- coxph(Surv(time, status) ~ trt + celltype, data = df)
cox_m3 <- coxph(Surv(time, status) ~ trt + celltype + karno, data = df)
cox_m4 <- cox_mv  # Full model

# Compare AIC
data.frame(
  Model = c("Treatment only", 
            "Treatment + Cell Type", 
            "Treatment + Cell Type + Karno",
            "Full Model"),
  AIC = c(AIC(cox_m1), AIC(cox_m2), AIC(cox_m3), AIC(cox_m4)),
  Variables = c(1, 2, 3, 7)
) %>%
  arrange(AIC) %>%
  kable(caption = "Model Comparison using AIC") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Model Comparison using AIC
Model	AIC	Variables
Treatment + Cell Type + Karno	959.8290	3
Full Model	964.7942	7
Treatment + Cell Type	993.0407	2
Treatment only	1012.8885	1

6.3 Forest Plot

# Forest plot for multivariate model
ggforest(
  cox_mv,
  data = df,
  main = "Hazard Ratios with 95% Confidence Intervals",
  fontsize = 0.8
)

7 Testing Proportional Hazards Assumption

7.1 Schoenfeld Residuals Test

# Test proportional hazards assumption
ph_test <- cox.zph(cox_mv)
print(ph_test)

##            chisq df       p
## trt       0.2644  1 0.60712
## celltype 15.2274  3 0.00163
## karno    12.9352  1 0.00032
## diagtime  0.0129  1 0.90961
## age       1.8288  1 0.17627
## prior     2.1656  1 0.14113
## GLOBAL   34.5525  8 3.2e-05

# Interpretation table
data.frame(
  Variable = rownames(ph_test$table),
  ChiSq = ph_test$table[, "chisq"],
  P_value = ph_test$table[, "p"],
  Conclusion = ifelse(
    ph_test$table[, "p"] > 0.05,
    "Assumption Met",
    "Assumption Violated"
  )
) %>%
  kable(caption = "Proportional Hazards Assumption Test") %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(which(ph_test$table[, "p"] <= 0.05), 
           background = "#ffcccc")

Proportional Hazards Assumption Test
	Variable	ChiSq	P_value	Conclusion
trt	trt	0.2643888	0.6071209	Assumption Met
celltype	celltype	15.2273639	0.0016323	Assumption Violated
karno	karno	12.9351978	0.0003225	Assumption Violated
diagtime	diagtime	0.0128894	0.9096093	Assumption Met
age	age	1.8288266	0.1762662	Assumption Met
prior	prior	2.1655663	0.1411326	Assumption Met
GLOBAL	GLOBAL	34.5524911	0.0000323	Assumption Violated

7.2 Schoenfeld Residuals Plots

# Plot Schoenfeld residuals
par(mfrow = c(3, 3))
plot(ph_test)
par(mfrow = c(1, 1))

7.3 Alternative: Log-Log Survival Plots

# Log-log survival plot for treatment
ggsurvplot(
  fit_trt,
  data = df,
  fun = "cloglog",
  ggtheme = theme_minimal(),
  palette = c("#E7B800", "#2E9FDF"),
  title = "Log-Log Survival Plot by Treatment",
  xlab = "Log(Time)",
  ylab = "Log(-Log(Survival))",
  legend.title = "Treatment"
)

# Log-log survival plot for cell type
ggsurvplot(
  fit_cell,
  data = df,
  fun = "cloglog",
  ggtheme = theme_minimal(),
  title = "Log-Log Survival Plot by Cell Type",
  xlab = "Log(Time)",
  ylab = "Log(-Log(Survival))",
  legend.title = "Cell Type"
)

8 Model Diagnostics

8.1 Cox-Snell Residuals

# Calculate Cox-Snell residuals
cox_snell <- df$status - residuals(cox_mv, type = "martingale")

# Fit survival to Cox-Snell residuals
cs_fit <- survfit(Surv(cox_snell, df$status) ~ 1)

# Plot
plot(
  cs_fit$time, 
  -log(cs_fit$surv),
  type = "s",
  xlab = "Cox-Snell Residuals",
  ylab = "Cumulative Hazard",
  main = "Cox-Snell Residual Plot\n(Should follow 45-degree line)"
)
abline(0, 1, col = "red", lty = 2)
legend("bottomright", 
       legend = "45-degree reference line", 
       col = "red", 
       lty = 2)

8.2 Martingale Residuals

# Calculate martingale residuals
mart_resid <- residuals(cox_mv, type = "martingale")

# Plot against continuous variables
par(mfrow = c(2, 2))

plot(df$karno, mart_resid,
     xlab = "Karnofsky Score",
     ylab = "Martingale Residuals",
     main = "Martingale Residuals vs Karno")
abline(h = 0, col = "red", lty = 2)
lines(lowess(df$karno, mart_resid), col = "blue", lwd = 2)

plot(df$age, mart_resid,
     xlab = "Age",
     ylab = "Martingale Residuals",
     main = "Martingale Residuals vs Age")
abline(h = 0, col = "red", lty = 2)
lines(lowess(df$age, mart_resid), col = "blue", lwd = 2)

plot(df$diagtime, mart_resid,
     xlab = "Diagnosis Time",
     ylab = "Martingale Residuals",
     main = "Martingale Residuals vs Diagtime")
abline(h = 0, col = "red", lty = 2)
lines(lowess(df$diagtime, mart_resid), col = "blue", lwd = 2)

par(mfrow = c(1, 1))

8.3 Deviance Residuals

# Calculate deviance residuals
dev_resid <- residuals(cox_mv, type = "deviance")

# Plot deviance residuals
plot(
  predict(cox_mv),
  dev_resid,
  xlab = "Linear Predictor",
  ylab = "Deviance Residuals",
  main = "Deviance Residuals vs Linear Predictor"
)
abline(h = 0, col = "red", lty = 2)
lines(lowess(predict(cox_mv), dev_resid), col = "blue", lwd = 2)

# Identify potential outliers
outliers <- which(abs(dev_resid) > 3)
if(length(outliers) > 0) {
  text(
    predict(cox_mv)[outliers],
    dev_resid[outliers],
    labels = outliers,
    pos = 3,
    col = "red"
  )
}

8.4 Influential Observations (DFBETAs)

# Calculate DFBETAs
dfbeta_vals <- residuals(cox_mv, type = "dfbeta")

# Check column names
colnames(dfbeta_vals)

## NULL

# Plot for key variables (use actual column names)
par(mfrow = c(2, 2))

# Treatment effect
trt_col <- grep("trt", colnames(dfbeta_vals), ignore.case = TRUE)[1]
if(!is.na(trt_col)) {
  plot(dfbeta_vals[, trt_col],
       ylab = "DFBETAs for Treatment",
       main = "Influence on Treatment Effect",
       type = "h")
  abline(h = 0, col = "red", lty = 2)
}

# Karno effect
karno_col <- grep("karno", colnames(dfbeta_vals), ignore.case = TRUE)[1]
if(!is.na(karno_col)) {
  plot(dfbeta_vals[, karno_col],
       ylab = "DFBETAs for Karno",
       main = "Influence on Karno Effect",
       type = "h")
  abline(h = 0, col = "red", lty = 2)
}

# Age effect
age_col <- grep("age", colnames(dfbeta_vals), ignore.case = TRUE)[1]
if(!is.na(age_col)) {
  plot(dfbeta_vals[, age_col],
       ylab = "DFBETAs for Age",
       main = "Influence on Age Effect",
       type = "h")
  abline(h = 0, col = "red", lty = 2)
}

par(mfrow = c(1, 1))

9 Stratified Cox Model

If proportional hazards assumption is violated for a variable, we can stratify by it.

# Example: Stratify by cell type if assumption violated
cox_strat <- coxph(
  Surv(time, status) ~ trt + strata(celltype) + karno + diagtime + age + prior,
  data = df
)

summary(cox_strat)

## Call:
## coxph(formula = Surv(time, status) ~ trt + strata(celltype) + 
##     karno + diagtime + age + prior, data = df)
## 
##   n= 137, number of events= 128 
## 
##               coef exp(coef)  se(coef)      z Pr(>|z|)    
## trtTest   0.285902  1.330962  0.210009  1.361    0.173    
## karno    -0.038262  0.962461  0.005932 -6.450 1.12e-10 ***
## diagtime -0.003439  0.996567  0.009075 -0.379    0.705    
## age      -0.011821  0.988249  0.009846 -1.201    0.230    
## priorYes  0.169069  1.184201  0.235667  0.717    0.473    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##          exp(coef) exp(-coef) lower .95 upper .95
## trtTest     1.3310     0.7513    0.8819    2.0087
## karno       0.9625     1.0390    0.9513    0.9737
## diagtime    0.9966     1.0034    0.9790    1.0145
## age         0.9882     1.0119    0.9694    1.0075
## priorYes    1.1842     0.8445    0.7461    1.8794
## 
## Concordance= 0.704  (se = 0.027 )
## Likelihood ratio test= 44.27  on 5 df,   p=2e-08
## Wald test            = 43.63  on 5 df,   p=3e-08
## Score (logrank) test = 47.01  on 5 df,   p=6e-09

# Compare with unstratified model
data.frame(
  Model = c("Unstratified", "Stratified by Cell Type"),
  AIC = c(AIC(cox_mv), AIC(cox_strat)),
  Concordance = c(
    summary(cox_mv)$concordance[1],
    summary(cox_strat)$concordance[1]
  )
) %>%
  kable(caption = "Comparison of Stratified vs Unstratified Models") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Comparison of Stratified vs Unstratified Models
Model	AIC	Concordance
Unstratified	964.7942	0.7360291
Stratified by Cell Type	643.2026	0.7038814

10 Parametric Survival Models

10.1 Exponential Model

# Fit exponential model
exp_model <- survreg(
  Surv(time, status) ~ trt + celltype + karno + diagtime + age + prior,
  data = df,
  dist = "exponential"
)

summary(exp_model)

## 
## Call:
## survreg(formula = Surv(time, status) ~ trt + celltype + karno + 
##     diagtime + age + prior, data = df, dist = "exponential")
##                       Value Std. Error     z       p
## (Intercept)        3.188611   0.704033  4.53 5.9e-06
## trtTest           -0.219565   0.198634 -1.11  0.2690
## celltypesmallcell -0.820245   0.262111 -3.13  0.0018
## celltypeadeno     -1.113121   0.275825 -4.04 5.4e-05
## celltypelarge     -0.377220   0.272626 -1.38  0.1665
## karno              0.030624   0.005108  6.00 2.0e-09
## diagtime          -0.000297   0.008970 -0.03  0.9736
## age                0.006108   0.009161  0.67  0.5049
## priorYes          -0.049482   0.226871 -0.22  0.8273
## 
## Scale fixed at 1 
## 
## Exponential distribution
## Loglik(model)= -716.2   Loglik(intercept only)= -751.2
##  Chisq= 70.12 on 8 degrees of freedom, p= 4.6e-12 
## Number of Newton-Raphson Iterations: 5 
## n= 137

10.2 Weibull Model

# Fit Weibull model
weibull_model <- survreg(
  Surv(time, status) ~ trt + celltype + karno + diagtime + age + prior,
  data = df,
  dist = "weibull"
)

summary(weibull_model)

## 
## Call:
## survreg(formula = Surv(time, status) ~ trt + celltype + karno + 
##     diagtime + age + prior, data = df, dist = "weibull")
##                       Value Std. Error     z       p
## (Intercept)        3.262014   0.662531  4.92 8.5e-07
## trtTest           -0.228523   0.186844 -1.22  0.2213
## celltypesmallcell -0.826185   0.246312 -3.35  0.0008
## celltypeadeno     -1.132725   0.257598 -4.40 1.1e-05
## celltypelarge     -0.397681   0.254749 -1.56  0.1185
## karno              0.030068   0.004828  6.23 4.7e-10
## diagtime          -0.000469   0.008361 -0.06  0.9553
## age                0.006099   0.008553  0.71  0.4758
## priorYes          -0.043898   0.212279 -0.21  0.8362
## Log(scale)        -0.074599   0.066311 -1.12  0.2606
## 
## Scale= 0.928 
## 
## Weibull distribution
## Loglik(model)= -715.6   Loglik(intercept only)= -748.1
##  Chisq= 65.08 on 8 degrees of freedom, p= 4.7e-11 
## Number of Newton-Raphson Iterations: 6 
## n= 137

# Compare AIC
data.frame(
  Model = c("Exponential", "Weibull", "Cox PH"),
  AIC = c(AIC(exp_model), AIC(weibull_model), AIC(cox_mv))
) %>%
  arrange(AIC) %>%
  kable(caption = "Comparison of Parametric Models") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Comparison of Parametric Models
Model	AIC
Cox PH	964.7942
Exponential	1450.3182
Weibull	1451.1027

11 Risk Score and Predicted Survival

11.1 Calculate Risk Score

# Calculate risk score (linear predictor)
df$risk_score <- predict(cox_mv, type = "lp")

# Categorize into risk groups
df$risk_group <- cut(
  df$risk_score,
  breaks = quantile(df$risk_score, probs = c(0, 0.33, 0.67, 1)),
  labels = c("Low", "Medium", "High"),
  include.lowest = TRUE
)

# Summary by risk group
table(df$risk_group) %>%
  kable(col.names = c("Risk Group", "N"), 
        caption = "Distribution of Risk Groups") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Distribution of Risk Groups
Risk Group	N
Low	45
Medium	47
High	45

11.2 Survival by Risk Group

# Fit survival by risk group
fit_risk <- survfit(Surv(time, status) ~ risk_group, data = df)

# Plot
ggsurvplot(
  fit_risk,
  data = df,
  pval = TRUE,
  conf.int = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.3,
  ggtheme = theme_minimal(),
  palette = c("green", "orange", "red"),
  title = "Survival by Predicted Risk Group",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  break.time.by = 100,
  legend.title = "Risk Group",
  legend.labs = c("Low Risk", "Medium Risk", "High Risk")
)

12 Model Performance

12.1 Concordance Index (C-index)

# Calculate C-index for different models
c_index_results <- data.frame(
  Model = c(
    "Treatment only",
    "Treatment + Cell Type",
    "Treatment + Cell Type + Karno",
    "Full Model"
  ),
  C_Index = c(
    summary(cox_m1)$concordance[1],
    summary(cox_m2)$concordance[1],
    summary(cox_m3)$concordance[1],
    summary(cox_mv)$concordance[1]
  )
)

c_index_results %>%
  arrange(desc(C_Index)) %>%
  kable(caption = "C-Index for Different Models", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

C-Index for Different Models
Model	C_Index
Treatment + Cell Type + Karno	0.737
Full Model	0.736
Treatment + Cell Type	0.619
Treatment only	0.525

13 Summary and Conclusions

13.1 Key Findings

# Get median survival value
median_val <- survfit(Surv(time, status) ~ 1, data = df)

# Create summary of key results
summary_df <- data.frame(
  Analysis = c(
    "Median Survival",
    "Treatment Effect (HR)",
    "Cell Type Effect",
    "Karno Score Effect (per unit)",
    "Age Effect (per year)",
    "Best Model by AIC",
    "Proportional Hazards Assumption",
    "C-Index (Full Model)"
  ),
  Result = c(
    paste0(median_val$median, " days"),
    sprintf("%.2f (p = %.3f)", 
            exp(coef(cox_mv)["trtTest"]), 
            summary(cox_mv)$coefficients["trtTest", "Pr(>|z|)"]),
    "Significant (p < 0.001)",
    sprintf("%.3f (p < 0.001)", exp(coef(cox_mv)["karno"])),
    sprintf("%.3f (p = %.3f)", 
            exp(coef(cox_mv)["age"]),
            summary(cox_mv)$coefficients["age", "Pr(>|z|)"]),
    "Full Model",
    ifelse(ph_test$table["GLOBAL", "p"] > 0.05, "Met", "Review Required"),
    sprintf("%.3f", summary(cox_mv)$concordance[1])
  )
)

summary_df %>%
  kable(caption = "Summary of Key Results") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Summary of Key Results
Analysis	Result
Median Survival	days
Treatment Effect (HR)	1.34 (p = 0.156)
Cell Type Effect	Significant (p < 0.001)
Karno Score Effect (per unit)	0.968 (p < 0.001)
Age Effect (per year)	0.991 (p = 0.349)
Best Model by AIC	Full Model
Proportional Hazards Assumption	Review Required
C-Index (Full Model)	0.736

13.2 Session Information

sessionInfo()

## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Toronto
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0 knitr_1.51       flexsurv_2.3.2   survMisc_0.5.6  
##  [5] gridExtra_2.3    ggsurvfit_1.2.0  gtsummary_2.5.0  tidyr_1.3.2     
##  [9] dplyr_1.1.4      survminer_0.5.1  ggpubr_0.6.2     ggplot2_4.0.1   
## [13] survival_3.8-3  
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     viridisLite_0.4.2    farver_2.1.2        
##  [4] S7_0.2.1             fastmap_1.2.0        broom.helpers_1.22.0
##  [7] labelled_2.16.0      digest_0.6.39        lifecycle_1.0.4     
## [10] statmod_1.5.1        magrittr_2.0.4       compiler_4.5.2      
## [13] rlang_1.1.6          sass_0.4.10          tools_4.5.2         
## [16] yaml_2.3.12          gt_1.2.0             data.table_1.18.0   
## [19] ggsignif_0.6.4       labeling_0.4.3       xml2_1.5.1          
## [22] RColorBrewer_1.1-3   abind_1.4-8          withr_3.0.2         
## [25] purrr_1.2.0          numDeriv_2016.8-1.1  grid_4.5.2          
## [28] mstate_0.3.3         xtable_1.8-4         scales_1.4.0        
## [31] dichromat_2.0-0.1    cli_3.6.5            mvtnorm_1.3-3       
## [34] rmarkdown_2.30       generics_0.1.4       otel_0.2.0          
## [37] rstudioapi_0.17.1    km.ci_0.5-6          commonmark_2.0.0    
## [40] cachem_1.1.0         stringr_1.6.0        splines_4.5.2       
## [43] muhaz_1.2.6.4        vctrs_0.6.5          Matrix_1.7-4        
## [46] jsonlite_2.0.0       carData_3.0-5        car_3.1-3           
## [49] litedown_0.9         hms_1.1.4            rstatix_0.7.3       
## [52] Formula_1.2-5        systemfonts_1.3.1    jquerylib_0.1.4     
## [55] glue_1.8.0           cowplot_1.2.0        ggtext_0.1.2        
## [58] stringi_1.8.7        gtable_0.3.6         quadprog_1.5-8      
## [61] tibble_3.3.0         pillar_1.11.1        htmltools_0.5.9     
## [64] deSolve_1.40         R6_2.6.1             KMsurv_0.1-6        
## [67] textshaping_1.0.4    evaluate_1.0.5       lattice_0.22-7      
## [70] haven_2.5.5          markdown_2.0         backports_1.5.0     
## [73] cards_0.7.1          gridtext_0.1.5       broom_1.0.11        
## [76] bslib_0.9.0          cardx_0.3.1          Rcpp_1.1.0          
## [79] svglite_2.2.2        xfun_0.55            forcats_1.0.1       
## [82] fs_1.6.6             zoo_1.8-15           pkgconfig_2.0.3

End of Analysis

Comprehensive Survival Analysis in R

BG

2026-01-03