1 Introduction

This document provides a comprehensive survival analysis workflow using the veteran dataset from the survival package. The dataset contains data from a randomized trial of two treatment regimens for lung cancer.

2 Load Required Packages

# Install packages if needed
required_packages <- c(
  "survival", "survminer", "ggplot2", "dplyr", 
  "tidyr", "gtsummary", "ggsurvfit", "gridExtra",
  "survMisc", "flexsurv", "knitr", "kableExtra"
)

# Install missing packages
installed <- installed.packages()[, "Package"]
to_install <- required_packages[!required_packages %in% installed]
if(length(to_install) > 0) {
  install.packages(to_install)
}

# Load packages
library(survival)
library(survminer)
library(ggplot2)
library(dplyr)
library(tidyr)
library(gtsummary)
library(ggsurvfit)
library(gridExtra)
library(survMisc)
library(flexsurv)
library(knitr)
library(kableExtra)

3 Data Loading and Exploration

3.1 Load the Veteran Dataset

# Load veteran dataset
data(veteran)

# Create a working copy
df <- veteran

# Display structure
str(df)
## 'data.frame':    137 obs. of  8 variables:
##  $ trt     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ celltype: Factor w/ 4 levels "squamous","smallcell",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time    : num  72 411 228 126 118 10 82 110 314 100 ...
##  $ status  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ karno   : num  60 70 60 60 70 20 40 80 50 70 ...
##  $ diagtime: num  7 5 3 9 11 5 10 29 18 6 ...
##  $ age     : num  69 64 38 63 65 49 69 68 43 70 ...
##  $ prior   : num  0 10 0 10 10 0 10 0 0 0 ...
# View first few rows
head(df) %>%
  kable(caption = "First 6 rows of veteran dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
First 6 rows of veteran dataset
trt celltype time status karno diagtime age prior
1 squamous 72 1 60 7 69 0
1 squamous 411 1 70 5 64 10
1 squamous 228 1 60 3 38 0
1 squamous 126 1 60 9 63 10
1 squamous 118 1 70 11 65 10
1 squamous 10 1 20 5 49 0

3.2 Data Description

The veteran dataset contains the following variables:

  • trt: Treatment (1=standard, 2=test)
  • celltype: Cell type (squamous, smallcell, adeno, large)
  • time: Survival time in days
  • status: Status (0=censored, 1=dead)
  • karno: Karnofsky performance score (0-100)
  • diagtime: Months from diagnosis to randomization
  • age: Age in years
  • prior: Prior therapy (0=no, 10=yes)

3.3 Data Preprocessing

# Convert variables to factors
df <- df %>%
  mutate(
    trt = factor(trt, levels = c(1, 2), 
                 labels = c("Standard", "Test")),
    celltype = factor(celltype),
    prior = factor(prior, levels = c(0, 10), 
                   labels = c("No", "Yes"))
  )

# Summary statistics
summary(df)
##        trt          celltype       time           status           karno      
##  Standard:69   squamous :35   Min.   :  1.0   Min.   :0.0000   Min.   :10.00  
##  Test    :68   smallcell:48   1st Qu.: 25.0   1st Qu.:1.0000   1st Qu.:40.00  
##                adeno    :27   Median : 80.0   Median :1.0000   Median :60.00  
##                large    :27   Mean   :121.6   Mean   :0.9343   Mean   :58.57  
##                               3rd Qu.:144.0   3rd Qu.:1.0000   3rd Qu.:75.00  
##                               Max.   :999.0   Max.   :1.0000   Max.   :99.00  
##     diagtime           age        prior   
##  Min.   : 1.000   Min.   :34.00   No :97  
##  1st Qu.: 3.000   1st Qu.:51.00   Yes:40  
##  Median : 5.000   Median :62.00           
##  Mean   : 8.774   Mean   :58.31           
##  3rd Qu.:11.000   3rd Qu.:66.00           
##  Max.   :87.000   Max.   :81.00

3.4 Descriptive Statistics

# Create summary table
df %>%
  select(time, status, age, karno, diagtime) %>%
  tbl_summary(
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    label = list(
      time ~ "Survival Time (days)",
      status ~ "Death",
      age ~ "Age (years)",
      karno ~ "Karnofsky Score",
      diagtime ~ "Diagnosis Time (months)"
    )
  ) %>%
  add_n() %>%
  bold_labels()
Characteristic N N = 1371
Survival Time (days) 137 122 (158)
Death 137 128 (93%)
Age (years) 137 58 (11)
Karnofsky Score 137 59 (20)
Diagnosis Time (months) 137 9 (11)
1 Mean (SD); n (%)

4 Kaplan-Meier Survival Analysis

4.1 Overall Survival Curve

# Fit overall survival
fit_overall <- survfit(Surv(time, status) ~ 1, data = df)

# Summary
summary(fit_overall, times = c(30, 60, 90, 180, 365))
## Call: survfit(formula = Surv(time, status) ~ 1, data = df)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    30     97      41    0.700  0.0392       0.6277        0.782
##    60     73      22    0.538  0.0427       0.4607        0.629
##    90     62      10    0.464  0.0428       0.3873        0.556
##   180     27      30    0.222  0.0369       0.1607        0.308
##   365     10      15    0.090  0.0265       0.0506        0.160
# Plot using survminer
ggsurvplot(
  fit_overall,
  data = df,
  conf.int = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  ggtheme = theme_minimal(),
  palette = "blue",
  title = "Overall Kaplan-Meier Survival Curve",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  break.time.by = 100,
  legend = "none"
)

4.2 Median Survival Time

# Calculate median survival
median_surv <- survfit(Surv(time, status) ~ 1, data = df)
print(median_surv)
## Call: survfit(formula = Surv(time, status) ~ 1, data = df)
## 
##        n events median 0.95LCL 0.95UCL
## [1,] 137    128     80      52     105
# Use gtsummary for clean table
survfit(Surv(time, status) ~ 1, data = df) %>%
  tbl_survfit(
    probs = 0.5,
    label_header = "**Median Survival (95% CI)**"
  )
Characteristic Median Survival (95% CI)
Overall 80 (52, 105)

4.3 Survival by Treatment

# Fit survival by treatment
fit_trt <- survfit(Surv(time, status) ~ trt, data = df)

# Plot
ggsurvplot(
  fit_trt,
  data = df,
  pval = TRUE,
  conf.int = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.3,
  ggtheme = theme_minimal(),
  palette = c("#E7B800", "#2E9FDF"),
  title = "Survival by Treatment Group",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  break.time.by = 100,
  legend.title = "Treatment",
  legend.labs = c("Standard", "Test")
)

4.4 Survival by Cell Type

# Fit survival by cell type
fit_cell <- survfit(Surv(time, status) ~ celltype, data = df)

# Plot
ggsurvplot(
  fit_cell,
  data = df,
  pval = TRUE,
  conf.int = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.3,
  ggtheme = theme_minimal(),
  title = "Survival by Cell Type",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  break.time.by = 100,
  legend.title = "Cell Type"
)

4.5 Log-Rank Test

# Log-rank test for treatment
logrank_trt <- survdiff(Surv(time, status) ~ trt, data = df)
print(logrank_trt)
## Call:
## survdiff(formula = Surv(time, status) ~ trt, data = df)
## 
##               N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=Standard 69       64     64.5   0.00388   0.00823
## trt=Test     68       64     63.5   0.00394   0.00823
## 
##  Chisq= 0  on 1 degrees of freedom, p= 0.9
# Log-rank test for cell type
logrank_cell <- survdiff(Surv(time, status) ~ celltype, data = df)
print(logrank_cell)
## Call:
## survdiff(formula = Surv(time, status) ~ celltype, data = df)
## 
##                     N Observed Expected (O-E)^2/E (O-E)^2/V
## celltype=squamous  35       31     47.7      5.82     10.53
## celltype=smallcell 48       45     30.1      7.37     10.20
## celltype=adeno     27       26     15.7      6.77      8.19
## celltype=large     27       26     34.5      2.12      3.02
## 
##  Chisq= 25.4  on 3 degrees of freedom, p= 1e-05
# Log-rank test for prior therapy
logrank_prior <- survdiff(Surv(time, status) ~ prior, data = df)
print(logrank_prior)
## Call:
## survdiff(formula = Surv(time, status) ~ prior, data = df)
## 
##            N Observed Expected (O-E)^2/E (O-E)^2/V
## prior=No  97       91     87.4     0.150     0.501
## prior=Yes 40       37     40.6     0.323     0.501
## 
##  Chisq= 0.5  on 1 degrees of freedom, p= 0.5

5 Cumulative Hazard Function

5.1 Overall Cumulative Hazard

# Calculate cumulative hazard
fit_overall <- survfit(Surv(time, status) ~ 1, data = df)

# Plot cumulative hazard
ggsurvplot(
  fit_overall,
  data = df,
  fun = "cumhaz",
  conf.int = TRUE,
  ggtheme = theme_minimal(),
  palette = "red",
  title = "Cumulative Hazard Function",
  xlab = "Time (days)",
  ylab = "Cumulative Hazard",
  break.time.by = 100
)

5.2 Cumulative Hazard by Treatment

# Plot cumulative hazard by treatment
ggsurvplot(
  fit_trt,
  data = df,
  fun = "cumhaz",
  conf.int = TRUE,
  ggtheme = theme_minimal(),
  palette = c("#E7B800", "#2E9FDF"),
  title = "Cumulative Hazard by Treatment",
  xlab = "Time (days)",
  ylab = "Cumulative Hazard",
  break.time.by = 100,
  legend.title = "Treatment"
)

5.3 Cumulative Hazard by Cell Type

# Plot cumulative hazard by cell type
ggsurvplot(
  fit_cell,
  data = df,
  fun = "cumhaz",
  conf.int = TRUE,
  ggtheme = theme_minimal(),
  title = "Cumulative Hazard by Cell Type",
  xlab = "Time (days)",
  ylab = "Cumulative Hazard",
  break.time.by = 100,
  legend.title = "Cell Type"
)

6 Cox Proportional Hazards Regression

6.1 Univariate Cox Regression

# Univariate Cox models for each variable
variables <- c("trt", "celltype", "karno", "diagtime", "age", "prior")

# Create table of univariate results
tbl_uvregression(
  df %>% select(time, status, all_of(variables)),
  method = coxph,
  y = Surv(time, status),
  exponentiate = TRUE,
  pvalue_fun = function(x) style_pvalue(x, digits = 3)
) %>%
  bold_p() %>%
  bold_labels()
Characteristic N HR 95% CI p-value
trt 137


    Standard
— —
    Test
1.02 0.71, 1.45 0.922
celltype 137


    squamous
— —
    smallcell
2.72 1.66, 4.47 <0.001
    adeno
3.15 1.77, 5.59 <0.001
    large
1.26 0.73, 2.17 0.407
karno 137 0.97 0.96, 0.98 <0.001
diagtime 137 1.01 0.99, 1.03 0.311
age 137 1.01 0.99, 1.03 0.433
prior 137


    No
— —
    Yes
0.87 0.59, 1.28 0.476
Abbreviations: CI = Confidence Interval, HR = Hazard Ratio

6.2 Multivariate Cox Regression

6.2.1 Model 1: All Variables

# Fit multivariate Cox model
cox_mv <- coxph(
  Surv(time, status) ~ trt + celltype + karno + diagtime + age + prior,
  data = df
)

# Summary
summary(cox_mv)
## Call:
## coxph(formula = Surv(time, status) ~ trt + celltype + karno + 
##     diagtime + age + prior, data = df)
## 
##   n= 137, number of events= 128 
## 
##                         coef  exp(coef)   se(coef)      z Pr(>|z|)    
## trtTest            2.946e-01  1.343e+00  2.075e-01  1.419  0.15577    
## celltypesmallcell  8.616e-01  2.367e+00  2.753e-01  3.130  0.00175 ** 
## celltypeadeno      1.196e+00  3.307e+00  3.009e-01  3.975 7.05e-05 ***
## celltypelarge      4.013e-01  1.494e+00  2.827e-01  1.420  0.15574    
## karno             -3.282e-02  9.677e-01  5.508e-03 -5.958 2.55e-09 ***
## diagtime           8.132e-05  1.000e+00  9.136e-03  0.009  0.99290    
## age               -8.706e-03  9.913e-01  9.300e-03 -0.936  0.34920    
## priorYes           7.159e-02  1.074e+00  2.323e-01  0.308  0.75794    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## trtTest              1.3426     0.7448    0.8939    2.0166
## celltypesmallcell    2.3669     0.4225    1.3799    4.0597
## celltypeadeno        3.3071     0.3024    1.8336    5.9647
## celltypelarge        1.4938     0.6695    0.8583    2.5996
## karno                0.9677     1.0334    0.9573    0.9782
## diagtime             1.0001     0.9999    0.9823    1.0182
## age                  0.9913     1.0087    0.9734    1.0096
## priorYes             1.0742     0.9309    0.6813    1.6937
## 
## Concordance= 0.736  (se = 0.021 )
## Likelihood ratio test= 62.1  on 8 df,   p=2e-10
## Wald test            = 62.37  on 8 df,   p=2e-10
## Score (logrank) test = 66.74  on 8 df,   p=2e-11
# Create formatted table
tbl_regression(
  cox_mv,
  exponentiate = TRUE,
  pvalue_fun = function(x) style_pvalue(x, digits = 3)
) %>%
  bold_p() %>%
  bold_labels()
Characteristic HR 95% CI p-value
trt


    Standard — —
    Test 1.34 0.89, 2.02 0.156
celltype


    squamous — —
    smallcell 2.37 1.38, 4.06 0.002
    adeno 3.31 1.83, 5.96 <0.001
    large 1.49 0.86, 2.60 0.156
karno 0.97 0.96, 0.98 <0.001
diagtime 1.00 0.98, 1.02 0.993
age 0.99 0.97, 1.01 0.349
prior


    No — —
    Yes 1.07 0.68, 1.69 0.758
Abbreviations: CI = Confidence Interval, HR = Hazard Ratio

6.2.2 Model Comparison with AIC

# Fit different models
cox_m1 <- coxph(Surv(time, status) ~ trt, data = df)
cox_m2 <- coxph(Surv(time, status) ~ trt + celltype, data = df)
cox_m3 <- coxph(Surv(time, status) ~ trt + celltype + karno, data = df)
cox_m4 <- cox_mv  # Full model

# Compare AIC
data.frame(
  Model = c("Treatment only", 
            "Treatment + Cell Type", 
            "Treatment + Cell Type + Karno",
            "Full Model"),
  AIC = c(AIC(cox_m1), AIC(cox_m2), AIC(cox_m3), AIC(cox_m4)),
  Variables = c(1, 2, 3, 7)
) %>%
  arrange(AIC) %>%
  kable(caption = "Model Comparison using AIC") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Model Comparison using AIC
Model AIC Variables
Treatment + Cell Type + Karno 959.8290 3
Full Model 964.7942 7
Treatment + Cell Type 993.0407 2
Treatment only 1012.8885 1

6.3 Forest Plot

# Forest plot for multivariate model
ggforest(
  cox_mv,
  data = df,
  main = "Hazard Ratios with 95% Confidence Intervals",
  fontsize = 0.8
)

7 Testing Proportional Hazards Assumption

7.1 Schoenfeld Residuals Test

# Test proportional hazards assumption
ph_test <- cox.zph(cox_mv)
print(ph_test)
##            chisq df       p
## trt       0.2644  1 0.60712
## celltype 15.2274  3 0.00163
## karno    12.9352  1 0.00032
## diagtime  0.0129  1 0.90961
## age       1.8288  1 0.17627
## prior     2.1656  1 0.14113
## GLOBAL   34.5525  8 3.2e-05
# Interpretation table
data.frame(
  Variable = rownames(ph_test$table),
  ChiSq = ph_test$table[, "chisq"],
  P_value = ph_test$table[, "p"],
  Conclusion = ifelse(
    ph_test$table[, "p"] > 0.05,
    "Assumption Met",
    "Assumption Violated"
  )
) %>%
  kable(caption = "Proportional Hazards Assumption Test") %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(which(ph_test$table[, "p"] <= 0.05), 
           background = "#ffcccc")
Proportional Hazards Assumption Test
Variable ChiSq P_value Conclusion
trt trt 0.2643888 0.6071209 Assumption Met
celltype celltype 15.2273639 0.0016323 Assumption Violated
karno karno 12.9351978 0.0003225 Assumption Violated
diagtime diagtime 0.0128894 0.9096093 Assumption Met
age age 1.8288266 0.1762662 Assumption Met
prior prior 2.1655663 0.1411326 Assumption Met
GLOBAL GLOBAL 34.5524911 0.0000323 Assumption Violated

7.2 Schoenfeld Residuals Plots

# Plot Schoenfeld residuals
par(mfrow = c(3, 3))
plot(ph_test)
par(mfrow = c(1, 1))

7.3 Alternative: Log-Log Survival Plots

# Log-log survival plot for treatment
ggsurvplot(
  fit_trt,
  data = df,
  fun = "cloglog",
  ggtheme = theme_minimal(),
  palette = c("#E7B800", "#2E9FDF"),
  title = "Log-Log Survival Plot by Treatment",
  xlab = "Log(Time)",
  ylab = "Log(-Log(Survival))",
  legend.title = "Treatment"
)

# Log-log survival plot for cell type
ggsurvplot(
  fit_cell,
  data = df,
  fun = "cloglog",
  ggtheme = theme_minimal(),
  title = "Log-Log Survival Plot by Cell Type",
  xlab = "Log(Time)",
  ylab = "Log(-Log(Survival))",
  legend.title = "Cell Type"
)

8 Model Diagnostics

8.1 Cox-Snell Residuals

# Calculate Cox-Snell residuals
cox_snell <- df$status - residuals(cox_mv, type = "martingale")

# Fit survival to Cox-Snell residuals
cs_fit <- survfit(Surv(cox_snell, df$status) ~ 1)

# Plot
plot(
  cs_fit$time, 
  -log(cs_fit$surv),
  type = "s",
  xlab = "Cox-Snell Residuals",
  ylab = "Cumulative Hazard",
  main = "Cox-Snell Residual Plot\n(Should follow 45-degree line)"
)
abline(0, 1, col = "red", lty = 2)
legend("bottomright", 
       legend = "45-degree reference line", 
       col = "red", 
       lty = 2)

8.2 Martingale Residuals

# Calculate martingale residuals
mart_resid <- residuals(cox_mv, type = "martingale")

# Plot against continuous variables
par(mfrow = c(2, 2))

plot(df$karno, mart_resid,
     xlab = "Karnofsky Score",
     ylab = "Martingale Residuals",
     main = "Martingale Residuals vs Karno")
abline(h = 0, col = "red", lty = 2)
lines(lowess(df$karno, mart_resid), col = "blue", lwd = 2)

plot(df$age, mart_resid,
     xlab = "Age",
     ylab = "Martingale Residuals",
     main = "Martingale Residuals vs Age")
abline(h = 0, col = "red", lty = 2)
lines(lowess(df$age, mart_resid), col = "blue", lwd = 2)

plot(df$diagtime, mart_resid,
     xlab = "Diagnosis Time",
     ylab = "Martingale Residuals",
     main = "Martingale Residuals vs Diagtime")
abline(h = 0, col = "red", lty = 2)
lines(lowess(df$diagtime, mart_resid), col = "blue", lwd = 2)

par(mfrow = c(1, 1))

8.3 Deviance Residuals

# Calculate deviance residuals
dev_resid <- residuals(cox_mv, type = "deviance")

# Plot deviance residuals
plot(
  predict(cox_mv),
  dev_resid,
  xlab = "Linear Predictor",
  ylab = "Deviance Residuals",
  main = "Deviance Residuals vs Linear Predictor"
)
abline(h = 0, col = "red", lty = 2)
lines(lowess(predict(cox_mv), dev_resid), col = "blue", lwd = 2)

# Identify potential outliers
outliers <- which(abs(dev_resid) > 3)
if(length(outliers) > 0) {
  text(
    predict(cox_mv)[outliers],
    dev_resid[outliers],
    labels = outliers,
    pos = 3,
    col = "red"
  )
}

8.4 Influential Observations (DFBETAs)

# Calculate DFBETAs
dfbeta_vals <- residuals(cox_mv, type = "dfbeta")

# Check column names
colnames(dfbeta_vals)
## NULL
# Plot for key variables (use actual column names)
par(mfrow = c(2, 2))

# Treatment effect
trt_col <- grep("trt", colnames(dfbeta_vals), ignore.case = TRUE)[1]
if(!is.na(trt_col)) {
  plot(dfbeta_vals[, trt_col],
       ylab = "DFBETAs for Treatment",
       main = "Influence on Treatment Effect",
       type = "h")
  abline(h = 0, col = "red", lty = 2)
}

# Karno effect
karno_col <- grep("karno", colnames(dfbeta_vals), ignore.case = TRUE)[1]
if(!is.na(karno_col)) {
  plot(dfbeta_vals[, karno_col],
       ylab = "DFBETAs for Karno",
       main = "Influence on Karno Effect",
       type = "h")
  abline(h = 0, col = "red", lty = 2)
}

# Age effect
age_col <- grep("age", colnames(dfbeta_vals), ignore.case = TRUE)[1]
if(!is.na(age_col)) {
  plot(dfbeta_vals[, age_col],
       ylab = "DFBETAs for Age",
       main = "Influence on Age Effect",
       type = "h")
  abline(h = 0, col = "red", lty = 2)
}

par(mfrow = c(1, 1))

9 Stratified Cox Model

If proportional hazards assumption is violated for a variable, we can stratify by it.

# Example: Stratify by cell type if assumption violated
cox_strat <- coxph(
  Surv(time, status) ~ trt + strata(celltype) + karno + diagtime + age + prior,
  data = df
)

summary(cox_strat)
## Call:
## coxph(formula = Surv(time, status) ~ trt + strata(celltype) + 
##     karno + diagtime + age + prior, data = df)
## 
##   n= 137, number of events= 128 
## 
##               coef exp(coef)  se(coef)      z Pr(>|z|)    
## trtTest   0.285902  1.330962  0.210009  1.361    0.173    
## karno    -0.038262  0.962461  0.005932 -6.450 1.12e-10 ***
## diagtime -0.003439  0.996567  0.009075 -0.379    0.705    
## age      -0.011821  0.988249  0.009846 -1.201    0.230    
## priorYes  0.169069  1.184201  0.235667  0.717    0.473    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##          exp(coef) exp(-coef) lower .95 upper .95
## trtTest     1.3310     0.7513    0.8819    2.0087
## karno       0.9625     1.0390    0.9513    0.9737
## diagtime    0.9966     1.0034    0.9790    1.0145
## age         0.9882     1.0119    0.9694    1.0075
## priorYes    1.1842     0.8445    0.7461    1.8794
## 
## Concordance= 0.704  (se = 0.027 )
## Likelihood ratio test= 44.27  on 5 df,   p=2e-08
## Wald test            = 43.63  on 5 df,   p=3e-08
## Score (logrank) test = 47.01  on 5 df,   p=6e-09
# Compare with unstratified model
data.frame(
  Model = c("Unstratified", "Stratified by Cell Type"),
  AIC = c(AIC(cox_mv), AIC(cox_strat)),
  Concordance = c(
    summary(cox_mv)$concordance[1],
    summary(cox_strat)$concordance[1]
  )
) %>%
  kable(caption = "Comparison of Stratified vs Unstratified Models") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Comparison of Stratified vs Unstratified Models
Model AIC Concordance
Unstratified 964.7942 0.7360291
Stratified by Cell Type 643.2026 0.7038814

10 Parametric Survival Models

10.1 Exponential Model

# Fit exponential model
exp_model <- survreg(
  Surv(time, status) ~ trt + celltype + karno + diagtime + age + prior,
  data = df,
  dist = "exponential"
)

summary(exp_model)
## 
## Call:
## survreg(formula = Surv(time, status) ~ trt + celltype + karno + 
##     diagtime + age + prior, data = df, dist = "exponential")
##                       Value Std. Error     z       p
## (Intercept)        3.188611   0.704033  4.53 5.9e-06
## trtTest           -0.219565   0.198634 -1.11  0.2690
## celltypesmallcell -0.820245   0.262111 -3.13  0.0018
## celltypeadeno     -1.113121   0.275825 -4.04 5.4e-05
## celltypelarge     -0.377220   0.272626 -1.38  0.1665
## karno              0.030624   0.005108  6.00 2.0e-09
## diagtime          -0.000297   0.008970 -0.03  0.9736
## age                0.006108   0.009161  0.67  0.5049
## priorYes          -0.049482   0.226871 -0.22  0.8273
## 
## Scale fixed at 1 
## 
## Exponential distribution
## Loglik(model)= -716.2   Loglik(intercept only)= -751.2
##  Chisq= 70.12 on 8 degrees of freedom, p= 4.6e-12 
## Number of Newton-Raphson Iterations: 5 
## n= 137

10.2 Weibull Model

# Fit Weibull model
weibull_model <- survreg(
  Surv(time, status) ~ trt + celltype + karno + diagtime + age + prior,
  data = df,
  dist = "weibull"
)

summary(weibull_model)
## 
## Call:
## survreg(formula = Surv(time, status) ~ trt + celltype + karno + 
##     diagtime + age + prior, data = df, dist = "weibull")
##                       Value Std. Error     z       p
## (Intercept)        3.262014   0.662531  4.92 8.5e-07
## trtTest           -0.228523   0.186844 -1.22  0.2213
## celltypesmallcell -0.826185   0.246312 -3.35  0.0008
## celltypeadeno     -1.132725   0.257598 -4.40 1.1e-05
## celltypelarge     -0.397681   0.254749 -1.56  0.1185
## karno              0.030068   0.004828  6.23 4.7e-10
## diagtime          -0.000469   0.008361 -0.06  0.9553
## age                0.006099   0.008553  0.71  0.4758
## priorYes          -0.043898   0.212279 -0.21  0.8362
## Log(scale)        -0.074599   0.066311 -1.12  0.2606
## 
## Scale= 0.928 
## 
## Weibull distribution
## Loglik(model)= -715.6   Loglik(intercept only)= -748.1
##  Chisq= 65.08 on 8 degrees of freedom, p= 4.7e-11 
## Number of Newton-Raphson Iterations: 6 
## n= 137
# Compare AIC
data.frame(
  Model = c("Exponential", "Weibull", "Cox PH"),
  AIC = c(AIC(exp_model), AIC(weibull_model), AIC(cox_mv))
) %>%
  arrange(AIC) %>%
  kable(caption = "Comparison of Parametric Models") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Comparison of Parametric Models
Model AIC
Cox PH 964.7942
Exponential 1450.3182
Weibull 1451.1027

11 Risk Score and Predicted Survival

11.1 Calculate Risk Score

# Calculate risk score (linear predictor)
df$risk_score <- predict(cox_mv, type = "lp")

# Categorize into risk groups
df$risk_group <- cut(
  df$risk_score,
  breaks = quantile(df$risk_score, probs = c(0, 0.33, 0.67, 1)),
  labels = c("Low", "Medium", "High"),
  include.lowest = TRUE
)

# Summary by risk group
table(df$risk_group) %>%
  kable(col.names = c("Risk Group", "N"), 
        caption = "Distribution of Risk Groups") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Distribution of Risk Groups
Risk Group N
Low 45
Medium 47
High 45

11.2 Survival by Risk Group

# Fit survival by risk group
fit_risk <- survfit(Surv(time, status) ~ risk_group, data = df)

# Plot
ggsurvplot(
  fit_risk,
  data = df,
  pval = TRUE,
  conf.int = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.3,
  ggtheme = theme_minimal(),
  palette = c("green", "orange", "red"),
  title = "Survival by Predicted Risk Group",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  break.time.by = 100,
  legend.title = "Risk Group",
  legend.labs = c("Low Risk", "Medium Risk", "High Risk")
)

12 Model Performance

12.1 Concordance Index (C-index)

# Calculate C-index for different models
c_index_results <- data.frame(
  Model = c(
    "Treatment only",
    "Treatment + Cell Type",
    "Treatment + Cell Type + Karno",
    "Full Model"
  ),
  C_Index = c(
    summary(cox_m1)$concordance[1],
    summary(cox_m2)$concordance[1],
    summary(cox_m3)$concordance[1],
    summary(cox_mv)$concordance[1]
  )
)

c_index_results %>%
  arrange(desc(C_Index)) %>%
  kable(caption = "C-Index for Different Models", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
C-Index for Different Models
Model C_Index
Treatment + Cell Type + Karno 0.737
Full Model 0.736
Treatment + Cell Type 0.619
Treatment only 0.525

13 Summary and Conclusions

13.1 Key Findings

# Get median survival value
median_val <- survfit(Surv(time, status) ~ 1, data = df)

# Create summary of key results
summary_df <- data.frame(
  Analysis = c(
    "Median Survival",
    "Treatment Effect (HR)",
    "Cell Type Effect",
    "Karno Score Effect (per unit)",
    "Age Effect (per year)",
    "Best Model by AIC",
    "Proportional Hazards Assumption",
    "C-Index (Full Model)"
  ),
  Result = c(
    paste0(median_val$median, " days"),
    sprintf("%.2f (p = %.3f)", 
            exp(coef(cox_mv)["trtTest"]), 
            summary(cox_mv)$coefficients["trtTest", "Pr(>|z|)"]),
    "Significant (p < 0.001)",
    sprintf("%.3f (p < 0.001)", exp(coef(cox_mv)["karno"])),
    sprintf("%.3f (p = %.3f)", 
            exp(coef(cox_mv)["age"]),
            summary(cox_mv)$coefficients["age", "Pr(>|z|)"]),
    "Full Model",
    ifelse(ph_test$table["GLOBAL", "p"] > 0.05, "Met", "Review Required"),
    sprintf("%.3f", summary(cox_mv)$concordance[1])
  )
)

summary_df %>%
  kable(caption = "Summary of Key Results") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Summary of Key Results
Analysis Result
Median Survival days
Treatment Effect (HR) 1.34 (p = 0.156)
Cell Type Effect Significant (p < 0.001)
Karno Score Effect (per unit) 0.968 (p < 0.001)
Age Effect (per year) 0.991 (p = 0.349)
Best Model by AIC Full Model
Proportional Hazards Assumption Review Required
C-Index (Full Model) 0.736

13.2 Session Information

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Toronto
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0 knitr_1.51       flexsurv_2.3.2   survMisc_0.5.6  
##  [5] gridExtra_2.3    ggsurvfit_1.2.0  gtsummary_2.5.0  tidyr_1.3.2     
##  [9] dplyr_1.1.4      survminer_0.5.1  ggpubr_0.6.2     ggplot2_4.0.1   
## [13] survival_3.8-3  
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     viridisLite_0.4.2    farver_2.1.2        
##  [4] S7_0.2.1             fastmap_1.2.0        broom.helpers_1.22.0
##  [7] labelled_2.16.0      digest_0.6.39        lifecycle_1.0.4     
## [10] statmod_1.5.1        magrittr_2.0.4       compiler_4.5.2      
## [13] rlang_1.1.6          sass_0.4.10          tools_4.5.2         
## [16] yaml_2.3.12          gt_1.2.0             data.table_1.18.0   
## [19] ggsignif_0.6.4       labeling_0.4.3       xml2_1.5.1          
## [22] RColorBrewer_1.1-3   abind_1.4-8          withr_3.0.2         
## [25] purrr_1.2.0          numDeriv_2016.8-1.1  grid_4.5.2          
## [28] mstate_0.3.3         xtable_1.8-4         scales_1.4.0        
## [31] dichromat_2.0-0.1    cli_3.6.5            mvtnorm_1.3-3       
## [34] rmarkdown_2.30       generics_0.1.4       otel_0.2.0          
## [37] rstudioapi_0.17.1    km.ci_0.5-6          commonmark_2.0.0    
## [40] cachem_1.1.0         stringr_1.6.0        splines_4.5.2       
## [43] muhaz_1.2.6.4        vctrs_0.6.5          Matrix_1.7-4        
## [46] jsonlite_2.0.0       carData_3.0-5        car_3.1-3           
## [49] litedown_0.9         hms_1.1.4            rstatix_0.7.3       
## [52] Formula_1.2-5        systemfonts_1.3.1    jquerylib_0.1.4     
## [55] glue_1.8.0           cowplot_1.2.0        ggtext_0.1.2        
## [58] stringi_1.8.7        gtable_0.3.6         quadprog_1.5-8      
## [61] tibble_3.3.0         pillar_1.11.1        htmltools_0.5.9     
## [64] deSolve_1.40         R6_2.6.1             KMsurv_0.1-6        
## [67] textshaping_1.0.4    evaluate_1.0.5       lattice_0.22-7      
## [70] haven_2.5.5          markdown_2.0         backports_1.5.0     
## [73] cards_0.7.1          gridtext_0.1.5       broom_1.0.11        
## [76] bslib_0.9.0          cardx_0.3.1          Rcpp_1.1.0          
## [79] svglite_2.2.2        xfun_0.55            forcats_1.0.1       
## [82] fs_1.6.6             zoo_1.8-15           pkgconfig_2.0.3

End of Analysis