Load Required
Packages
# Install packages if needed
required_packages <- c(
"survival", "survminer", "ggplot2", "dplyr",
"tidyr", "gtsummary", "ggsurvfit", "gridExtra",
"survMisc", "flexsurv", "knitr", "kableExtra"
)
# Install missing packages
installed <- installed.packages()[, "Package"]
to_install <- required_packages[!required_packages %in% installed]
if(length(to_install) > 0) {
install.packages(to_install)
}
# Load packages
library(survival)
library(survminer)
library(ggplot2)
library(dplyr)
library(tidyr)
library(gtsummary)
library(ggsurvfit)
library(gridExtra)
library(survMisc)
library(flexsurv)
library(knitr)
library(kableExtra)
Data Loading and
Exploration
Load the Veteran
Dataset
# Load veteran dataset
data(veteran)
# Create a working copy
df <- veteran
# Display structure
str(df)
## 'data.frame': 137 obs. of 8 variables:
## $ trt : num 1 1 1 1 1 1 1 1 1 1 ...
## $ celltype: Factor w/ 4 levels "squamous","smallcell",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : num 72 411 228 126 118 10 82 110 314 100 ...
## $ status : num 1 1 1 1 1 1 1 1 1 0 ...
## $ karno : num 60 70 60 60 70 20 40 80 50 70 ...
## $ diagtime: num 7 5 3 9 11 5 10 29 18 6 ...
## $ age : num 69 64 38 63 65 49 69 68 43 70 ...
## $ prior : num 0 10 0 10 10 0 10 0 0 0 ...
# View first few rows
head(df) %>%
kable(caption = "First 6 rows of veteran dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
First 6 rows of veteran dataset
|
trt
|
celltype
|
time
|
status
|
karno
|
diagtime
|
age
|
prior
|
|
1
|
squamous
|
72
|
1
|
60
|
7
|
69
|
0
|
|
1
|
squamous
|
411
|
1
|
70
|
5
|
64
|
10
|
|
1
|
squamous
|
228
|
1
|
60
|
3
|
38
|
0
|
|
1
|
squamous
|
126
|
1
|
60
|
9
|
63
|
10
|
|
1
|
squamous
|
118
|
1
|
70
|
11
|
65
|
10
|
|
1
|
squamous
|
10
|
1
|
20
|
5
|
49
|
0
|
Data Description
The veteran dataset contains the following
variables:
- trt: Treatment (1=standard, 2=test)
- celltype: Cell type (squamous, smallcell, adeno,
large)
- time: Survival time in days
- status: Status (0=censored, 1=dead)
- karno: Karnofsky performance score (0-100)
- diagtime: Months from diagnosis to
randomization
- age: Age in years
- prior: Prior therapy (0=no, 10=yes)
Data
Preprocessing
# Convert variables to factors
df <- df %>%
mutate(
trt = factor(trt, levels = c(1, 2),
labels = c("Standard", "Test")),
celltype = factor(celltype),
prior = factor(prior, levels = c(0, 10),
labels = c("No", "Yes"))
)
# Summary statistics
summary(df)
## trt celltype time status karno
## Standard:69 squamous :35 Min. : 1.0 Min. :0.0000 Min. :10.00
## Test :68 smallcell:48 1st Qu.: 25.0 1st Qu.:1.0000 1st Qu.:40.00
## adeno :27 Median : 80.0 Median :1.0000 Median :60.00
## large :27 Mean :121.6 Mean :0.9343 Mean :58.57
## 3rd Qu.:144.0 3rd Qu.:1.0000 3rd Qu.:75.00
## Max. :999.0 Max. :1.0000 Max. :99.00
## diagtime age prior
## Min. : 1.000 Min. :34.00 No :97
## 1st Qu.: 3.000 1st Qu.:51.00 Yes:40
## Median : 5.000 Median :62.00
## Mean : 8.774 Mean :58.31
## 3rd Qu.:11.000 3rd Qu.:66.00
## Max. :87.000 Max. :81.00
Descriptive
Statistics
# Create summary table
df %>%
select(time, status, age, karno, diagtime) %>%
tbl_summary(
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
label = list(
time ~ "Survival Time (days)",
status ~ "Death",
age ~ "Age (years)",
karno ~ "Karnofsky Score",
diagtime ~ "Diagnosis Time (months)"
)
) %>%
add_n() %>%
bold_labels()
| Characteristic |
N |
N = 137 |
| Survival Time (days) |
137 |
122 (158) |
| Death |
137 |
128 (93%) |
| Age (years) |
137 |
58 (11) |
| Karnofsky Score |
137 |
59 (20) |
| Diagnosis Time (months) |
137 |
9 (11) |
Kaplan-Meier Survival
Analysis
Overall Survival
Curve
# Fit overall survival
fit_overall <- survfit(Surv(time, status) ~ 1, data = df)
# Summary
summary(fit_overall, times = c(30, 60, 90, 180, 365))
## Call: survfit(formula = Surv(time, status) ~ 1, data = df)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 30 97 41 0.700 0.0392 0.6277 0.782
## 60 73 22 0.538 0.0427 0.4607 0.629
## 90 62 10 0.464 0.0428 0.3873 0.556
## 180 27 30 0.222 0.0369 0.1607 0.308
## 365 10 15 0.090 0.0265 0.0506 0.160
# Plot using survminer
ggsurvplot(
fit_overall,
data = df,
conf.int = TRUE,
risk.table = TRUE,
risk.table.height = 0.25,
ggtheme = theme_minimal(),
palette = "blue",
title = "Overall Kaplan-Meier Survival Curve",
xlab = "Time (days)",
ylab = "Survival Probability",
break.time.by = 100,
legend = "none"
)

Survival by
Treatment
# Fit survival by treatment
fit_trt <- survfit(Surv(time, status) ~ trt, data = df)
# Plot
ggsurvplot(
fit_trt,
data = df,
pval = TRUE,
conf.int = TRUE,
risk.table = TRUE,
risk.table.height = 0.3,
ggtheme = theme_minimal(),
palette = c("#E7B800", "#2E9FDF"),
title = "Survival by Treatment Group",
xlab = "Time (days)",
ylab = "Survival Probability",
break.time.by = 100,
legend.title = "Treatment",
legend.labs = c("Standard", "Test")
)

Survival by Cell
Type
# Fit survival by cell type
fit_cell <- survfit(Surv(time, status) ~ celltype, data = df)
# Plot
ggsurvplot(
fit_cell,
data = df,
pval = TRUE,
conf.int = TRUE,
risk.table = TRUE,
risk.table.height = 0.3,
ggtheme = theme_minimal(),
title = "Survival by Cell Type",
xlab = "Time (days)",
ylab = "Survival Probability",
break.time.by = 100,
legend.title = "Cell Type"
)

Log-Rank Test
# Log-rank test for treatment
logrank_trt <- survdiff(Surv(time, status) ~ trt, data = df)
print(logrank_trt)
## Call:
## survdiff(formula = Surv(time, status) ~ trt, data = df)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=Standard 69 64 64.5 0.00388 0.00823
## trt=Test 68 64 63.5 0.00394 0.00823
##
## Chisq= 0 on 1 degrees of freedom, p= 0.9
# Log-rank test for cell type
logrank_cell <- survdiff(Surv(time, status) ~ celltype, data = df)
print(logrank_cell)
## Call:
## survdiff(formula = Surv(time, status) ~ celltype, data = df)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## celltype=squamous 35 31 47.7 5.82 10.53
## celltype=smallcell 48 45 30.1 7.37 10.20
## celltype=adeno 27 26 15.7 6.77 8.19
## celltype=large 27 26 34.5 2.12 3.02
##
## Chisq= 25.4 on 3 degrees of freedom, p= 1e-05
# Log-rank test for prior therapy
logrank_prior <- survdiff(Surv(time, status) ~ prior, data = df)
print(logrank_prior)
## Call:
## survdiff(formula = Surv(time, status) ~ prior, data = df)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## prior=No 97 91 87.4 0.150 0.501
## prior=Yes 40 37 40.6 0.323 0.501
##
## Chisq= 0.5 on 1 degrees of freedom, p= 0.5
Cumulative Hazard
Function
Overall Cumulative
Hazard
# Calculate cumulative hazard
fit_overall <- survfit(Surv(time, status) ~ 1, data = df)
# Plot cumulative hazard
ggsurvplot(
fit_overall,
data = df,
fun = "cumhaz",
conf.int = TRUE,
ggtheme = theme_minimal(),
palette = "red",
title = "Cumulative Hazard Function",
xlab = "Time (days)",
ylab = "Cumulative Hazard",
break.time.by = 100
)

Cumulative Hazard by
Treatment
# Plot cumulative hazard by treatment
ggsurvplot(
fit_trt,
data = df,
fun = "cumhaz",
conf.int = TRUE,
ggtheme = theme_minimal(),
palette = c("#E7B800", "#2E9FDF"),
title = "Cumulative Hazard by Treatment",
xlab = "Time (days)",
ylab = "Cumulative Hazard",
break.time.by = 100,
legend.title = "Treatment"
)

Cumulative Hazard by
Cell Type
# Plot cumulative hazard by cell type
ggsurvplot(
fit_cell,
data = df,
fun = "cumhaz",
conf.int = TRUE,
ggtheme = theme_minimal(),
title = "Cumulative Hazard by Cell Type",
xlab = "Time (days)",
ylab = "Cumulative Hazard",
break.time.by = 100,
legend.title = "Cell Type"
)

Cox Proportional
Hazards Regression
Univariate Cox
Regression
# Univariate Cox models for each variable
variables <- c("trt", "celltype", "karno", "diagtime", "age", "prior")
# Create table of univariate results
tbl_uvregression(
df %>% select(time, status, all_of(variables)),
method = coxph,
y = Surv(time, status),
exponentiate = TRUE,
pvalue_fun = function(x) style_pvalue(x, digits = 3)
) %>%
bold_p() %>%
bold_labels()
| Characteristic |
N |
HR |
95% CI |
p-value |
| trt |
137 |
|
|
|
| Â Â Â Â Standard |
|
— |
— |
|
| Â Â Â Â Test |
|
1.02 |
0.71, 1.45 |
0.922 |
| celltype |
137 |
|
|
|
| Â Â Â Â squamous |
|
— |
— |
|
| Â Â Â Â smallcell |
|
2.72 |
1.66, 4.47 |
<0.001 |
| Â Â Â Â adeno |
|
3.15 |
1.77, 5.59 |
<0.001 |
| Â Â Â Â large |
|
1.26 |
0.73, 2.17 |
0.407 |
| karno |
137 |
0.97 |
0.96, 0.98 |
<0.001 |
| diagtime |
137 |
1.01 |
0.99, 1.03 |
0.311 |
| age |
137 |
1.01 |
0.99, 1.03 |
0.433 |
| prior |
137 |
|
|
|
| Â Â Â Â No |
|
— |
— |
|
| Â Â Â Â Yes |
|
0.87 |
0.59, 1.28 |
0.476 |
| Abbreviations: CI = Confidence Interval, HR = Hazard Ratio |
Multivariate Cox
Regression
Model 1: All
Variables
# Fit multivariate Cox model
cox_mv <- coxph(
Surv(time, status) ~ trt + celltype + karno + diagtime + age + prior,
data = df
)
# Summary
summary(cox_mv)
## Call:
## coxph(formula = Surv(time, status) ~ trt + celltype + karno +
## diagtime + age + prior, data = df)
##
## n= 137, number of events= 128
##
## coef exp(coef) se(coef) z Pr(>|z|)
## trtTest 2.946e-01 1.343e+00 2.075e-01 1.419 0.15577
## celltypesmallcell 8.616e-01 2.367e+00 2.753e-01 3.130 0.00175 **
## celltypeadeno 1.196e+00 3.307e+00 3.009e-01 3.975 7.05e-05 ***
## celltypelarge 4.013e-01 1.494e+00 2.827e-01 1.420 0.15574
## karno -3.282e-02 9.677e-01 5.508e-03 -5.958 2.55e-09 ***
## diagtime 8.132e-05 1.000e+00 9.136e-03 0.009 0.99290
## age -8.706e-03 9.913e-01 9.300e-03 -0.936 0.34920
## priorYes 7.159e-02 1.074e+00 2.323e-01 0.308 0.75794
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## trtTest 1.3426 0.7448 0.8939 2.0166
## celltypesmallcell 2.3669 0.4225 1.3799 4.0597
## celltypeadeno 3.3071 0.3024 1.8336 5.9647
## celltypelarge 1.4938 0.6695 0.8583 2.5996
## karno 0.9677 1.0334 0.9573 0.9782
## diagtime 1.0001 0.9999 0.9823 1.0182
## age 0.9913 1.0087 0.9734 1.0096
## priorYes 1.0742 0.9309 0.6813 1.6937
##
## Concordance= 0.736 (se = 0.021 )
## Likelihood ratio test= 62.1 on 8 df, p=2e-10
## Wald test = 62.37 on 8 df, p=2e-10
## Score (logrank) test = 66.74 on 8 df, p=2e-11
# Create formatted table
tbl_regression(
cox_mv,
exponentiate = TRUE,
pvalue_fun = function(x) style_pvalue(x, digits = 3)
) %>%
bold_p() %>%
bold_labels()
| Characteristic |
HR |
95% CI |
p-value |
| trt |
|
|
|
| Â Â Â Â Standard |
— |
— |
|
| Â Â Â Â Test |
1.34 |
0.89, 2.02 |
0.156 |
| celltype |
|
|
|
| Â Â Â Â squamous |
— |
— |
|
| Â Â Â Â smallcell |
2.37 |
1.38, 4.06 |
0.002 |
| Â Â Â Â adeno |
3.31 |
1.83, 5.96 |
<0.001 |
| Â Â Â Â large |
1.49 |
0.86, 2.60 |
0.156 |
| karno |
0.97 |
0.96, 0.98 |
<0.001 |
| diagtime |
1.00 |
0.98, 1.02 |
0.993 |
| age |
0.99 |
0.97, 1.01 |
0.349 |
| prior |
|
|
|
| Â Â Â Â No |
— |
— |
|
| Â Â Â Â Yes |
1.07 |
0.68, 1.69 |
0.758 |
| Abbreviations: CI = Confidence Interval, HR = Hazard Ratio |
Model Comparison
with AIC
# Fit different models
cox_m1 <- coxph(Surv(time, status) ~ trt, data = df)
cox_m2 <- coxph(Surv(time, status) ~ trt + celltype, data = df)
cox_m3 <- coxph(Surv(time, status) ~ trt + celltype + karno, data = df)
cox_m4 <- cox_mv # Full model
# Compare AIC
data.frame(
Model = c("Treatment only",
"Treatment + Cell Type",
"Treatment + Cell Type + Karno",
"Full Model"),
AIC = c(AIC(cox_m1), AIC(cox_m2), AIC(cox_m3), AIC(cox_m4)),
Variables = c(1, 2, 3, 7)
) %>%
arrange(AIC) %>%
kable(caption = "Model Comparison using AIC") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Model Comparison using AIC
|
Model
|
AIC
|
Variables
|
|
Treatment + Cell Type + Karno
|
959.8290
|
3
|
|
Full Model
|
964.7942
|
7
|
|
Treatment + Cell Type
|
993.0407
|
2
|
|
Treatment only
|
1012.8885
|
1
|
Forest Plot
# Forest plot for multivariate model
ggforest(
cox_mv,
data = df,
main = "Hazard Ratios with 95% Confidence Intervals",
fontsize = 0.8
)

Testing Proportional
Hazards Assumption
Schoenfeld Residuals
Test
# Test proportional hazards assumption
ph_test <- cox.zph(cox_mv)
print(ph_test)
## chisq df p
## trt 0.2644 1 0.60712
## celltype 15.2274 3 0.00163
## karno 12.9352 1 0.00032
## diagtime 0.0129 1 0.90961
## age 1.8288 1 0.17627
## prior 2.1656 1 0.14113
## GLOBAL 34.5525 8 3.2e-05
# Interpretation table
data.frame(
Variable = rownames(ph_test$table),
ChiSq = ph_test$table[, "chisq"],
P_value = ph_test$table[, "p"],
Conclusion = ifelse(
ph_test$table[, "p"] > 0.05,
"Assumption Met",
"Assumption Violated"
)
) %>%
kable(caption = "Proportional Hazards Assumption Test") %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
row_spec(which(ph_test$table[, "p"] <= 0.05),
background = "#ffcccc")
Proportional Hazards Assumption Test
|
|
Variable
|
ChiSq
|
P_value
|
Conclusion
|
|
trt
|
trt
|
0.2643888
|
0.6071209
|
Assumption Met
|
|
celltype
|
celltype
|
15.2273639
|
0.0016323
|
Assumption Violated
|
|
karno
|
karno
|
12.9351978
|
0.0003225
|
Assumption Violated
|
|
diagtime
|
diagtime
|
0.0128894
|
0.9096093
|
Assumption Met
|
|
age
|
age
|
1.8288266
|
0.1762662
|
Assumption Met
|
|
prior
|
prior
|
2.1655663
|
0.1411326
|
Assumption Met
|
|
GLOBAL
|
GLOBAL
|
34.5524911
|
0.0000323
|
Assumption Violated
|
Schoenfeld Residuals
Plots
# Plot Schoenfeld residuals
par(mfrow = c(3, 3))
plot(ph_test)
par(mfrow = c(1, 1))

Alternative: Log-Log
Survival Plots
# Log-log survival plot for treatment
ggsurvplot(
fit_trt,
data = df,
fun = "cloglog",
ggtheme = theme_minimal(),
palette = c("#E7B800", "#2E9FDF"),
title = "Log-Log Survival Plot by Treatment",
xlab = "Log(Time)",
ylab = "Log(-Log(Survival))",
legend.title = "Treatment"
)

# Log-log survival plot for cell type
ggsurvplot(
fit_cell,
data = df,
fun = "cloglog",
ggtheme = theme_minimal(),
title = "Log-Log Survival Plot by Cell Type",
xlab = "Log(Time)",
ylab = "Log(-Log(Survival))",
legend.title = "Cell Type"
)

Model Diagnostics
Cox-Snell
Residuals
# Calculate Cox-Snell residuals
cox_snell <- df$status - residuals(cox_mv, type = "martingale")
# Fit survival to Cox-Snell residuals
cs_fit <- survfit(Surv(cox_snell, df$status) ~ 1)
# Plot
plot(
cs_fit$time,
-log(cs_fit$surv),
type = "s",
xlab = "Cox-Snell Residuals",
ylab = "Cumulative Hazard",
main = "Cox-Snell Residual Plot\n(Should follow 45-degree line)"
)
abline(0, 1, col = "red", lty = 2)
legend("bottomright",
legend = "45-degree reference line",
col = "red",
lty = 2)

Martingale
Residuals
# Calculate martingale residuals
mart_resid <- residuals(cox_mv, type = "martingale")
# Plot against continuous variables
par(mfrow = c(2, 2))
plot(df$karno, mart_resid,
xlab = "Karnofsky Score",
ylab = "Martingale Residuals",
main = "Martingale Residuals vs Karno")
abline(h = 0, col = "red", lty = 2)
lines(lowess(df$karno, mart_resid), col = "blue", lwd = 2)
plot(df$age, mart_resid,
xlab = "Age",
ylab = "Martingale Residuals",
main = "Martingale Residuals vs Age")
abline(h = 0, col = "red", lty = 2)
lines(lowess(df$age, mart_resid), col = "blue", lwd = 2)
plot(df$diagtime, mart_resid,
xlab = "Diagnosis Time",
ylab = "Martingale Residuals",
main = "Martingale Residuals vs Diagtime")
abline(h = 0, col = "red", lty = 2)
lines(lowess(df$diagtime, mart_resid), col = "blue", lwd = 2)
par(mfrow = c(1, 1))

Deviance
Residuals
# Calculate deviance residuals
dev_resid <- residuals(cox_mv, type = "deviance")
# Plot deviance residuals
plot(
predict(cox_mv),
dev_resid,
xlab = "Linear Predictor",
ylab = "Deviance Residuals",
main = "Deviance Residuals vs Linear Predictor"
)
abline(h = 0, col = "red", lty = 2)
lines(lowess(predict(cox_mv), dev_resid), col = "blue", lwd = 2)
# Identify potential outliers
outliers <- which(abs(dev_resid) > 3)
if(length(outliers) > 0) {
text(
predict(cox_mv)[outliers],
dev_resid[outliers],
labels = outliers,
pos = 3,
col = "red"
)
}

Influential
Observations (DFBETAs)
# Calculate DFBETAs
dfbeta_vals <- residuals(cox_mv, type = "dfbeta")
# Check column names
colnames(dfbeta_vals)
## NULL
# Plot for key variables (use actual column names)
par(mfrow = c(2, 2))
# Treatment effect
trt_col <- grep("trt", colnames(dfbeta_vals), ignore.case = TRUE)[1]
if(!is.na(trt_col)) {
plot(dfbeta_vals[, trt_col],
ylab = "DFBETAs for Treatment",
main = "Influence on Treatment Effect",
type = "h")
abline(h = 0, col = "red", lty = 2)
}
# Karno effect
karno_col <- grep("karno", colnames(dfbeta_vals), ignore.case = TRUE)[1]
if(!is.na(karno_col)) {
plot(dfbeta_vals[, karno_col],
ylab = "DFBETAs for Karno",
main = "Influence on Karno Effect",
type = "h")
abline(h = 0, col = "red", lty = 2)
}
# Age effect
age_col <- grep("age", colnames(dfbeta_vals), ignore.case = TRUE)[1]
if(!is.na(age_col)) {
plot(dfbeta_vals[, age_col],
ylab = "DFBETAs for Age",
main = "Influence on Age Effect",
type = "h")
abline(h = 0, col = "red", lty = 2)
}
par(mfrow = c(1, 1))
Stratified Cox
Model
If proportional hazards assumption is violated for a variable, we can
stratify by it.
# Example: Stratify by cell type if assumption violated
cox_strat <- coxph(
Surv(time, status) ~ trt + strata(celltype) + karno + diagtime + age + prior,
data = df
)
summary(cox_strat)
## Call:
## coxph(formula = Surv(time, status) ~ trt + strata(celltype) +
## karno + diagtime + age + prior, data = df)
##
## n= 137, number of events= 128
##
## coef exp(coef) se(coef) z Pr(>|z|)
## trtTest 0.285902 1.330962 0.210009 1.361 0.173
## karno -0.038262 0.962461 0.005932 -6.450 1.12e-10 ***
## diagtime -0.003439 0.996567 0.009075 -0.379 0.705
## age -0.011821 0.988249 0.009846 -1.201 0.230
## priorYes 0.169069 1.184201 0.235667 0.717 0.473
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## trtTest 1.3310 0.7513 0.8819 2.0087
## karno 0.9625 1.0390 0.9513 0.9737
## diagtime 0.9966 1.0034 0.9790 1.0145
## age 0.9882 1.0119 0.9694 1.0075
## priorYes 1.1842 0.8445 0.7461 1.8794
##
## Concordance= 0.704 (se = 0.027 )
## Likelihood ratio test= 44.27 on 5 df, p=2e-08
## Wald test = 43.63 on 5 df, p=3e-08
## Score (logrank) test = 47.01 on 5 df, p=6e-09
# Compare with unstratified model
data.frame(
Model = c("Unstratified", "Stratified by Cell Type"),
AIC = c(AIC(cox_mv), AIC(cox_strat)),
Concordance = c(
summary(cox_mv)$concordance[1],
summary(cox_strat)$concordance[1]
)
) %>%
kable(caption = "Comparison of Stratified vs Unstratified Models") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Comparison of Stratified vs Unstratified Models
|
Model
|
AIC
|
Concordance
|
|
Unstratified
|
964.7942
|
0.7360291
|
|
Stratified by Cell Type
|
643.2026
|
0.7038814
|
Parametric Survival
Models
Exponential
Model
# Fit exponential model
exp_model <- survreg(
Surv(time, status) ~ trt + celltype + karno + diagtime + age + prior,
data = df,
dist = "exponential"
)
summary(exp_model)
##
## Call:
## survreg(formula = Surv(time, status) ~ trt + celltype + karno +
## diagtime + age + prior, data = df, dist = "exponential")
## Value Std. Error z p
## (Intercept) 3.188611 0.704033 4.53 5.9e-06
## trtTest -0.219565 0.198634 -1.11 0.2690
## celltypesmallcell -0.820245 0.262111 -3.13 0.0018
## celltypeadeno -1.113121 0.275825 -4.04 5.4e-05
## celltypelarge -0.377220 0.272626 -1.38 0.1665
## karno 0.030624 0.005108 6.00 2.0e-09
## diagtime -0.000297 0.008970 -0.03 0.9736
## age 0.006108 0.009161 0.67 0.5049
## priorYes -0.049482 0.226871 -0.22 0.8273
##
## Scale fixed at 1
##
## Exponential distribution
## Loglik(model)= -716.2 Loglik(intercept only)= -751.2
## Chisq= 70.12 on 8 degrees of freedom, p= 4.6e-12
## Number of Newton-Raphson Iterations: 5
## n= 137
Weibull Model
# Fit Weibull model
weibull_model <- survreg(
Surv(time, status) ~ trt + celltype + karno + diagtime + age + prior,
data = df,
dist = "weibull"
)
summary(weibull_model)
##
## Call:
## survreg(formula = Surv(time, status) ~ trt + celltype + karno +
## diagtime + age + prior, data = df, dist = "weibull")
## Value Std. Error z p
## (Intercept) 3.262014 0.662531 4.92 8.5e-07
## trtTest -0.228523 0.186844 -1.22 0.2213
## celltypesmallcell -0.826185 0.246312 -3.35 0.0008
## celltypeadeno -1.132725 0.257598 -4.40 1.1e-05
## celltypelarge -0.397681 0.254749 -1.56 0.1185
## karno 0.030068 0.004828 6.23 4.7e-10
## diagtime -0.000469 0.008361 -0.06 0.9553
## age 0.006099 0.008553 0.71 0.4758
## priorYes -0.043898 0.212279 -0.21 0.8362
## Log(scale) -0.074599 0.066311 -1.12 0.2606
##
## Scale= 0.928
##
## Weibull distribution
## Loglik(model)= -715.6 Loglik(intercept only)= -748.1
## Chisq= 65.08 on 8 degrees of freedom, p= 4.7e-11
## Number of Newton-Raphson Iterations: 6
## n= 137
# Compare AIC
data.frame(
Model = c("Exponential", "Weibull", "Cox PH"),
AIC = c(AIC(exp_model), AIC(weibull_model), AIC(cox_mv))
) %>%
arrange(AIC) %>%
kable(caption = "Comparison of Parametric Models") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Comparison of Parametric Models
|
Model
|
AIC
|
|
Cox PH
|
964.7942
|
|
Exponential
|
1450.3182
|
|
Weibull
|
1451.1027
|
Risk Score and
Predicted Survival
Calculate Risk
Score
# Calculate risk score (linear predictor)
df$risk_score <- predict(cox_mv, type = "lp")
# Categorize into risk groups
df$risk_group <- cut(
df$risk_score,
breaks = quantile(df$risk_score, probs = c(0, 0.33, 0.67, 1)),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE
)
# Summary by risk group
table(df$risk_group) %>%
kable(col.names = c("Risk Group", "N"),
caption = "Distribution of Risk Groups") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Distribution of Risk Groups
|
Risk Group
|
N
|
|
Low
|
45
|
|
Medium
|
47
|
|
High
|
45
|
Survival by Risk
Group
# Fit survival by risk group
fit_risk <- survfit(Surv(time, status) ~ risk_group, data = df)
# Plot
ggsurvplot(
fit_risk,
data = df,
pval = TRUE,
conf.int = TRUE,
risk.table = TRUE,
risk.table.height = 0.3,
ggtheme = theme_minimal(),
palette = c("green", "orange", "red"),
title = "Survival by Predicted Risk Group",
xlab = "Time (days)",
ylab = "Survival Probability",
break.time.by = 100,
legend.title = "Risk Group",
legend.labs = c("Low Risk", "Medium Risk", "High Risk")
)

Summary and
Conclusions
Key Findings
# Get median survival value
median_val <- survfit(Surv(time, status) ~ 1, data = df)
# Create summary of key results
summary_df <- data.frame(
Analysis = c(
"Median Survival",
"Treatment Effect (HR)",
"Cell Type Effect",
"Karno Score Effect (per unit)",
"Age Effect (per year)",
"Best Model by AIC",
"Proportional Hazards Assumption",
"C-Index (Full Model)"
),
Result = c(
paste0(median_val$median, " days"),
sprintf("%.2f (p = %.3f)",
exp(coef(cox_mv)["trtTest"]),
summary(cox_mv)$coefficients["trtTest", "Pr(>|z|)"]),
"Significant (p < 0.001)",
sprintf("%.3f (p < 0.001)", exp(coef(cox_mv)["karno"])),
sprintf("%.3f (p = %.3f)",
exp(coef(cox_mv)["age"]),
summary(cox_mv)$coefficients["age", "Pr(>|z|)"]),
"Full Model",
ifelse(ph_test$table["GLOBAL", "p"] > 0.05, "Met", "Review Required"),
sprintf("%.3f", summary(cox_mv)$concordance[1])
)
)
summary_df %>%
kable(caption = "Summary of Key Results") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Summary of Key Results
|
Analysis
|
Result
|
|
Median Survival
|
days
|
|
Treatment Effect (HR)
|
1.34 (p = 0.156)
|
|
Cell Type Effect
|
Significant (p < 0.001)
|
|
Karno Score Effect (per unit)
|
0.968 (p < 0.001)
|
|
Age Effect (per year)
|
0.991 (p = 0.349)
|
|
Best Model by AIC
|
Full Model
|
|
Proportional Hazards Assumption
|
Review Required
|
|
C-Index (Full Model)
|
0.736
|