These examples are reproduction of Chapter 4 of the book.
## Load foreign package
library(foreign)
## Load leukemia dataset
leuk <- read.dta("http://www.sph.emory.edu/~dkleinb/surv2datasets/anderson.dta")
## Load survival package
library(survival)
## Create survival vector for fish dataset
leuk$SurvObj <- with(leuk, Surv(survt, status))
The rms package has an extended version of survival plot. The loglog option makes the y-axis log(-log(Survival)), and the logt option makes the x-axis log(time).
library(rms)
## Treatment variable (rx)
survplot(fit = survfit(SurvObj ~ rx, leuk),
conf = c("none","bands","bars")[1],
xlab = "",
label.curves = list(keys = "lines"), # legend instead of direct label
loglog = TRUE, # log(-log Survival) plot
logt = TRUE, # log time
)
## categorical logWBC variable (lwbc3)
survplot(fit = survfit(SurvObj ~ lwbc3, leuk),
conf = c("none","bands","bars")[1],
xlab = "",
label.curves = list(keys = "lines"), # legend instead of direct label
loglog = TRUE, # log(-log Survival) plot
logt = TRUE, # log time
)
## Sex variable (sex)
survplot(fit = survfit(SurvObj ~ sex, leuk),
conf = c("none","bands","bars")[1],
xlab = "",
label.curves = list(keys = "lines"), # legend instead of direct label
loglog = TRUE, # log(-log Survival) plot
logt = TRUE, # log time
)
The sex variable violates proportional hazards assumption.
Adjust for predictors already satisfying PH assumption, i.e., use adjusted log-log \( \hat{S} \) curves.
## Stratified by rx (variable of interest), adjusted for logwbc
fit <- survfit(coxph(SurvObj ~ strata(rx) + lwbc3, data = leuk))
survplot(fit = fit,
conf = c("none","bands","bars")[1],
xlab = "",
label.curves = list(keys = "lines"), # legend instead of direct label
loglog = TRUE, # log(-log Survival) plot
logt = TRUE, # log time
)
## Stratified by logwbc (variable of interest), adjusted for rx
fit <- survfit(coxph(SurvObj ~ strata(lwbc3) + rx, data = leuk))
survplot(fit = fit,
conf = c("none","bands","bars")[1],
xlab = "",
label.curves = list(keys = "lines"), # legend instead of direct label
loglog = TRUE, # log(-log Survival) plot
logt = TRUE, # log time
)
## Stratified by sex (variable of interest), adjusted for rx and logwbc
fit <- survfit(coxph(SurvObj ~ strata(sex) + rx + lwbc3, data = leuk))
survplot(fit = fit,
conf = c("none","bands","bars")[1],
xlab = "",
label.curves = list(keys = "lines"), # legend instead of direct label
loglog = TRUE, # log(-log Survival) plot
logt = TRUE, # log time
)
The sex variable violates proportional hazards assumption.
In this method, the observed curves created by Kaplan-Meier estimator are compared to expected curves created by Cox regression assuming proportional hazards assumption.
### rx variable
## Observed curves
survplot(fit = survfit(SurvObj ~ rx, leuk),
conf = c("none","bands","bars")[1],
xlab = "",
label.curves = list(keys = "lines"), # legend instead of direct label
lwd = 2
)
## Expected survival plots, using Cox regression with PH assumption
cox.fit <- coxph(SurvObj ~ rx, leuk) # Fit Cox regression (PH assumed)
newdat <- data.frame(rx = 0:1) # newdata to get expected curves
lines(survfit(cox.fit, newdata = newdat),
col = "red", lty = 1:2)
### sex variable
## Observed curves
survplot(fit = survfit(SurvObj ~ sex, leuk),
conf = c("none","bands","bars")[1],
xlab = "",
label.curves = list(keys = "lines"), # legend instead of direct label
lwd = 2
)
## Expected survival plots, using Cox regression with PH assumption
cox.fit <- coxph(SurvObj ~ sex, leuk) # Fit Cox regression (PH assumed)
newdat <- data.frame(sex = 0:1) # newdata to get expected curves
lines(survfit(cox.fit, newdata = newdat),
col = "red", lty = 1:2)
### logwbc variable
## Observed curves
survplot(fit = survfit(SurvObj ~ lwbc3, leuk),
conf = c("none","bands","bars")[1],
xlab = "",
label.curves = list(keys = "lines"), # legend instead of direct label
lwd = 2
)
## Expected survival plots (red lines, categorical logwbc method)
cox.fit <- coxph(SurvObj ~ lwbc3, leuk) # Fit Cox regression (PH assumed)
newdat <- data.frame(lwbc3 = 1:3)
lines(survfit(cox.fit, newdata = newdat),
col = "red", lty = 1:3)
## Expected survival plots (blue lines, continuous logwbc method)
cox.fit <- coxph(SurvObj ~ logwbc, leuk) # Fit Cox regression (PH assumed)
newdat <- data.frame(logwbc = c(1.71, 2.64, 3.83))
lines(survfit(cox.fit, newdata = newdat),
col = "blue", lty = 1:3)
The expected red lines are reasonably close to the observed thick lines for the rx variable, indicating the PH assumption is reasonable. For the sex variable, these lines are quite different, indicating departure from the PH assumption. For the logwbc variable, the expected curves created with continuous and categorical logwbc were not very different, and these were reasonably close to the observed black curves.
This approach can be more objective. There are several different methods. Basically, the Schoenfeld residuals and rank order of failure times are tested for correlation.
## Fit a model assuming PH for all variables
cox.fit <- coxph(SurvObj ~ rx + logwbc + sex, data = leuk)
## Use cox.zph. The survival times are transformed to ranks.
res.zph <- cox.zph(cox.fit, transform = c("km","rank","idenityt")[2])
## Print test results
res.zph
rho chisq p
rx -0.1075 0.384 0.5356
logwbc 0.0496 0.112 0.7383
sex -0.3811 4.361 0.0368
GLOBAL NA 4.473 0.2147
## scaled Schoenfeld residuals vs
## Plotting can be useful. A non-horizontal trend means changes in HR over time.
##
plot(res.zph)
The sex variable violates proportional hazard assumption (p=0.0435 by km, 0.0368 by rank).
If the interaction term between time function and a covariate of interest is not significant in the extended Cox model, there is no evidence for violation of PH specifided by the interaction term.
## Add id to indicate clusters
leuk$id <- as.numeric(rownames(leuk))
## Split a survival data set at specified times to form a counting process format
leuk.cp.format <- survSplit(data = leuk,
cut = c(7), # cut at time 7
end = "survt", # original survival time
event = "status", # event indicator
start = "start") # will be created. zero by default
## Somehow there are duplicated lines. Compeltely identical lines are deleted.
leuk.cp.format <- unique(leuk.cp.format)
## Recoding
leuk.cp.format <- within(leuk.cp.format, {
## Create new survival object
SurvObj <- Surv(start, survt, status)
## Create interval indicator
interval <- factor(start, levels = c(0,7), labels = c("First","Second"))
})
## Reordering
leuk.cp.format <- leuk.cp.format[with(leuk.cp.format, order(id, survt)),]
rownames(leuk.cp.format) <- NULL
head(leuk.cp.format, 15)
survt status sex logwbc rx lwbc3 SurvObj id start interval
1 7 0 1 1.45 0 1 (0, 7+] 1 0 First
2 35 0 1 1.45 0 1 (7,35+] 1 7 Second
3 7 0 1 1.47 0 1 (0, 7+] 2 0 First
4 34 0 1 1.47 0 1 (7,34+] 2 7 Second
5 7 0 1 2.20 0 1 (0, 7+] 3 0 First
6 32 0 1 2.20 0 1 (7,32+] 3 7 Second
7 7 0 1 2.53 0 2 (0, 7+] 4 0 First
8 32 0 1 2.53 0 2 (7,32+] 4 7 Second
9 7 0 1 1.78 0 1 (0, 7+] 5 0 First
10 25 0 1 1.78 0 1 (7,25+] 5 7 Second
11 7 0 1 2.57 0 2 (0, 7+] 6 0 First
12 23 1 1 2.57 0 2 (7,23] 6 7 Second
13 7 0 1 2.32 0 2 (0, 7+] 7 0 First
14 22 1 1 2.32 0 2 (7,22] 7 7 Second
15 7 0 1 2.01 0 1 (0, 7+] 8 0 First
## Fit extended Cox model with multiple interaction terms
## cluster(id) for robutst SE to account for within-cluster non-independence
res.extended.cox <- coxph(SurvObj ~ rx + logwbc + sex + rx:interval + logwbc:interval + sex:interval + cluster(id),
data = leuk.cp.format)
summary(res.extended.cox)
Call:
coxph(formula = SurvObj ~ rx + logwbc + sex + rx:interval + logwbc:interval +
sex:interval + cluster(id), data = leuk.cp.format)
n= 70, number of events= 30
coef exp(coef) se(coef) robust se z Pr(>|z|)
rx 1.008 2.740 0.657 0.517 1.95 0.05121 .
logwbc 1.674 5.335 0.443 0.498 3.36 0.00078 ***
sex 1.253 3.503 0.652 0.509 2.46 0.01384 *
rx:intervalSecond 0.297 1.346 0.954 0.893 0.33 0.73915
logwbc:intervalSecond -0.286 0.751 0.712 0.699 -0.41 0.68203
sex:intervalSecond -2.006 0.134 1.048 1.048 -1.91 0.05551 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
rx 2.740 0.365 0.9947 7.55
logwbc 5.335 0.187 2.0088 14.17
sex 3.503 0.286 1.2909 9.50
rx:intervalSecond 1.346 0.743 0.2341 7.74
logwbc:intervalSecond 0.751 1.331 0.1909 2.95
sex:intervalSecond 0.134 7.436 0.0173 1.05
Concordance= 0.867 (se = 0.062 )
Rsquare= 0.527 (max possible= 0.93 )
Likelihood ratio test= 52.4 on 6 df, p=0.00000000152
Wald test = 58.9 on 6 df, p=7.61e-11
Score (logrank) test = 58.1 on 6 df, p=1.11e-10, Robust = 29.9 p=0.000041
(Note: the likelihood ratio and score tests assume independence of
observations within a cluster, the Wald and robust score tests do not).
## Testing the sex variable only
## cluster(id) for robutst SE to account for within-cluster non-independence
res.extended.cox <- coxph(SurvObj ~ rx + logwbc + sex + sex:interval + cluster(id),
data = leuk.cp.format)
summary(res.extended.cox)
Call:
coxph(formula = SurvObj ~ rx + logwbc + sex + sex:interval +
cluster(id), data = leuk.cp.format)
n= 70, number of events= 30
coef exp(coef) se(coef) robust se z Pr(>|z|)
rx 1.139 3.123 0.481 0.341 3.34 0.00085 ***
logwbc 1.571 4.813 0.345 0.361 4.35 0.000014 ***
sex 1.277 3.586 0.650 0.511 2.50 0.01240 *
sex:intervalSecond -2.113 0.121 0.982 0.839 -2.52 0.01181 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
rx 3.123 0.320 1.5994 6.098
logwbc 4.813 0.208 2.3717 9.769
sex 3.586 0.279 1.3180 9.759
sex:intervalSecond 0.121 8.270 0.0233 0.626
Concordance= 0.861 (se = 0.062 )
Rsquare= 0.525 (max possible= 0.93 )
Likelihood ratio test= 52.2 on 4 df, p=1.26e-10
Wald test = 49.3 on 4 df, p=5.16e-10
Score (logrank) test = 56.6 on 4 df, p=1.51e-11, Robust = 28.7 p=0.0000089
(Note: the likelihood ratio and score tests assume independence of
observations within a cluster, the Wald and robust score tests do not).
In the extended Cox model with multiple interaction terms, the sex-interval interaction is close to statistical significance suggesting violation of the proportional hazards assumption. In the model with only one interaction term, the sex-interval interaction is statistically significant, confirming violation of the proportional hazards assumption by the sex variable.