Introduction:
https://www.r-bloggers.com/survival-analysis-with-r-3/ This blog post is about two different treatments for lung cancer and the survival rates of those patients over time. Plots start rather basic, beginning with a Kaplan-Meier plot that estimates of the probability of survival over time in general for all patient and treatment types (Plot 1). They then go on to split the survival curve up by treatment (Plot 2). After they change the graph to show survival rate by age (Plot 3). There are many more covariates in this data set and they combine them into one plot that shows how how the effects of the covariates change over time, using a Cox Proportional Hazards Model(Plot 4). This data is then merged into one graph to represent patient Survival rate by days and is factored by the variables from the last plot using a Random Forests Model. The black line in this new plot represents the average (Plot 5). The last graph that they make combines all three types of survival curves created previously to give an “eyeball comparison” between each type of survival cure created (Plot 6).
Analysis:
library(survival)
library(ranger)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggfortify)
#------------
data(veteran)
head(veteran)
## trt celltype time status karno diagtime age prior
## 1 1 squamous 72 1 60 7 69 0
## 2 1 squamous 411 1 70 5 64 10
## 3 1 squamous 228 1 60 3 38 0
## 4 1 squamous 126 1 60 9 63 10
## 5 1 squamous 118 1 70 11 65 10
## 6 1 squamous 10 1 20 5 49 0
# Kaplan Meier Survival Curve
km <- with(veteran, Surv(time, status))
head(km,80)
## [1] 72 411 228 126 118 10 82 110 314 100+ 42 8 144 25+
## [15] 11 30 384 4 54 13 123+ 97+ 153 59 117 16 151 22
## [29] 56 21 18 139 20 31 52 287 18 51 122 27 54 7
## [43] 63 392 10 8 92 35 117 132 12 162 3 95 177 162
## [57] 216 553 278 12 260 200 156 182+ 143 105 103 250 100 999
## [71] 112 87+ 231+ 242 991 111 1 587 389 33
km_fit <- survfit(Surv(time, status) ~ 1, data=veteran)
summary(km_fit, times = c(1,30,60,90*(1:10)))
## Call: survfit(formula = Surv(time, status) ~ 1, data = veteran)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 137 2 0.985 0.0102 0.96552 1.0000
## 30 97 39 0.700 0.0392 0.62774 0.7816
## 60 73 22 0.538 0.0427 0.46070 0.6288
## 90 62 10 0.464 0.0428 0.38731 0.5560
## 180 27 30 0.222 0.0369 0.16066 0.3079
## 270 16 9 0.144 0.0319 0.09338 0.2223
## 360 10 6 0.090 0.0265 0.05061 0.1602
## 450 5 5 0.045 0.0194 0.01931 0.1049
## 540 4 1 0.036 0.0175 0.01389 0.0934
## 630 2 2 0.018 0.0126 0.00459 0.0707
## 720 2 0 0.018 0.0126 0.00459 0.0707
## 810 2 0 0.018 0.0126 0.00459 0.0707
## 900 2 0 0.018 0.0126 0.00459 0.0707
## Call: survfit(formula = Surv(time, status) ~ 1, data = veteran)
#plot(km_fit, xlab="Days", main = 'Kaplan Meyer Plot') #base graphics is always ready
autoplot(km_fit)
km_trt_fit <- survfit(Surv(time, status) ~ trt, data=veteran)
autoplot(km_trt_fit)
vet <- mutate(veteran, AG = ifelse((age < 60), "LT60", "OV60"),
AG = factor(AG),
trt = factor(trt,labels=c("standard","test")),
prior = factor(prior,labels=c("N0","Yes")))
km_AG_fit <- survfit(Surv(time, status) ~ AG, data=vet)
autoplot(km_AG_fit)
# Fit Cox Model
cox <- coxph(Surv(time, status) ~ trt + celltype + karno + diagtime + age + prior , data = vet)
summary(cox)
## Call:
## coxph(formula = Surv(time, status) ~ trt + celltype + karno +
## diagtime + age + prior, data = vet)
##
## n= 137, number of events= 128
##
## coef exp(coef) se(coef) z Pr(>|z|)
## trttest 2.946e-01 1.343e+00 2.075e-01 1.419 0.15577
## celltypesmallcell 8.616e-01 2.367e+00 2.753e-01 3.130 0.00175 **
## celltypeadeno 1.196e+00 3.307e+00 3.009e-01 3.975 7.05e-05 ***
## celltypelarge 4.013e-01 1.494e+00 2.827e-01 1.420 0.15574
## karno -3.282e-02 9.677e-01 5.508e-03 -5.958 2.55e-09 ***
## diagtime 8.132e-05 1.000e+00 9.136e-03 0.009 0.99290
## age -8.706e-03 9.913e-01 9.300e-03 -0.936 0.34920
## priorYes 7.159e-02 1.074e+00 2.323e-01 0.308 0.75794
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## trttest 1.3426 0.7448 0.8939 2.0166
## celltypesmallcell 2.3669 0.4225 1.3799 4.0597
## celltypeadeno 3.3071 0.3024 1.8336 5.9647
## celltypelarge 1.4938 0.6695 0.8583 2.5996
## karno 0.9677 1.0334 0.9573 0.9782
## diagtime 1.0001 0.9999 0.9823 1.0182
## age 0.9913 1.0087 0.9734 1.0096
## priorYes 1.0742 0.9309 0.6813 1.6937
##
## Concordance= 0.736 (se = 0.03 )
## Rsquare= 0.364 (max possible= 0.999 )
## Likelihood ratio test= 62.1 on 8 df, p=1.799e-10
## Wald test = 62.37 on 8 df, p=1.596e-10
## Score (logrank) test = 66.74 on 8 df, p=2.186e-11
cox_fit <- survfit(cox)
#plot(cox_fit, main = "cph model", xlab="Days")
autoplot(cox_fit)
aa_fit <-aareg(Surv(time, status) ~ trt + celltype +
karno + diagtime + age + prior ,
data = vet)
aa_fit
## Call:
## aareg(formula = Surv(time, status) ~ trt + celltype + karno +
## diagtime + age + prior, data = vet)
##
## n= 137
## 75 out of 97 unique event times used
##
## slope coef se(coef) z p
## Intercept 0.083400 3.81e-02 1.09e-02 3.490 4.79e-04
## trttest 0.006730 2.49e-03 2.58e-03 0.967 3.34e-01
## celltypesmallcell 0.015000 7.30e-03 3.38e-03 2.160 3.09e-02
## celltypeadeno 0.018400 1.03e-02 4.20e-03 2.450 1.42e-02
## celltypelarge -0.001090 -6.21e-04 2.71e-03 -0.229 8.19e-01
## karno -0.001180 -4.37e-04 8.77e-05 -4.980 6.28e-07
## diagtime -0.000243 -4.92e-05 1.64e-04 -0.300 7.65e-01
## age -0.000246 -6.27e-05 1.28e-04 -0.491 6.23e-01
## priorYes 0.003300 1.54e-03 2.86e-03 0.539 5.90e-01
##
## Chisq=41.62 on 8 df, p=1.6e-06; test weights=aalen
#summary(aa_fit) # provides a more complete summary of results
autoplot(aa_fit)
# ranger model
r_fit <- ranger(Surv(time, status) ~ trt + celltype +
karno + diagtime + age + prior,
data = vet,
mtry = 4,
importance = "permutation",
splitrule = "extratrees",
verbose = TRUE)
# Average the survival models
death_times <- r_fit$unique.death.times
surv_prob <- data.frame(r_fit$survival)
avg_prob <- sapply(surv_prob,mean)
# Plot the survival models for each patient
plot(r_fit$unique.death.times,r_fit$survival[1,],
type = "l",
ylim = c(0,1),
col = "red",
xlab = "Days",
ylab = "survival",
main = "Patient Survival Curves")
#
cols <- colors()
for (n in sample(c(2:dim(vet)[1]), 20)){
lines(r_fit$unique.death.times, r_fit$survival[n,], type = "l", col = cols[n])
}
lines(death_times, avg_prob, lwd = 2)
legend(500, 0.7, legend = c('Average = black'))
vi <- data.frame(sort(round(r_fit$variable.importance, 4), decreasing = TRUE))
names(vi) <- "importance"
head(vi)
## importance
## karno 0.0679
## celltype 0.0304
## diagtime 0.0013
## trt 0.0010
## prior -0.0021
## age -0.0035
cat("Prediction Error = 1 - Harrell's c-index = ", r_fit$prediction.error)
## Prediction Error = 1 - Harrell's c-index = 0.3071331
## Prediction Error = 1 - Harrell's c-index = 0.308269
# Set up for ggplot
kmi <- rep("KM",length(km_fit$time))
km_df <- data.frame(km_fit$time,km_fit$surv,kmi)
names(km_df) <- c("Time","Surv","Model")
coxi <- rep("Cox",length(cox_fit$time))
cox_df <- data.frame(cox_fit$time,cox_fit$surv,coxi)
names(cox_df) <- c("Time","Surv","Model")
rfi <- rep("RF",length(r_fit$unique.death.times))
rf_df <- data.frame(r_fit$unique.death.times,avg_prob,rfi)
names(rf_df) <- c("Time","Surv","Model")
plot_df <- rbind(km_df,cox_df,rf_df)
p <- ggplot(plot_df, aes(x = Time, y = Surv, color = Model))
p + geom_line()
Follow Up:
We decided to make another graph from the data that was given to us. It is a scatter plot of Survival Time as a function of Age, categorized by treatment type. and the colors and different numbers represent the two different types of treatments. The vertical line represents the median age, and the horizontal lines show the average number of days each treatment group survived after with the treatment.
library(ggplot2)
p = ggplot(data = veteran, aes(x=age, y=time, color=factor(trt,labels =
c("Treatment 1",
"Treatment 2"))))
p + geom_text(aes(label = veteran$trt)) +
geom_hline(aes(mean(veteran$time)),
yintercept = mean(veteran$time[veteran$trt == 1]), color = "coral") +
geom_hline(aes(mean(veteran$time)),
yintercept = mean(veteran$time[veteran$trt == 2]), color = "cyan") +
geom_vline(aes(median(veteran$age)),
xintercept = median(veteran$age[veteran$trt == 1]),color = "coral") +
geom_vline(aes(median(veteran$age)),
xintercept = median(veteran$age[veteran$trt == 2]),color = "cyan") +
ggtitle("Survival Time by Age and Treatment Type") +
labs(color = "Treatment Type") + xlab("Age (in years)") + ylab("Time (in days)")
mean(veteran$time[veteran$trt == 1]) #about 115.145 days
## [1] 115.1449
mean(veteran$time[veteran$trt == 2]) #about 128.206 days
## [1] 128.2059
median(veteran$age[veteran$trt == 1])#62 years old
## [1] 62
median(veteran$age[veteran$trt == 2])#also 62 years old
## [1] 62
Discussion:
Next time we think the best course of action would be to stick with Rcode that is in our realm of understanding. The first set of code we tried to dissect and redeploy was a little too advanced for our skill set at this time. If we chose code that we understood fully at first it would allow us more time to run other tests and make more useful adjustments and find new insights quicker.