# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 Day 4 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Ovarian Cancer Survival Data Example:
# A randomized clinical trial was performed to
# compare two treatments of patients with advanced
# ovarian carcinoma (stages IIIB and IV) using
# either cyclophosphamide alone (1 g/m2) or
# cyclophosphamide (500 mg/m2) plus adriamycin
# (40 mg/m2) by iv injection every 3 weeks.
# Data Set: ovarian2
# Data Dictionary:
# (1) futime: time to death or censoring post-randomization (days)
# (2) fustat: censoring status (1 = died; 0 = censored)
# (3) age: in years
# (4) resid.ds: residual disease present (1=no; 2=yes)
# (5) rx: treatment group (1=cyclophosphamide alone; 2=cyclophosphamide+adriamycin)
# (6) ecog.ps: ECOG performance status (1 is better;
# see reference: http://ecog.org/general/perf_stat.html)
## Download and load the data file = ovarian
load(url("http://www.duke.edu/~sgrambow/crp241data/ovarian2.RData"))
# Examine Data
# examine structure of dataset and variables
str(ovarian2)
## 'data.frame': 26 obs. of 6 variables:
## $ futime : num 59 115 156 421 431 448 464 475 477 563 ...
## $ fustat : num 1 1 1 0 1 0 1 1 0 1 ...
## $ age : num 72.3 74.5 66.5 53.4 50.3 ...
## $ resid.ds: num 2 2 2 2 2 1 2 2 2 1 ...
## $ rx : num 0 0 0 1 0 0 1 1 0 1 ...
## $ ecog.ps : num 1 1 2 1 1 2 2 2 1 2 ...
# look at summary statistics
summary(ovarian2)
## futime fustat age resid.ds
## Min. : 59.0 Min. :0.0000 Min. :38.89 Min. :1.000
## 1st Qu.: 368.0 1st Qu.:0.0000 1st Qu.:50.17 1st Qu.:1.000
## Median : 476.0 Median :0.0000 Median :56.85 Median :2.000
## Mean : 599.5 Mean :0.4615 Mean :56.17 Mean :1.577
## 3rd Qu.: 794.8 3rd Qu.:1.0000 3rd Qu.:62.38 3rd Qu.:2.000
## Max. :1227.0 Max. :1.0000 Max. :74.50 Max. :2.000
## rx ecog.ps
## Min. :0.0 Min. :1.000
## 1st Qu.:0.0 1st Qu.:1.000
## Median :0.5 Median :1.000
## Mean :0.5 Mean :1.462
## 3rd Qu.:1.0 3rd Qu.:2.000
## Max. :1.0 Max. :2.000
# visualize distribution of survival times
# use jitter to avoid stacking points
stripchart(ovarian2$futime,
main="time to death or censoring post-randomization (days)",
xlab="days",
ylab="time to death-censoring ",
method="jitter",
col="blue",
pch=19
)
# Install and load package for Survival Analysis
# - Recall: You only need to install the package ONCE, but you must load the
# package EVERY time you want to use it!
# install.packages('survival')
# we need the survival package for performing
# survival analyses
library(survival)

# install.packages('survminer')
# we need this package for the ggsurvplot
# command used below to create the KM plot
library(survminer)
## Warning: package 'survminer' was built under R version 4.0.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.3
## Loading required package: ggpubr
# Example -- Ovarian Cancer Data
# Question 1
# Obtain the table of Kaplan Meier survival
# estimates for the observed data set.
# Obtain survival estimates using survfit() function
fit.km <- survfit(Surv(futime, fustat)~1, data=ovarian2)
# - Print table of survival estimates
# -- Default output only shows UNIQUE EVENT times
summary(fit.km)
## Call: survfit(formula = Surv(futime, fustat) ~ 1, data = ovarian2)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 59 26 1 0.962 0.0377 0.890 1.000
## 115 25 1 0.923 0.0523 0.826 1.000
## 156 24 1 0.885 0.0627 0.770 1.000
## 268 23 1 0.846 0.0708 0.718 0.997
## 329 22 1 0.808 0.0773 0.670 0.974
## 353 21 1 0.769 0.0826 0.623 0.949
## 365 20 1 0.731 0.0870 0.579 0.923
## 431 17 1 0.688 0.0919 0.529 0.894
## 464 15 1 0.642 0.0965 0.478 0.862
## 475 14 1 0.596 0.0999 0.429 0.828
## 563 12 1 0.546 0.1032 0.377 0.791
## 638 11 1 0.497 0.1051 0.328 0.752
# -- Have to ask R to show censored event times
summary(fit.km,censor=TRUE)
## Call: survfit(formula = Surv(futime, fustat) ~ 1, data = ovarian2)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 59 26 1 0.962 0.0377 0.890 1.000
## 115 25 1 0.923 0.0523 0.826 1.000
## 156 24 1 0.885 0.0627 0.770 1.000
## 268 23 1 0.846 0.0708 0.718 0.997
## 329 22 1 0.808 0.0773 0.670 0.974
## 353 21 1 0.769 0.0826 0.623 0.949
## 365 20 1 0.731 0.0870 0.579 0.923
## 377 19 0 0.731 0.0870 0.579 0.923
## 421 18 0 0.731 0.0870 0.579 0.923
## 431 17 1 0.688 0.0919 0.529 0.894
## 448 16 0 0.688 0.0919 0.529 0.894
## 464 15 1 0.642 0.0965 0.478 0.862
## 475 14 1 0.596 0.0999 0.429 0.828
## 477 13 0 0.596 0.0999 0.429 0.828
## 563 12 1 0.546 0.1032 0.377 0.791
## 638 11 1 0.497 0.1051 0.328 0.752
## 744 10 0 0.497 0.1051 0.328 0.752
## 769 9 0 0.497 0.1051 0.328 0.752
## 770 8 0 0.497 0.1051 0.328 0.752
## 803 7 0 0.497 0.1051 0.328 0.752
## 855 6 0 0.497 0.1051 0.328 0.752
## 1040 5 0 0.497 0.1051 0.328 0.752
## 1106 4 0 0.497 0.1051 0.328 0.752
## 1129 3 0 0.497 0.1051 0.328 0.752
## 1206 2 0 0.497 0.1051 0.328 0.752
## 1227 1 0 0.497 0.1051 0.328 0.752
# generates median and 95% CI for median
summary(fit.km)$table
## records n.max n.start events *rmean *se(rmean) median
## 26.00000 26.00000 26.00000 12.00000 793.99930 91.51788 638.00000
## 0.95LCL 0.95UCL
## 464.00000 NA
# Question 2
# Create the Kaplan Meier plot that estimates the true survival curve.
# see this link for info on plot options:
# https://cran.r-project.org/web/packages/survminer/vignettes/Informative_Survival_Plots.html
# https://www.r-bloggers.com/survival-analysis-basics/
# basic KM plot
plot(fit.km)
# better KM plot
ggsurvplot(fit.km, data = ovarian2,
risk.table = TRUE,
surv.median.line = "hv")


# Question 3 & 4
# Q3 - Using the output from Question 1 and 2, what is the estimated
# probability that a patient with advanced ovarian carcinoma will
# survive at least 2 years?
# Q4 - Using the output from Question 1 and 2, what is the
# estimated probability ovarian carcinoma will die before 6 months?
# - futime is recorded in days
# - 2 years = 365*2 = 730 days
# - 6 months = 30*6 = 180 days
summary(fit.km)
## Call: survfit(formula = Surv(futime, fustat) ~ 1, data = ovarian2)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 59 26 1 0.962 0.0377 0.890 1.000
## 115 25 1 0.923 0.0523 0.826 1.000
## 156 24 1 0.885 0.0627 0.770 1.000
## 268 23 1 0.846 0.0708 0.718 0.997
## 329 22 1 0.808 0.0773 0.670 0.974
## 353 21 1 0.769 0.0826 0.623 0.949
## 365 20 1 0.731 0.0870 0.579 0.923
## 431 17 1 0.688 0.0919 0.529 0.894
## 464 15 1 0.642 0.0965 0.478 0.862
## 475 14 1 0.596 0.0999 0.429 0.828
## 563 12 1 0.546 0.1032 0.377 0.791
## 638 11 1 0.497 0.1051 0.328 0.752
# - Output from summary(fit)
# > summary(fit)
#
# time n.risk n.event survival std.err
# 59 26 1 0.962 0.0377
# 115 25 1 0.923 0.0523
# 156 24 1 0.885 0.0627
# 268 23 1 0.846 0.0708
# 329 22 1 0.808 0.0773
# 353 21 1 0.769 0.0826
# 365 20 1 0.731 0.0870
# 431 17 1 0.688 0.0919
# 464 15 1 0.642 0.0965
# 475 14 1 0.596 0.0999
# 563 12 1 0.546 0.1032
# 638 11 1 0.497 0.1051
# - Q3 Answer:
# -- P(Y > 730 days) = P(Y > 638 days) = 0.497
# - Q4 Answer:
# -- P(Y < 180 days) = 1 - P(Y > 180 days) = 1 - 0.885 = 0.115
# End of Program
# Making KM plots look pretty --
# here are ggsurvplot options
# ggsurvplot(
# fit, # survfit object with calculated statistics.
# data = fit, # data used to fit survival curves.
# risk.table = TRUE, # show risk table.
# pval = TRUE, # show p-value of log-rank test.
# conf.int = TRUE, # show confidence intervals for
# # point estimaes of survival curves.
# xlim = c(0,2000), # present narrower X axis, but not affect survival estimates.
# break.time.by = 500, # break X axis in time intervals by 500.
# ggtheme = theme_minimal(), # customize plot and risk table with a theme.
# risk.table.y.text.col = T, # colour risk table text annotations.
# risk.table.y.text = FALSE # show bars instead of names in text annotations
# # in legend of risk table
# risk.table.col = "strata", # Change risk table color by groups
# linetype = "strata", # Change line type by groups
# surv.median.line = "hv", # Specify median survival