# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Module 1 Day 4 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~

# Ovarian Cancer Survival Data Example: 
# A randomized clinical trial was performed to compare two treatments of patients with 
# advanced ovarian carcinoma (stages IIIB and IV) using either cyclophosphamide alone 
# (1 g/m2) or cyclophosphamide (500 mg/m2) plus adriamycin (40 mg/m2) by iv injection 
# every 3 weeks. 

# Data Set: ovarian

# Data Dictionary: 
# (1) futime:       time to death or censoring post-randomization (days)
# (2) fustat:       censoring status (1 = died; 0 = censored)
# (3) age:        in years
# (4) resid.ds:   residual disease present (1=no; 2=yes)
# (5) rx:           treatment group (1=cyclophosphamide alone; 2=cyclophosphamide+adriamycin)
# (6) ecog.ps:    ECOG performance status (1 is better;
#                 see reference: http://ecog.org/general/perf_stat.html)

## Download and load the data file = ovarian
download.file("http://www.duke.edu/~sgrambow/crp241data/ovarian2.RData",
              destfile = "ovarian2.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("ovarian2.RData")
# Install and load package for Survival Analysis 
# - Recall: You only need to install the package ONCE, but you must load the 
#           package EVERY time you want to use it!
# install.packages('survival')
# we need the survival package for performing
# survival analyses
library(survival)
# install.packages('surminer')
# we need this package for the ggsurvplot function
# which creates KM plots.
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
## Loading required package: magrittr
# Example -- Ovarian Cancer Data

# Question 1 
# Obtain the table of Kaplan Meier survival estimates for the observed data set. 
# - Obtain survival estimates using survfit() function
fit <- survfit(Surv(futime, fustat)~1, data=ovarian2)

# - Print table of survival estimates
# -- Default ouput only shows UNIQUE EVENT times 
summary(fit)
## Call: survfit(formula = Surv(futime, fustat) ~ 1, data = ovarian2)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    59     26       1    0.962  0.0377        0.890        1.000
##   115     25       1    0.923  0.0523        0.826        1.000
##   156     24       1    0.885  0.0627        0.770        1.000
##   268     23       1    0.846  0.0708        0.718        0.997
##   329     22       1    0.808  0.0773        0.670        0.974
##   353     21       1    0.769  0.0826        0.623        0.949
##   365     20       1    0.731  0.0870        0.579        0.923
##   431     17       1    0.688  0.0919        0.529        0.894
##   464     15       1    0.642  0.0965        0.478        0.862
##   475     14       1    0.596  0.0999        0.429        0.828
##   563     12       1    0.546  0.1032        0.377        0.791
##   638     11       1    0.497  0.1051        0.328        0.752
# -- Have to ask R to show censored event times
summary(fit,censor=TRUE)
## Call: survfit(formula = Surv(futime, fustat) ~ 1, data = ovarian2)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    59     26       1    0.962  0.0377        0.890        1.000
##   115     25       1    0.923  0.0523        0.826        1.000
##   156     24       1    0.885  0.0627        0.770        1.000
##   268     23       1    0.846  0.0708        0.718        0.997
##   329     22       1    0.808  0.0773        0.670        0.974
##   353     21       1    0.769  0.0826        0.623        0.949
##   365     20       1    0.731  0.0870        0.579        0.923
##   377     19       0    0.731  0.0870        0.579        0.923
##   421     18       0    0.731  0.0870        0.579        0.923
##   431     17       1    0.688  0.0919        0.529        0.894
##   448     16       0    0.688  0.0919        0.529        0.894
##   464     15       1    0.642  0.0965        0.478        0.862
##   475     14       1    0.596  0.0999        0.429        0.828
##   477     13       0    0.596  0.0999        0.429        0.828
##   563     12       1    0.546  0.1032        0.377        0.791
##   638     11       1    0.497  0.1051        0.328        0.752
##   744     10       0    0.497  0.1051        0.328        0.752
##   769      9       0    0.497  0.1051        0.328        0.752
##   770      8       0    0.497  0.1051        0.328        0.752
##   803      7       0    0.497  0.1051        0.328        0.752
##   855      6       0    0.497  0.1051        0.328        0.752
##  1040      5       0    0.497  0.1051        0.328        0.752
##  1106      4       0    0.497  0.1051        0.328        0.752
##  1129      3       0    0.497  0.1051        0.328        0.752
##  1206      2       0    0.497  0.1051        0.328        0.752
##  1227      1       0    0.497  0.1051        0.328        0.752
# generates median and 95% CI for median
summary(fit)$table 
##    records      n.max    n.start     events     *rmean *se(rmean)     median 
##   26.00000   26.00000   26.00000   12.00000  793.99930   91.51788  638.00000 
##    0.95LCL    0.95UCL 
##  464.00000         NA
# Question 2
# Create the Kaplan Meier plot that estimates the true survival curve.
# see this link for info on plot options:
# https://cran.r-project.org/web/packages/survminer/vignettes/Informative_Survival_Plots.html
# https://www.r-bloggers.com/survival-analysis-basics/
ggsurvplot(fit, data = ovarian2, risk.table = TRUE, surv.median.line = "hv")

# Question 3
# Using the output from Question 1 and 2, what is the estimated 
# probability that a patient with advanced ovarian carcinoma will 
# survive at least 2 years? 


# Question 4
# Using the output from Question 1 and 2, what is the 
# estimated probability that a patient with advanced 
# ovarian carcinoma will die before 6 months? 
# End of Program