# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 Day 4 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~

# Ovarian Cancer Survival Data Example: 
# A randomized clinical trial was performed to 
# compare two treatments of patients with advanced 
# ovarian carcinoma (stages IIIB and IV) using 
# either cyclophosphamide alone (1 g/m2) or 
# cyclophosphamide (500 mg/m2) plus adriamycin 
# (40 mg/m2) by iv injection every 3 weeks. 

# Data Set: ovarian2

# Data Dictionary: 
# (1) futime:       time to death or censoring post-randomization (days)
# (2) fustat:       censoring status (1 = died; 0 = censored)
# (3) age:        in years
# (4) resid.ds:   residual disease present (1=no; 2=yes)
# (5) rx:           treatment group (1=cyclophosphamide alone; 2=cyclophosphamide+adriamycin)
# (6) ecog.ps:    ECOG performance status (1 is better;
#                 see reference: http://ecog.org/general/perf_stat.html)

## Download and load the data file = ovarian
load(url("http://www.duke.edu/~sgrambow/crp241data/ovarian2.RData"))
# Examine Data

# examine structure of dataset and variables
str(ovarian2)
## 'data.frame':    26 obs. of  6 variables:
##  $ futime  : num  59 115 156 421 431 448 464 475 477 563 ...
##  $ fustat  : num  1 1 1 0 1 0 1 1 0 1 ...
##  $ age     : num  72.3 74.5 66.5 53.4 50.3 ...
##  $ resid.ds: num  2 2 2 2 2 1 2 2 2 1 ...
##  $ rx      : num  0 0 0 1 0 0 1 1 0 1 ...
##  $ ecog.ps : num  1 1 2 1 1 2 2 2 1 2 ...
# look at summary statistics
summary(ovarian2)
##      futime           fustat            age           resid.ds    
##  Min.   :  59.0   Min.   :0.0000   Min.   :38.89   Min.   :1.000  
##  1st Qu.: 368.0   1st Qu.:0.0000   1st Qu.:50.17   1st Qu.:1.000  
##  Median : 476.0   Median :0.0000   Median :56.85   Median :2.000  
##  Mean   : 599.5   Mean   :0.4615   Mean   :56.17   Mean   :1.577  
##  3rd Qu.: 794.8   3rd Qu.:1.0000   3rd Qu.:62.38   3rd Qu.:2.000  
##  Max.   :1227.0   Max.   :1.0000   Max.   :74.50   Max.   :2.000  
##        rx         ecog.ps     
##  Min.   :0.0   Min.   :1.000  
##  1st Qu.:0.0   1st Qu.:1.000  
##  Median :0.5   Median :1.000  
##  Mean   :0.5   Mean   :1.462  
##  3rd Qu.:1.0   3rd Qu.:2.000  
##  Max.   :1.0   Max.   :2.000
# visualize distribution of survival times
# use jitter to avoid stacking points
stripchart(ovarian2$futime,
           main="time to death or censoring post-randomization (days)",
           xlab="days",
           ylab="time to death-censoring ",
           method="jitter",
           col="blue",
           pch=19
)

# Install and load package for Survival Analysis 
# - Recall: You only need to install the package ONCE, but you must load the 
#           package EVERY time you want to use it!
# install.packages('survival')
# we need the survival package for performing
# survival analyses
library(survival)

# install.packages('survminer')
# we need this package for the ggsurvplot
# command used below to create the KM plot
library(survminer)
## Warning: package 'survminer' was built under R version 4.0.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.3
## Loading required package: ggpubr
# Example -- Ovarian Cancer Data

# Question 1 
# Obtain the table of Kaplan Meier survival 
# estimates for the observed data set. 


# Obtain survival estimates using survfit() function

fit.km <- survfit(Surv(futime, fustat)~1, data=ovarian2)

# - Print table of survival estimates
# -- Default output only shows UNIQUE EVENT times 

summary(fit.km)
## Call: survfit(formula = Surv(futime, fustat) ~ 1, data = ovarian2)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    59     26       1    0.962  0.0377        0.890        1.000
##   115     25       1    0.923  0.0523        0.826        1.000
##   156     24       1    0.885  0.0627        0.770        1.000
##   268     23       1    0.846  0.0708        0.718        0.997
##   329     22       1    0.808  0.0773        0.670        0.974
##   353     21       1    0.769  0.0826        0.623        0.949
##   365     20       1    0.731  0.0870        0.579        0.923
##   431     17       1    0.688  0.0919        0.529        0.894
##   464     15       1    0.642  0.0965        0.478        0.862
##   475     14       1    0.596  0.0999        0.429        0.828
##   563     12       1    0.546  0.1032        0.377        0.791
##   638     11       1    0.497  0.1051        0.328        0.752
# -- Have to ask R to show censored event times

summary(fit.km,censor=TRUE)
## Call: survfit(formula = Surv(futime, fustat) ~ 1, data = ovarian2)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    59     26       1    0.962  0.0377        0.890        1.000
##   115     25       1    0.923  0.0523        0.826        1.000
##   156     24       1    0.885  0.0627        0.770        1.000
##   268     23       1    0.846  0.0708        0.718        0.997
##   329     22       1    0.808  0.0773        0.670        0.974
##   353     21       1    0.769  0.0826        0.623        0.949
##   365     20       1    0.731  0.0870        0.579        0.923
##   377     19       0    0.731  0.0870        0.579        0.923
##   421     18       0    0.731  0.0870        0.579        0.923
##   431     17       1    0.688  0.0919        0.529        0.894
##   448     16       0    0.688  0.0919        0.529        0.894
##   464     15       1    0.642  0.0965        0.478        0.862
##   475     14       1    0.596  0.0999        0.429        0.828
##   477     13       0    0.596  0.0999        0.429        0.828
##   563     12       1    0.546  0.1032        0.377        0.791
##   638     11       1    0.497  0.1051        0.328        0.752
##   744     10       0    0.497  0.1051        0.328        0.752
##   769      9       0    0.497  0.1051        0.328        0.752
##   770      8       0    0.497  0.1051        0.328        0.752
##   803      7       0    0.497  0.1051        0.328        0.752
##   855      6       0    0.497  0.1051        0.328        0.752
##  1040      5       0    0.497  0.1051        0.328        0.752
##  1106      4       0    0.497  0.1051        0.328        0.752
##  1129      3       0    0.497  0.1051        0.328        0.752
##  1206      2       0    0.497  0.1051        0.328        0.752
##  1227      1       0    0.497  0.1051        0.328        0.752
# generates median and 95% CI for median

summary(fit.km)$table 
##    records      n.max    n.start     events     *rmean *se(rmean)     median 
##   26.00000   26.00000   26.00000   12.00000  793.99930   91.51788  638.00000 
##    0.95LCL    0.95UCL 
##  464.00000         NA
# Question 2
# Create the Kaplan Meier plot that estimates the true survival curve.
# see this link for info on plot options:
# https://cran.r-project.org/web/packages/survminer/vignettes/Informative_Survival_Plots.html
# https://www.r-bloggers.com/survival-analysis-basics/

# basic KM plot
plot(fit.km)

# better KM plot
ggsurvplot(fit.km, data = ovarian2, 
           risk.table = TRUE, 
           surv.median.line = "hv")

# Question 3 & 4
# Q3 - Using the output from Question 1 and 2, what is the estimated 
# probability that a patient with advanced ovarian carcinoma will 
# survive at least 2 years? 

# Q4 - Using the output from Question 1 and 2, what is the 
# estimated probability ovarian carcinoma will die before 6 months?

# - futime is recorded in days
# - 2 years = 365*2 = 730 days
# - 6 months = 30*6 = 180 days

summary(fit.km)
## Call: survfit(formula = Surv(futime, fustat) ~ 1, data = ovarian2)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    59     26       1    0.962  0.0377        0.890        1.000
##   115     25       1    0.923  0.0523        0.826        1.000
##   156     24       1    0.885  0.0627        0.770        1.000
##   268     23       1    0.846  0.0708        0.718        0.997
##   329     22       1    0.808  0.0773        0.670        0.974
##   353     21       1    0.769  0.0826        0.623        0.949
##   365     20       1    0.731  0.0870        0.579        0.923
##   431     17       1    0.688  0.0919        0.529        0.894
##   464     15       1    0.642  0.0965        0.478        0.862
##   475     14       1    0.596  0.0999        0.429        0.828
##   563     12       1    0.546  0.1032        0.377        0.791
##   638     11       1    0.497  0.1051        0.328        0.752
# - Output from summary(fit)
# > summary(fit)

# 
# time n.risk n.event survival std.err
#  59     26       1    0.962  0.0377
# 115     25       1    0.923  0.0523
# 156     24       1    0.885  0.0627
# 268     23       1    0.846  0.0708
# 329     22       1    0.808  0.0773
# 353     21       1    0.769  0.0826
# 365     20       1    0.731  0.0870
# 431     17       1    0.688  0.0919
# 464     15       1    0.642  0.0965
# 475     14       1    0.596  0.0999
# 563     12       1    0.546  0.1032
# 638     11       1    0.497  0.1051     

# - Q3 Answer: 
# -- P(Y > 730 days) = P(Y > 638 days) = 0.497

# - Q4 Answer: 
# -- P(Y < 180 days) = 1 - P(Y > 180 days) = 1 - 0.885 = 0.115
# End of Program

# Making KM plots look pretty -- 
# here are ggsurvplot options
# ggsurvplot(
# fit,                     # survfit object with calculated statistics.
# data = fit,  # data used to fit survival curves. 
#        risk.table = TRUE,       # show risk table.
#        pval = TRUE,             # show p-value of log-rank test.
#        conf.int = TRUE,         # show confidence intervals for 
#                                 # point estimaes of survival curves.
#        xlim = c(0,2000),        # present narrower X axis, but not affect survival estimates.
#        break.time.by = 500,     # break X axis in time intervals by 500.
#        ggtheme = theme_minimal(), # customize plot and risk table with a theme.
#        risk.table.y.text.col = T, # colour risk table text annotations.
#        risk.table.y.text = FALSE # show bars instead of names in text annotations
#                                 # in legend of risk table
#        risk.table.col = "strata", # Change risk table color by groups
#        linetype = "strata", # Change line type by groups
#        surv.median.line = "hv", # Specify median survival