# ~~~~~~~~~~~~~~~~~~~~~~~~~~
#  BMJ Statistics Notes
#     DATA SETS      
#    for survival analysis
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# 
# Martin Bland, J., & Altman, D. G. (1998). 
# Survival probabilities (the Kaplan-Meier method). 
# British Medical Journal, 317(7172), 1572.
# URL to PDF
# https://www.bmj.com/content/bmj/317/7172/1572.full.pdf

# Here is the dataset from the article along 
# with code to replicate KM plot shown in article.

# Data Set: bmj_km

# Data Dictionary: 
# (1)  months     time to conception or censoring in months
# (2)  event      censoring indicator (0=censor; 1=conception)  
## load the data file = bmj_km
load(url("http://www.duke.edu/~sgrambow/crp241data/bmj_km.RData"))
# Inspect and summarize the data
# Inspect the raw data
View(bmj_km)

# Examine structure of the data
str(bmj_km)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 38 obs. of  2 variables:
##  $ months: num  1 1 1 1 1 1 2 2 2 2 ...
##  $ event : num  1 1 1 1 1 1 1 1 1 1 ...
# both variables are numeric
 
# numerical summary of the data
summary(bmj_km)
##      months           event       
##  Min.   : 1.000   Min.   :0.0000  
##  1st Qu.: 2.000   1st Qu.:0.0000  
##  Median : 4.000   Median :1.0000  
##  Mean   : 6.316   Mean   :0.6579  
##  3rd Qu.: 9.000   3rd Qu.:1.0000  
##  Max.   :24.000   Max.   :1.0000
# median of months variable = 4
# 66% of subject experienced event of conception
# = 25/38 participants
# obtain KM table and plot
# Obtain survival estimates using survfit() function
library(survival)
fit.km <- survfit(Surv(months, event) ~1, data=bmj_km)

# - Print table of survival estimates
# -- Default output only shows UNIQUE EVENT times 
summary(fit.km)
## Call: survfit(formula = Surv(months, event) ~ 1, data = bmj_km)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     38       6    0.842  0.0592       0.7338        0.966
##     2     32       5    0.711  0.0736       0.5800        0.870
##     3     26       3    0.629  0.0789       0.4915        0.804
##     4     22       3    0.543  0.0822       0.4035        0.730
##     6     18       2    0.483  0.0834       0.3439        0.677
##     9     12       3    0.362  0.0869       0.2261        0.579
##    10      6       1    0.302  0.0910       0.1670        0.545
##    13      4       1    0.226  0.0944       0.0998        0.513
##    16      3       1    0.151  0.0880       0.0480        0.474
# -- Have to ask R to show censored event times
summary(fit.km,censor=TRUE)
## Call: survfit(formula = Surv(months, event) ~ 1, data = bmj_km)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     38       6    0.842  0.0592       0.7338        0.966
##     2     32       5    0.711  0.0736       0.5800        0.870
##     3     26       3    0.629  0.0789       0.4915        0.804
##     4     22       3    0.543  0.0822       0.4035        0.730
##     6     18       2    0.483  0.0834       0.3439        0.677
##     7     16       0    0.483  0.0834       0.3439        0.677
##     8     14       0    0.483  0.0834       0.3439        0.677
##     9     12       3    0.362  0.0869       0.2261        0.579
##    10      6       1    0.302  0.0910       0.1670        0.545
##    11      5       0    0.302  0.0910       0.1670        0.545
##    13      4       1    0.226  0.0944       0.0998        0.513
##    16      3       1    0.151  0.0880       0.0480        0.474
##    24      2       0    0.151  0.0880       0.0480        0.474
# generates median and 95% CI for median
summary(fit.km)$table 
##    records      n.max    n.start     events     *rmean *se(rmean)     median 
##  38.000000  38.000000  38.000000  25.000000   8.865833   1.497789   6.000000 
##    0.95LCL    0.95UCL 
##   3.000000  16.000000
# Question 2
# Create the Kaplan Meier plot that estimates the true 
# survival curve. see this link for info on plot options:
# https://cran.r-project.org/web/packages/survminer/vignettes/Informative_Survival_Plots.html
# https://www.r-bloggers.com/survival-analysis-basics/

# KM plot
library(survminer)
## Warning: package 'survminer' was built under R version 4.0.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.3
## Loading required package: ggpubr
ggsurvplot(fit.km, data = bmj_km, 
           risk.table = TRUE, 
           surv.median.line = "hv",
           title="Survival curve showing probability of not conceiving ",
           xlab = "time in months")