# ~~~~~~~~~~~~~~~~~~~~~~~~~~
#  BMJ Statistics Notes
#     Data from log-rank 
#         article
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Bland, J. M., & Altman, D. G. (2004). 
# The logrank test. Bmj, 328(7447), 1073.
# PDF link: https://www.bmj.com/content/bmj/328/7447/1073.full.pdf

# Here is the dataset from the article along 
# with code to replicate analysis shown in article.

# Data Set: bmj_logrank

# Data Dictionary: 
# (1)  weeks     time to death or censoring in weeks
# (2)  event     censoring indicator (0=censor; 1=death)  
# (3)  group     A = astrocytoma; G = Glioblastoma
## load the data file = bmj_km
load(url("http://www.duke.edu/~sgrambow/crp241data/bmj_logrank.RData"))
# Inspect and summarize the data
# Inspect the raw data
View(bmj_logrank)

# Examine structure of the data
str(bmj_logrank)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 51 obs. of  3 variables:
##  $ weeks: num  6 13 21 30 31 37 38 47 49 50 ...
##  $ event: num  1 1 1 1 0 1 1 0 1 1 ...
##  $ group: chr  "A" "A" "A" "A" ...
# weeks and event are numeric
# group is character
 
# numerical summary of the data
summary(bmj_logrank)
##      weeks            event           group          
##  Min.   :  6.00   Min.   :0.0000   Length:51         
##  1st Qu.: 22.50   1st Qu.:1.0000   Class :character  
##  Median : 38.00   Median :1.0000   Mode  :character  
##  Mean   : 54.59   Mean   :0.8235                     
##  3rd Qu.: 79.50   3rd Qu.:1.0000                     
##  Max.   :219.00   Max.   :1.0000
# median of weeks variable = 36
# 82% of subject experienced event of death
# obtain KM table and plot
# Obtain survival estimates using survfit() function
library(survival)
fit.logrank <- survfit(Surv(weeks, event) ~group, data=bmj_logrank)

# - Print table of survival estimates
# -- Default output only shows UNIQUE EVENT times 
summary(fit.logrank)
## Call: survfit(formula = Surv(weeks, event) ~ group, data = bmj_logrank)
## 
##                 group=A 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     6     20       1    0.950  0.0487        0.859        1.000
##    13     19       1    0.900  0.0671        0.778        1.000
##    21     18       1    0.850  0.0798        0.707        1.000
##    30     17       1    0.800  0.0894        0.643        0.996
##    37     15       1    0.747  0.0981        0.577        0.966
##    38     14       1    0.693  0.1046        0.516        0.932
##    49     12       1    0.636  0.1107        0.452        0.894
##    50     11       1    0.578  0.1147        0.392        0.853
##    63     10       1    0.520  0.1169        0.335        0.808
##    79      9       1    0.462  0.1173        0.281        0.760
##    86      5       1    0.370  0.1251        0.191        0.718
##    98      4       1    0.277  0.1233        0.116        0.663
##   202      2       1    0.139  0.1158        0.027        0.713
##   219      1       1    0.000     NaN           NA           NA
## 
##                 group=G 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    10     31       2   0.9355  0.0441      0.85288        1.000
##    12     29       1   0.9032  0.0531      0.80492        1.000
##    13     28       1   0.8710  0.0602      0.76060        0.997
##    14     27       1   0.8387  0.0661      0.71874        0.979
##    15     26       1   0.8065  0.0710      0.67871        0.958
##    16     25       1   0.7742  0.0751      0.64015        0.936
##    17     24       1   0.7419  0.0786      0.60284        0.913
##    18     23       1   0.7097  0.0815      0.56660        0.889
##    20     22       1   0.6774  0.0840      0.53132        0.864
##    24     21       2   0.6129  0.0875      0.46333        0.811
##    25     19       1   0.5806  0.0886      0.43051        0.783
##    28     18       1   0.5484  0.0894      0.39843        0.755
##    30     17       1   0.5161  0.0898      0.36706        0.726
##    33     16       1   0.4839  0.0898      0.33638        0.696
##    35     14       1   0.4493  0.0898      0.30375        0.665
##    37     13       1   0.4147  0.0893      0.27202        0.632
##    40     12       2   0.3456  0.0867      0.21134        0.565
##    46      9       1   0.3072  0.0852      0.17842        0.529
##    48      8       1   0.2688  0.0827      0.14705        0.491
##    76      6       1   0.2240  0.0802      0.11109        0.452
##    81      5       1   0.1792  0.0756      0.07838        0.410
##    82      4       1   0.1344  0.0687      0.04934        0.366
##    91      3       1   0.0896  0.0586      0.02486        0.323
##   112      2       1   0.0448  0.0432      0.00678        0.296
##   181      1       1   0.0000     NaN           NA           NA
# -- Have to ask R to show censored event times
summary(fit.logrank,censor=TRUE)
## Call: survfit(formula = Surv(weeks, event) ~ group, data = bmj_logrank)
## 
##                 group=A 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     6     20       1    0.950  0.0487        0.859        1.000
##    13     19       1    0.900  0.0671        0.778        1.000
##    21     18       1    0.850  0.0798        0.707        1.000
##    30     17       1    0.800  0.0894        0.643        0.996
##    31     16       0    0.800  0.0894        0.643        0.996
##    37     15       1    0.747  0.0981        0.577        0.966
##    38     14       1    0.693  0.1046        0.516        0.932
##    47     13       0    0.693  0.1046        0.516        0.932
##    49     12       1    0.636  0.1107        0.452        0.894
##    50     11       1    0.578  0.1147        0.392        0.853
##    63     10       1    0.520  0.1169        0.335        0.808
##    79      9       1    0.462  0.1173        0.281        0.760
##    80      8       0    0.462  0.1173        0.281        0.760
##    82      7       0    0.462  0.1173        0.281        0.760
##    86      5       1    0.370  0.1251        0.191        0.718
##    98      4       1    0.277  0.1233        0.116        0.663
##   149      3       0    0.277  0.1233        0.116        0.663
##   202      2       1    0.139  0.1158        0.027        0.713
##   219      1       1    0.000     NaN           NA           NA
## 
##                 group=G 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    10     31       2   0.9355  0.0441      0.85288        1.000
##    12     29       1   0.9032  0.0531      0.80492        1.000
##    13     28       1   0.8710  0.0602      0.76060        0.997
##    14     27       1   0.8387  0.0661      0.71874        0.979
##    15     26       1   0.8065  0.0710      0.67871        0.958
##    16     25       1   0.7742  0.0751      0.64015        0.936
##    17     24       1   0.7419  0.0786      0.60284        0.913
##    18     23       1   0.7097  0.0815      0.56660        0.889
##    20     22       1   0.6774  0.0840      0.53132        0.864
##    24     21       2   0.6129  0.0875      0.46333        0.811
##    25     19       1   0.5806  0.0886      0.43051        0.783
##    28     18       1   0.5484  0.0894      0.39843        0.755
##    30     17       1   0.5161  0.0898      0.36706        0.726
##    33     16       1   0.4839  0.0898      0.33638        0.696
##    34     15       0   0.4839  0.0898      0.33638        0.696
##    35     14       1   0.4493  0.0898      0.30375        0.665
##    37     13       1   0.4147  0.0893      0.27202        0.632
##    40     12       2   0.3456  0.0867      0.21134        0.565
##    46      9       1   0.3072  0.0852      0.17842        0.529
##    48      8       1   0.2688  0.0827      0.14705        0.491
##    70      7       0   0.2688  0.0827      0.14705        0.491
##    76      6       1   0.2240  0.0802      0.11109        0.452
##    81      5       1   0.1792  0.0756      0.07838        0.410
##    82      4       1   0.1344  0.0687      0.04934        0.366
##    91      3       1   0.0896  0.0586      0.02486        0.323
##   112      2       1   0.0448  0.0432      0.00678        0.296
##   181      1       1   0.0000     NaN           NA           NA
# generates median and 95% CI for median
summary(fit.logrank)$table 
##         records n.max n.start events   *rmean *se(rmean) median 0.95LCL 0.95UCL
## group=A      20    20      20     14 93.90089  17.786893     79      49      NA
## group=G      31    31      31     28 46.74322   7.731678     33      24      48
# Create the Kaplan Meier plot that estimates the true 
# survival curve. see this link for info on plot options:
# https://cran.r-project.org/web/packages/survminer/vignettes/Informative_Survival_Plots.html
# https://www.r-bloggers.com/survival-analysis-basics/

# KM plot
library(survminer)
## Warning: package 'survminer' was built under R version 4.0.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.3
## Loading required package: ggpubr
ggsurvplot(fit.logrank, data = bmj_logrank, 
           risk.table = TRUE, 
           surv.median.line = "hv",
           title="Survival curve showing probability of death ",
           xlab = "time in weeks")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

# Compare survival curves by group using log-rank test

survdiff(Surv(weeks, event) ~ group, data=bmj_logrank)
## Call:
## survdiff(formula = Surv(weeks, event) ~ group, data = bmj_logrank)
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## group=A 20       14     22.5      3.20       7.5
## group=G 31       28     19.5      3.69       7.5
## 
##  Chisq= 7.5  on 1 degrees of freedom, p= 0.006
# the output provides the numbers from the article
# in terms of expected events, test statistic, and 
# p-value.

# Note there are some small discrepancies in numbers, 
# likely due to roundoff.