# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# BMJ Statistics Notes
# DATA SETS
# for survival analysis
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Martin Bland, J., & Altman, D. G. (1998).
# Survival probabilities (the Kaplan-Meier method).
# British Medical Journal, 317(7172), 1572.
# URL to PDF
# https://www.bmj.com/content/bmj/317/7172/1572.full.pdf
# Here is the dataset from the article along
# with code to replicate KM plot shown in article.
# Data Set: bmj_km
# Data Dictionary:
# (1) months time to conception or censoring in months
# (2) event censoring indicator (0=censor; 1=conception)
## load the data file = bmj_km
load(url("http://www.duke.edu/~sgrambow/crp241data/bmj_km.RData"))
# Inspect and summarize the data
# Inspect the raw data
View(bmj_km)
# Examine structure of the data
str(bmj_km)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 38 obs. of 2 variables:
## $ months: num 1 1 1 1 1 1 2 2 2 2 ...
## $ event : num 1 1 1 1 1 1 1 1 1 1 ...
# both variables are numeric
# numerical summary of the data
summary(bmj_km)
## months event
## Min. : 1.000 Min. :0.0000
## 1st Qu.: 2.000 1st Qu.:0.0000
## Median : 4.000 Median :1.0000
## Mean : 6.316 Mean :0.6579
## 3rd Qu.: 9.000 3rd Qu.:1.0000
## Max. :24.000 Max. :1.0000
# median of months variable = 4
# 66% of subject experienced event of conception
# = 25/38 participants
# obtain KM table and plot
# Obtain survival estimates using survfit() function
library(survival)
fit.km <- survfit(Surv(months, event) ~1, data=bmj_km)
# - Print table of survival estimates
# -- Default output only shows UNIQUE EVENT times
summary(fit.km)
## Call: survfit(formula = Surv(months, event) ~ 1, data = bmj_km)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 38 6 0.842 0.0592 0.7338 0.966
## 2 32 5 0.711 0.0736 0.5800 0.870
## 3 26 3 0.629 0.0789 0.4915 0.804
## 4 22 3 0.543 0.0822 0.4035 0.730
## 6 18 2 0.483 0.0834 0.3439 0.677
## 9 12 3 0.362 0.0869 0.2261 0.579
## 10 6 1 0.302 0.0910 0.1670 0.545
## 13 4 1 0.226 0.0944 0.0998 0.513
## 16 3 1 0.151 0.0880 0.0480 0.474
# -- Have to ask R to show censored event times
summary(fit.km,censor=TRUE)
## Call: survfit(formula = Surv(months, event) ~ 1, data = bmj_km)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 38 6 0.842 0.0592 0.7338 0.966
## 2 32 5 0.711 0.0736 0.5800 0.870
## 3 26 3 0.629 0.0789 0.4915 0.804
## 4 22 3 0.543 0.0822 0.4035 0.730
## 6 18 2 0.483 0.0834 0.3439 0.677
## 7 16 0 0.483 0.0834 0.3439 0.677
## 8 14 0 0.483 0.0834 0.3439 0.677
## 9 12 3 0.362 0.0869 0.2261 0.579
## 10 6 1 0.302 0.0910 0.1670 0.545
## 11 5 0 0.302 0.0910 0.1670 0.545
## 13 4 1 0.226 0.0944 0.0998 0.513
## 16 3 1 0.151 0.0880 0.0480 0.474
## 24 2 0 0.151 0.0880 0.0480 0.474
# generates median and 95% CI for median
summary(fit.km)$table
## records n.max n.start events *rmean *se(rmean) median
## 38.000000 38.000000 38.000000 25.000000 8.865833 1.497789 6.000000
## 0.95LCL 0.95UCL
## 3.000000 16.000000
# Question 2
# Create the Kaplan Meier plot that estimates the true
# survival curve. see this link for info on plot options:
# https://cran.r-project.org/web/packages/survminer/vignettes/Informative_Survival_Plots.html
# https://www.r-bloggers.com/survival-analysis-basics/
# KM plot
library(survminer)
## Warning: package 'survminer' was built under R version 4.0.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.3
## Loading required package: ggpubr
ggsurvplot(fit.km, data = bmj_km,
risk.table = TRUE,
surv.median.line = "hv",
title="Survival curve showing probability of not conceiving ",
xlab = "time in months")
