# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 Day 5 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Motivating Example: CGD Data
# Data are from a placebo controlled trial of gamma interferon
# in chronic granulotomous disease (CGD). Outcome of interest
# is time to first serious infection observed through end of
# study for each patient.
# Data Set: cgd
# Data Dictionary:
# (1) id: subject identifiction number
# (2) treat: placebo or gamma interferon
# (1 = gamma; 0 = placebo)
# (3) sex: sex (1 = male; 0 = female)
# (4) age: age in years, at study entry
# (5) height: height in cm at study entry
# (6) weight: weight in kg at study entry
# (7) inherit: pattern of inheritance
# (1 = x-linked; 0 = autosomal)
# (8) propylac: use of prophylactic antibiotics
# at study entry (1 = yes; 0 = no)
# (9) tstop: time to fist serious infection or censoring
# (10) status: binary indicator of event or censoring
# (1 = event, 0 = censor)
## Download and load the data file = cgd
load(url("http://www.duke.edu/~sgrambow/crp241data/cgd.RData"))
# Install and load package for Survival Analysis
# Recall: You only need to install the package ONCE,
# but you must load the package EVERY time you want to use it!
# install.packages('survival')
# we need the survival package for performing
# survival analyses
library(survival)
##
## Attaching package: 'survival'
## The following object is masked _by_ '.GlobalEnv':
##
## cgd
# install.packages('survminer')
# we need this package for the ggsurvplot
# command used below to create the KM plot
library(survminer)
## Warning: package 'survminer' was built under R version 4.0.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.3
## Loading required package: ggpubr
# Create Kaplan Meier plot of survival curve by treatment arm
fit.km <- survfit(Surv(tstop, status) ~ treat, data=cgd)
# - Print table of survival estimates
# -- Default ouput only shows UNIQUE EVENT times
summary(fit.km)
## Call: survfit(formula = Surv(tstop, status) ~ treat, data = cgd)
##
## treat=0
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 4 65 1 0.985 0.0153 0.955 1.000
## 6 64 1 0.969 0.0214 0.928 1.000
## 8 63 1 0.954 0.0260 0.904 1.000
## 11 62 1 0.938 0.0298 0.882 0.999
## 14 61 1 0.923 0.0331 0.861 0.990
## 18 60 1 0.908 0.0359 0.840 0.981
## 19 59 1 0.892 0.0384 0.820 0.971
## 23 58 1 0.877 0.0407 0.801 0.961
## 52 57 1 0.862 0.0428 0.782 0.950
## 57 56 1 0.846 0.0448 0.763 0.939
## 67 55 1 0.831 0.0465 0.744 0.927
## 91 54 1 0.815 0.0481 0.726 0.915
## 99 51 1 0.799 0.0498 0.708 0.903
## 104 50 1 0.783 0.0513 0.689 0.891
## 120 49 1 0.767 0.0527 0.671 0.878
## 146 48 1 0.751 0.0539 0.653 0.865
## 168 47 1 0.735 0.0551 0.635 0.852
## 175 46 1 0.719 0.0562 0.617 0.838
## 206 42 1 0.702 0.0574 0.598 0.824
## 211 40 1 0.685 0.0586 0.579 0.810
## 226 38 1 0.667 0.0598 0.559 0.795
## 236 36 1 0.648 0.0609 0.539 0.779
## 246 34 1 0.629 0.0620 0.519 0.763
## 264 30 1 0.608 0.0634 0.496 0.746
## 280 20 1 0.578 0.0671 0.460 0.726
## 292 17 1 0.544 0.0713 0.421 0.703
## 294 15 1 0.508 0.0752 0.380 0.678
## 304 12 1 0.465 0.0799 0.332 0.651
## 318 7 1 0.399 0.0921 0.254 0.627
## 334 4 1 0.299 0.1106 0.145 0.617
##
## treat=1
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 65 63 1 0.984 0.0157 0.954 1.000
## 82 62 1 0.968 0.0221 0.926 1.000
## 113 61 1 0.952 0.0268 0.901 1.000
## 118 60 1 0.937 0.0307 0.878 0.999
## 146 59 1 0.921 0.0341 0.856 0.990
## 165 57 1 0.904 0.0371 0.835 0.980
## 167 56 1 0.888 0.0398 0.814 0.970
## 187 54 1 0.872 0.0423 0.793 0.959
## 207 50 1 0.854 0.0449 0.771 0.947
## 219 48 1 0.837 0.0474 0.749 0.935
## 265 42 1 0.817 0.0503 0.724 0.921
## 267 40 1 0.796 0.0530 0.699 0.907
## 274 33 1 0.772 0.0566 0.669 0.892
## 373 6 1 0.643 0.1266 0.438 0.946
# -- Have to ask R to show censored event times
summary(fit.km,censor=TRUE)
## Call: survfit(formula = Surv(tstop, status) ~ treat, data = cgd)
##
## treat=0
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 4 65 1 0.985 0.0153 0.955 1.000
## 6 64 1 0.969 0.0214 0.928 1.000
## 8 63 1 0.954 0.0260 0.904 1.000
## 11 62 1 0.938 0.0298 0.882 0.999
## 14 61 1 0.923 0.0331 0.861 0.990
## 18 60 1 0.908 0.0359 0.840 0.981
## 19 59 1 0.892 0.0384 0.820 0.971
## 23 58 1 0.877 0.0407 0.801 0.961
## 52 57 1 0.862 0.0428 0.782 0.950
## 57 56 1 0.846 0.0448 0.763 0.939
## 67 55 1 0.831 0.0465 0.744 0.927
## 91 54 1 0.815 0.0481 0.726 0.915
## 99 51 1 0.799 0.0498 0.708 0.903
## 104 50 1 0.783 0.0513 0.689 0.891
## 120 49 1 0.767 0.0527 0.671 0.878
## 146 48 1 0.751 0.0539 0.653 0.865
## 168 47 1 0.735 0.0551 0.635 0.852
## 175 46 1 0.719 0.0562 0.617 0.838
## 192 45 0 0.719 0.0562 0.617 0.838
## 198 44 0 0.719 0.0562 0.617 0.838
## 203 43 0 0.719 0.0562 0.617 0.838
## 206 42 1 0.702 0.0574 0.598 0.824
## 207 41 0 0.702 0.0574 0.598 0.824
## 211 40 1 0.685 0.0586 0.579 0.810
## 213 39 0 0.685 0.0586 0.579 0.810
## 226 38 1 0.667 0.0598 0.559 0.795
## 227 37 0 0.667 0.0598 0.559 0.795
## 236 36 1 0.648 0.0609 0.539 0.779
## 245 35 0 0.648 0.0609 0.539 0.779
## 246 34 1 0.629 0.0620 0.519 0.763
## 252 33 0 0.629 0.0620 0.519 0.763
## 255 32 0 0.629 0.0620 0.519 0.763
## 261 31 0 0.629 0.0620 0.519 0.763
## 264 30 1 0.608 0.0634 0.496 0.746
## 269 28 0 0.608 0.0634 0.496 0.746
## 270 26 0 0.608 0.0634 0.496 0.746
## 271 25 0 0.608 0.0634 0.496 0.746
## 273 24 0 0.608 0.0634 0.496 0.746
## 274 23 0 0.608 0.0634 0.496 0.746
## 276 22 0 0.608 0.0634 0.496 0.746
## 278 21 0 0.608 0.0634 0.496 0.746
## 280 20 1 0.578 0.0671 0.460 0.726
## 281 19 0 0.578 0.0671 0.460 0.726
## 288 18 0 0.578 0.0671 0.460 0.726
## 292 17 1 0.544 0.0713 0.421 0.703
## 293 16 0 0.544 0.0713 0.421 0.703
## 294 15 1 0.508 0.0752 0.380 0.678
## 300 13 0 0.508 0.0752 0.380 0.678
## 304 12 1 0.465 0.0799 0.332 0.651
## 306 11 0 0.465 0.0799 0.332 0.651
## 308 10 0 0.465 0.0799 0.332 0.651
## 316 9 0 0.465 0.0799 0.332 0.651
## 318 7 1 0.399 0.0921 0.254 0.627
## 329 6 0 0.399 0.0921 0.254 0.627
## 330 5 0 0.399 0.0921 0.254 0.627
## 334 4 1 0.299 0.1106 0.145 0.617
## 343 3 0 0.299 0.1106 0.145 0.617
## 357 2 0 0.299 0.1106 0.145 0.617
## 365 1 0 0.299 0.1106 0.145 0.617
##
## treat=1
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 65 63 1 0.984 0.0157 0.954 1.000
## 82 62 1 0.968 0.0221 0.926 1.000
## 113 61 1 0.952 0.0268 0.901 1.000
## 118 60 1 0.937 0.0307 0.878 0.999
## 146 59 1 0.921 0.0341 0.856 0.990
## 160 58 0 0.921 0.0341 0.856 0.990
## 165 57 1 0.904 0.0371 0.835 0.980
## 167 56 1 0.888 0.0398 0.814 0.970
## 185 55 0 0.888 0.0398 0.814 0.970
## 187 54 1 0.872 0.0423 0.793 0.959
## 197 53 0 0.872 0.0423 0.793 0.959
## 199 51 0 0.872 0.0423 0.793 0.959
## 207 50 1 0.854 0.0449 0.771 0.947
## 217 49 0 0.854 0.0449 0.771 0.947
## 219 48 1 0.837 0.0474 0.749 0.935
## 243 47 0 0.837 0.0474 0.749 0.935
## 254 46 0 0.837 0.0474 0.749 0.935
## 255 45 0 0.837 0.0474 0.749 0.935
## 259 44 0 0.837 0.0474 0.749 0.935
## 263 43 0 0.837 0.0474 0.749 0.935
## 265 42 1 0.817 0.0503 0.724 0.921
## 267 40 1 0.796 0.0530 0.699 0.907
## 269 39 0 0.796 0.0530 0.699 0.907
## 270 37 0 0.796 0.0530 0.699 0.907
## 271 36 0 0.796 0.0530 0.699 0.907
## 273 35 0 0.796 0.0530 0.699 0.907
## 274 33 1 0.772 0.0566 0.669 0.892
## 276 32 0 0.772 0.0566 0.669 0.892
## 277 31 0 0.772 0.0566 0.669 0.892
## 279 30 0 0.772 0.0566 0.669 0.892
## 284 29 0 0.772 0.0566 0.669 0.892
## 286 28 0 0.772 0.0566 0.669 0.892
## 293 27 0 0.772 0.0566 0.669 0.892
## 294 25 0 0.772 0.0566 0.669 0.892
## 297 24 0 0.772 0.0566 0.669 0.892
## 303 23 0 0.772 0.0566 0.669 0.892
## 304 22 0 0.772 0.0566 0.669 0.892
## 315 21 0 0.772 0.0566 0.669 0.892
## 318 20 0 0.772 0.0566 0.669 0.892
## 327 18 0 0.772 0.0566 0.669 0.892
## 331 17 0 0.772 0.0566 0.669 0.892
## 334 16 0 0.772 0.0566 0.669 0.892
## 335 15 0 0.772 0.0566 0.669 0.892
## 336 14 0 0.772 0.0566 0.669 0.892
## 337 13 0 0.772 0.0566 0.669 0.892
## 339 12 0 0.772 0.0566 0.669 0.892
## 350 11 0 0.772 0.0566 0.669 0.892
## 360 10 0 0.772 0.0566 0.669 0.892
## 363 9 0 0.772 0.0566 0.669 0.892
## 364 8 0 0.772 0.0566 0.669 0.892
## 371 7 0 0.772 0.0566 0.669 0.892
## 373 6 1 0.643 0.1266 0.438 0.946
## 376 4 0 0.643 0.1266 0.438 0.946
## 382 3 0 0.643 0.1266 0.438 0.946
## 388 2 0 0.643 0.1266 0.438 0.946
# generates median and 95% CI for median
summary(fit.km)$table
## records n.max n.start events *rmean *se(rmean) median 0.95LCL 0.95UCL
## treat=0 65 65 65 30 253.5730 16.93893 304 264 NA
## treat=1 63 63 63 14 331.8794 11.28660 NA 373 NA
# Create the Kaplan Meier plot that estimates
# the true survival curve. see this link for
# info on plot options:
# https://cran.r-project.org/web/packages/survminer/vignettes/Informative_Survival_Plots.html
# https://www.r-bloggers.com/survival-analysis-basics/
ggsurvplot(fit.km, data=cgd, ggtheme = theme_minimal(),
risk.table = TRUE)
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

# Compare survival curves by treatment arm using log-rank test
survdiff(Surv(tstop, status) ~ treat, data=cgd)
## Call:
## survdiff(formula = Surv(tstop, status) ~ treat, data = cgd)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## treat=0 65 30 18.9 6.48 11.7
## treat=1 63 14 25.1 4.89 11.7
##
## Chisq= 11.7 on 1 degrees of freedom, p= 6e-04
# Compare survival curves by treatment arm using Cox PH regression
mfit.cgd <- coxph(Surv(tstop, status) ~ treat, data=cgd)
summary(mfit.cgd)
## Call:
## coxph(formula = Surv(tstop, status) ~ treat, data = cgd)
##
## n= 128, number of events= 44
##
## coef exp(coef) se(coef) z Pr(>|z|)
## treat -1.0940 0.3349 0.3348 -3.268 0.00108 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## treat 0.3349 2.986 0.1737 0.6454
##
## Concordance= 0.621 (se = 0.036 )
## Likelihood ratio test= 11.8 on 1 df, p=6e-04
## Wald test = 10.68 on 1 df, p=0.001
## Score (logrank) test = 11.74 on 1 df, p=6e-04
# Answers to Questions!!!!
##############################################
# Question #1:
# What would the hazard ratio and its 95%
# confidence interval be for the Placebo
# Arm vs. Gamma Interferon Arm?
##############################################
summary(mfit.cgd)
## Call:
## coxph(formula = Surv(tstop, status) ~ treat, data = cgd)
##
## n= 128, number of events= 44
##
## coef exp(coef) se(coef) z Pr(>|z|)
## treat -1.0940 0.3349 0.3348 -3.268 0.00108 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## treat 0.3349 2.986 0.1737 0.6454
##
## Concordance= 0.621 (se = 0.036 )
## Likelihood ratio test= 11.8 on 1 df, p=6e-04
## Wald test = 10.68 on 1 df, p=0.001
## Score (logrank) test = 11.74 on 1 df, p=6e-04
# - Output from summary(mfit.cgd)
# > summary(mfit.cgd)
# Call:
# coxph(formula = Surv(tstop, status) ~ treat, data = cgd)
# n= 128, number of events= 44
# coef exp(coef) se(coef) z Pr(>|z|)
# treat -1.0940 0.3349 0.3348 -3.268 0.00108 **
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# exp(coef) exp(-coef) lower .95 upper .95
# treat 0.3349 2.986 0.1737 0.6454
# Concordance= 0.621 (se = 0.036 )
# Likelihood ratio test= 11.8 on 1 df, p=6e-04
# Wald test = 10.68 on 1 df, p=0.001
# Score (logrank) test = 11.74 on 1 df, p=6e-04
# We can hand calculate the
# HR for placebo vs. gamma as follows:
1/.3349
## [1] 2.985966
# [1] 2.985966
# Note that while you can calculate this manually as
# we have done here, this number is also included
# in the default output from the coxph function
# The confidence intervals for placebo vs gamma
# are not part of default output but they are
# easy to calculate as shown below:
# - 95% CI
1/.1737
## [1] 5.757052
1/.6454
## [1] 1.549427
# > 1/.1737
# [1] 5.757052
# > 1/.6454
# [1] 1.549427
# So we have the point estimate and
# confidence limits:
# 2.99 [95% CI: 1.55, 5.76]
# these can also be calculated using
# the confint() function as shown below:
exp(-1*confint(mfit.cgd))
## 2.5 % 97.5 %
## treat 5.755713 1.549377
# output yields
# 2.5 % 97.5 %
# treat 5.755713 1.549377
# NOTE that they are not labeled quite
# right so we need to flip them
# so lower LCL=1.55 and UCL=5.76
##############################################
# Question #2:
# Fit a Cox PH regression model that examines
# the association between time to infection and
# sex.
# Is there evidence of an association?
# What does the reported hazard ratio estimate?
##############################################
mfit2.cgd <- coxph(Surv(tstop, status) ~ sex, data=cgd)
summary(mfit2.cgd)
## Call:
## coxph(formula = Surv(tstop, status) ~ sex, data = cgd)
##
## n= 128, number of events= 44
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sex 0.2127 1.2370 0.4123 0.516 0.606
##
## exp(coef) exp(-coef) lower .95 upper .95
## sex 1.237 0.8084 0.5514 2.775
##
## Concordance= 0.507 (se = 0.031 )
## Likelihood ratio test= 0.28 on 1 df, p=0.6
## Wald test = 0.27 on 1 df, p=0.6
## Score (logrank) test = 0.27 on 1 df, p=0.6
# coef exp(coef) se(coef) z Pr(>|z|)
# sex 0.2127 1.2370 0.4123 0.516 0.606
# p=0.606 <--- No Association!!!
# the p-value is the key quantity to answer the question
# about statistical significance of the predictor
# Hazard ratio for sex is 1.24 suggesting that
# hazard increases by 24% among males vs females.
##############################################
# Question #3:
# Based on the model fit in Question 2, what
# would the Kaplan Meier figure stratified by
# sex look like for this data set?
##############################################
# - The lack of association would be evidenced by overlapping curves
# - Confirm by creating the KM plot ...
mfit.sex <- survfit(Surv(tstop, status) ~ sex, data=cgd)
ggsurvplot(mfit.sex, data=cgd, ggtheme=theme_minimal(),
risk.table = TRUE)
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

##############################################
# Question #4:
# Fit a Cox PH regression model that examines
# the association between time to infection
# and age. Is there evidence of an association?
# What does the reported hazard ratio estimate?
##############################################
mfit.age <- coxph(Surv(tstop, status) ~ age, data=cgd)
summary(mfit.age)
## Call:
## coxph(formula = Surv(tstop, status) ~ age, data = cgd)
##
## n= 128, number of events= 44
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age -0.02121 0.97901 0.01682 -1.261 0.207
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 0.979 1.021 0.9473 1.012
##
## Concordance= 0.57 (se = 0.048 )
## Likelihood ratio test= 1.69 on 1 df, p=0.2
## Wald test = 1.59 on 1 df, p=0.2
## Score (logrank) test = 1.6 on 1 df, p=0.2
# coef exp(coef) se(coef) z Pr(>|z|)
# age -0.02121 0.97901 0.01682 -1.261 0.207
# p-value = 0.207 <--- No Association
# Hazard ratio for age is 0.979
# Interpretation is that the hazard decreases as
# age increases. For a unit increase in age, the
# hazard will decrease by a multiplicative factor of
# 0.98. We can also see this by examining that on the
# log hazard scale the estimate is negative.
##############################################
# Question #5:
# Based on the model fit in Question 4, is it
# possible to estimate the hazard ratio for a
# 5 year change in age? Is there evidence of
# an association between time to infection and
# a 5 year change in age?
##############################################
# multiply the raw coefficient by 5 and then
# exponentiate -- can do the same with the CIs
# this rescaling doesn't change statistical
# significance of the variable in the model!
exp(5*-0.02121)
## [1] 0.8993797
# [1] 0.8993797
# Point Estimate of HR = 0.90
# For a 5 unit increase in age, the hazard
# will decrease by a multiplicative factor of
# 0.90.
# NOTE that you can also calculate using model object
exp(5*mfit.age$coefficients)
## age
## 0.899374
# yields 0.899374
# For CI
exp(5*confint(mfit.age))
## 2.5 % 97.5 %
## age 0.7627288 1.0605
# yields
# 2.5 % 97.5 %
# age 0.7627288 1.0605
#
# HR = 0.90 [95% CI: 0.76, 1.06]
# End of Program
# Making KM plots look pretty --
# here are ggsurvplot options
# ggsurvplot(
# fit, # survfit object with calculated statistics.
# data = fit, # data used to fit survival curves.
# risk.table = TRUE, # show risk table.
# pval = TRUE, # show p-value of log-rank test.
# conf.int = TRUE, # show confidence intervals for
# # point estimaes of survival curves.
# xlim = c(0,2000), # present narrower X axis, but not affect survival estimates.
# break.time.by = 500, # break X axis in time intervals by 500.
# ggtheme = theme_minimal(), # customize plot and risk table with a theme.
# risk.table.y.text.col = T, # colour risk table text annotations.
# risk.table.y.text = FALSE # show bars instead of names in text annotations
# # in legend of risk table
# risk.table.col = "strata", # Change risk table color by groups
# linetype = "strata", # Change line type by groups
# surv.median.line = "hv", # Specify median survival