# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 Day 5 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~

# Motivating Example: CGD Data 
# Data are from a placebo controlled trial of gamma interferon 
# in chronic granulotomous disease (CGD). Outcome of interest 
# is time to first serious infection observed through end of 
# study for each patient.

# Data Set: cgd

# Data Dictionary: 
# (1)  id:           subject identifiction number
# (2)  treat:        placebo or gamma interferon 
#                    (1 = gamma; 0 = placebo)
# (3)  sex:          sex (1 = male; 0 = female)
# (4)  age:          age in years, at study entry
# (5)  height:       height in cm at study entry
# (6)  weight:       weight in kg at study entry
# (7)  inherit:      pattern of inheritance 
#                    (1 = x-linked; 0 = autosomal)
# (8)  propylac:     use of prophylactic antibiotics 
#                    at study entry (1 = yes; 0 = no)
# (9)  tstop:        time to fist serious infection or censoring 
# (10) status:       binary indicator of event or censoring 
#                    (1 = event, 0 = censor)

## Download and load the data file = cgd
load(url("http://www.duke.edu/~sgrambow/crp241data/cgd.RData"))
# Install and load package for Survival Analysis 
# Recall: You only need to install the package ONCE, 
# but you must load the package EVERY time you want to use it!
# install.packages('survival')
# we need the survival package for performing
# survival analyses

library(survival)
## 
## Attaching package: 'survival'
## The following object is masked _by_ '.GlobalEnv':
## 
##     cgd
# install.packages('survminer')
# we need this package for the ggsurvplot
# command used below to create the KM plot

library(survminer)
## Warning: package 'survminer' was built under R version 4.0.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.3
## Loading required package: ggpubr
# Create Kaplan Meier plot of survival curve by treatment arm
fit.km <- survfit(Surv(tstop, status) ~ treat, data=cgd)

# - Print table of survival estimates
# -- Default ouput only shows UNIQUE EVENT times 
summary(fit.km)
## Call: survfit(formula = Surv(tstop, status) ~ treat, data = cgd)
## 
##                 treat=0 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     4     65       1    0.985  0.0153        0.955        1.000
##     6     64       1    0.969  0.0214        0.928        1.000
##     8     63       1    0.954  0.0260        0.904        1.000
##    11     62       1    0.938  0.0298        0.882        0.999
##    14     61       1    0.923  0.0331        0.861        0.990
##    18     60       1    0.908  0.0359        0.840        0.981
##    19     59       1    0.892  0.0384        0.820        0.971
##    23     58       1    0.877  0.0407        0.801        0.961
##    52     57       1    0.862  0.0428        0.782        0.950
##    57     56       1    0.846  0.0448        0.763        0.939
##    67     55       1    0.831  0.0465        0.744        0.927
##    91     54       1    0.815  0.0481        0.726        0.915
##    99     51       1    0.799  0.0498        0.708        0.903
##   104     50       1    0.783  0.0513        0.689        0.891
##   120     49       1    0.767  0.0527        0.671        0.878
##   146     48       1    0.751  0.0539        0.653        0.865
##   168     47       1    0.735  0.0551        0.635        0.852
##   175     46       1    0.719  0.0562        0.617        0.838
##   206     42       1    0.702  0.0574        0.598        0.824
##   211     40       1    0.685  0.0586        0.579        0.810
##   226     38       1    0.667  0.0598        0.559        0.795
##   236     36       1    0.648  0.0609        0.539        0.779
##   246     34       1    0.629  0.0620        0.519        0.763
##   264     30       1    0.608  0.0634        0.496        0.746
##   280     20       1    0.578  0.0671        0.460        0.726
##   292     17       1    0.544  0.0713        0.421        0.703
##   294     15       1    0.508  0.0752        0.380        0.678
##   304     12       1    0.465  0.0799        0.332        0.651
##   318      7       1    0.399  0.0921        0.254        0.627
##   334      4       1    0.299  0.1106        0.145        0.617
## 
##                 treat=1 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    65     63       1    0.984  0.0157        0.954        1.000
##    82     62       1    0.968  0.0221        0.926        1.000
##   113     61       1    0.952  0.0268        0.901        1.000
##   118     60       1    0.937  0.0307        0.878        0.999
##   146     59       1    0.921  0.0341        0.856        0.990
##   165     57       1    0.904  0.0371        0.835        0.980
##   167     56       1    0.888  0.0398        0.814        0.970
##   187     54       1    0.872  0.0423        0.793        0.959
##   207     50       1    0.854  0.0449        0.771        0.947
##   219     48       1    0.837  0.0474        0.749        0.935
##   265     42       1    0.817  0.0503        0.724        0.921
##   267     40       1    0.796  0.0530        0.699        0.907
##   274     33       1    0.772  0.0566        0.669        0.892
##   373      6       1    0.643  0.1266        0.438        0.946
# -- Have to ask R to show censored event times
summary(fit.km,censor=TRUE)
## Call: survfit(formula = Surv(tstop, status) ~ treat, data = cgd)
## 
##                 treat=0 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     4     65       1    0.985  0.0153        0.955        1.000
##     6     64       1    0.969  0.0214        0.928        1.000
##     8     63       1    0.954  0.0260        0.904        1.000
##    11     62       1    0.938  0.0298        0.882        0.999
##    14     61       1    0.923  0.0331        0.861        0.990
##    18     60       1    0.908  0.0359        0.840        0.981
##    19     59       1    0.892  0.0384        0.820        0.971
##    23     58       1    0.877  0.0407        0.801        0.961
##    52     57       1    0.862  0.0428        0.782        0.950
##    57     56       1    0.846  0.0448        0.763        0.939
##    67     55       1    0.831  0.0465        0.744        0.927
##    91     54       1    0.815  0.0481        0.726        0.915
##    99     51       1    0.799  0.0498        0.708        0.903
##   104     50       1    0.783  0.0513        0.689        0.891
##   120     49       1    0.767  0.0527        0.671        0.878
##   146     48       1    0.751  0.0539        0.653        0.865
##   168     47       1    0.735  0.0551        0.635        0.852
##   175     46       1    0.719  0.0562        0.617        0.838
##   192     45       0    0.719  0.0562        0.617        0.838
##   198     44       0    0.719  0.0562        0.617        0.838
##   203     43       0    0.719  0.0562        0.617        0.838
##   206     42       1    0.702  0.0574        0.598        0.824
##   207     41       0    0.702  0.0574        0.598        0.824
##   211     40       1    0.685  0.0586        0.579        0.810
##   213     39       0    0.685  0.0586        0.579        0.810
##   226     38       1    0.667  0.0598        0.559        0.795
##   227     37       0    0.667  0.0598        0.559        0.795
##   236     36       1    0.648  0.0609        0.539        0.779
##   245     35       0    0.648  0.0609        0.539        0.779
##   246     34       1    0.629  0.0620        0.519        0.763
##   252     33       0    0.629  0.0620        0.519        0.763
##   255     32       0    0.629  0.0620        0.519        0.763
##   261     31       0    0.629  0.0620        0.519        0.763
##   264     30       1    0.608  0.0634        0.496        0.746
##   269     28       0    0.608  0.0634        0.496        0.746
##   270     26       0    0.608  0.0634        0.496        0.746
##   271     25       0    0.608  0.0634        0.496        0.746
##   273     24       0    0.608  0.0634        0.496        0.746
##   274     23       0    0.608  0.0634        0.496        0.746
##   276     22       0    0.608  0.0634        0.496        0.746
##   278     21       0    0.608  0.0634        0.496        0.746
##   280     20       1    0.578  0.0671        0.460        0.726
##   281     19       0    0.578  0.0671        0.460        0.726
##   288     18       0    0.578  0.0671        0.460        0.726
##   292     17       1    0.544  0.0713        0.421        0.703
##   293     16       0    0.544  0.0713        0.421        0.703
##   294     15       1    0.508  0.0752        0.380        0.678
##   300     13       0    0.508  0.0752        0.380        0.678
##   304     12       1    0.465  0.0799        0.332        0.651
##   306     11       0    0.465  0.0799        0.332        0.651
##   308     10       0    0.465  0.0799        0.332        0.651
##   316      9       0    0.465  0.0799        0.332        0.651
##   318      7       1    0.399  0.0921        0.254        0.627
##   329      6       0    0.399  0.0921        0.254        0.627
##   330      5       0    0.399  0.0921        0.254        0.627
##   334      4       1    0.299  0.1106        0.145        0.617
##   343      3       0    0.299  0.1106        0.145        0.617
##   357      2       0    0.299  0.1106        0.145        0.617
##   365      1       0    0.299  0.1106        0.145        0.617
## 
##                 treat=1 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    65     63       1    0.984  0.0157        0.954        1.000
##    82     62       1    0.968  0.0221        0.926        1.000
##   113     61       1    0.952  0.0268        0.901        1.000
##   118     60       1    0.937  0.0307        0.878        0.999
##   146     59       1    0.921  0.0341        0.856        0.990
##   160     58       0    0.921  0.0341        0.856        0.990
##   165     57       1    0.904  0.0371        0.835        0.980
##   167     56       1    0.888  0.0398        0.814        0.970
##   185     55       0    0.888  0.0398        0.814        0.970
##   187     54       1    0.872  0.0423        0.793        0.959
##   197     53       0    0.872  0.0423        0.793        0.959
##   199     51       0    0.872  0.0423        0.793        0.959
##   207     50       1    0.854  0.0449        0.771        0.947
##   217     49       0    0.854  0.0449        0.771        0.947
##   219     48       1    0.837  0.0474        0.749        0.935
##   243     47       0    0.837  0.0474        0.749        0.935
##   254     46       0    0.837  0.0474        0.749        0.935
##   255     45       0    0.837  0.0474        0.749        0.935
##   259     44       0    0.837  0.0474        0.749        0.935
##   263     43       0    0.837  0.0474        0.749        0.935
##   265     42       1    0.817  0.0503        0.724        0.921
##   267     40       1    0.796  0.0530        0.699        0.907
##   269     39       0    0.796  0.0530        0.699        0.907
##   270     37       0    0.796  0.0530        0.699        0.907
##   271     36       0    0.796  0.0530        0.699        0.907
##   273     35       0    0.796  0.0530        0.699        0.907
##   274     33       1    0.772  0.0566        0.669        0.892
##   276     32       0    0.772  0.0566        0.669        0.892
##   277     31       0    0.772  0.0566        0.669        0.892
##   279     30       0    0.772  0.0566        0.669        0.892
##   284     29       0    0.772  0.0566        0.669        0.892
##   286     28       0    0.772  0.0566        0.669        0.892
##   293     27       0    0.772  0.0566        0.669        0.892
##   294     25       0    0.772  0.0566        0.669        0.892
##   297     24       0    0.772  0.0566        0.669        0.892
##   303     23       0    0.772  0.0566        0.669        0.892
##   304     22       0    0.772  0.0566        0.669        0.892
##   315     21       0    0.772  0.0566        0.669        0.892
##   318     20       0    0.772  0.0566        0.669        0.892
##   327     18       0    0.772  0.0566        0.669        0.892
##   331     17       0    0.772  0.0566        0.669        0.892
##   334     16       0    0.772  0.0566        0.669        0.892
##   335     15       0    0.772  0.0566        0.669        0.892
##   336     14       0    0.772  0.0566        0.669        0.892
##   337     13       0    0.772  0.0566        0.669        0.892
##   339     12       0    0.772  0.0566        0.669        0.892
##   350     11       0    0.772  0.0566        0.669        0.892
##   360     10       0    0.772  0.0566        0.669        0.892
##   363      9       0    0.772  0.0566        0.669        0.892
##   364      8       0    0.772  0.0566        0.669        0.892
##   371      7       0    0.772  0.0566        0.669        0.892
##   373      6       1    0.643  0.1266        0.438        0.946
##   376      4       0    0.643  0.1266        0.438        0.946
##   382      3       0    0.643  0.1266        0.438        0.946
##   388      2       0    0.643  0.1266        0.438        0.946
# generates median and 95% CI for median
summary(fit.km)$table 
##         records n.max n.start events   *rmean *se(rmean) median 0.95LCL 0.95UCL
## treat=0      65    65      65     30 253.5730   16.93893    304     264      NA
## treat=1      63    63      63     14 331.8794   11.28660     NA     373      NA
# Create the Kaplan Meier plot that estimates 
# the true survival curve. see this link for 
# info on plot options:
# https://cran.r-project.org/web/packages/survminer/vignettes/Informative_Survival_Plots.html
# https://www.r-bloggers.com/survival-analysis-basics/

ggsurvplot(fit.km, data=cgd, ggtheme = theme_minimal(),
           risk.table = TRUE)
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

# Compare survival curves by treatment arm using log-rank test

survdiff(Surv(tstop, status) ~ treat, data=cgd)
## Call:
## survdiff(formula = Surv(tstop, status) ~ treat, data = cgd)
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## treat=0 65       30     18.9      6.48      11.7
## treat=1 63       14     25.1      4.89      11.7
## 
##  Chisq= 11.7  on 1 degrees of freedom, p= 6e-04
# Compare survival curves by treatment arm using Cox PH regression 

mfit.cgd <- coxph(Surv(tstop, status) ~ treat, data=cgd)
summary(mfit.cgd)
## Call:
## coxph(formula = Surv(tstop, status) ~ treat, data = cgd)
## 
##   n= 128, number of events= 44 
## 
##          coef exp(coef) se(coef)      z Pr(>|z|)   
## treat -1.0940    0.3349   0.3348 -3.268  0.00108 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##       exp(coef) exp(-coef) lower .95 upper .95
## treat    0.3349      2.986    0.1737    0.6454
## 
## Concordance= 0.621  (se = 0.036 )
## Likelihood ratio test= 11.8  on 1 df,   p=6e-04
## Wald test            = 10.68  on 1 df,   p=0.001
## Score (logrank) test = 11.74  on 1 df,   p=6e-04
# Answers to Questions!!!!

##############################################
# Question #1:
# What would the hazard ratio and its 95% 
# confidence interval be for the Placebo 
# Arm vs. Gamma Interferon Arm?
##############################################

summary(mfit.cgd)
## Call:
## coxph(formula = Surv(tstop, status) ~ treat, data = cgd)
## 
##   n= 128, number of events= 44 
## 
##          coef exp(coef) se(coef)      z Pr(>|z|)   
## treat -1.0940    0.3349   0.3348 -3.268  0.00108 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##       exp(coef) exp(-coef) lower .95 upper .95
## treat    0.3349      2.986    0.1737    0.6454
## 
## Concordance= 0.621  (se = 0.036 )
## Likelihood ratio test= 11.8  on 1 df,   p=6e-04
## Wald test            = 10.68  on 1 df,   p=0.001
## Score (logrank) test = 11.74  on 1 df,   p=6e-04
# - Output from summary(mfit.cgd)
# > summary(mfit.cgd)
# Call:
# coxph(formula = Surv(tstop, status) ~ treat, data = cgd)

# n= 128, number of events= 44 

#          coef exp(coef) se(coef)      z Pr(>|z|)   
# treat -1.0940    0.3349   0.3348 -3.268  0.00108 **
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#       exp(coef) exp(-coef) lower .95 upper .95
# treat    0.3349      2.986    0.1737    0.6454

# Concordance= 0.621  (se = 0.036 )
# Likelihood ratio test= 11.8  on 1 df,   p=6e-04
# Wald test            = 10.68  on 1 df,   p=0.001
# Score (logrank) test = 11.74  on 1 df,   p=6e-04

# We can hand calculate the 
# HR for placebo vs. gamma as follows:

1/.3349 
## [1] 2.985966
# [1] 2.985966
# Note that while you can calculate this manually as 
# we have done here, this number is also included
# in the default output from the coxph function

# The confidence intervals for placebo vs gamma
# are not part of default output but they are 
# easy to calculate as shown below:
# - 95% CI 

1/.1737
## [1] 5.757052
1/.6454
## [1] 1.549427
# > 1/.1737
# [1] 5.757052
# > 1/.6454
# [1] 1.549427

# So we have the point estimate and
# confidence limits:
# 2.99 [95% CI: 1.55, 5.76]

# these can also be calculated using
# the confint() function as shown below:

exp(-1*confint(mfit.cgd))
##          2.5 %   97.5 %
## treat 5.755713 1.549377
# output yields
#          2.5 %   97.5 %
# treat 5.755713 1.549377
# NOTE that they are not labeled quite 
# right so we need to flip them 
# so lower LCL=1.55 and UCL=5.76

##############################################
# Question #2:
# Fit a Cox PH regression model that examines 
# the association between time to infection and
# sex. 
# Is there evidence of an association?
# What does the reported hazard ratio estimate?
##############################################

mfit2.cgd <- coxph(Surv(tstop, status) ~ sex, data=cgd)
summary(mfit2.cgd)
## Call:
## coxph(formula = Surv(tstop, status) ~ sex, data = cgd)
## 
##   n= 128, number of events= 44 
## 
##       coef exp(coef) se(coef)     z Pr(>|z|)
## sex 0.2127    1.2370   0.4123 0.516    0.606
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## sex     1.237     0.8084    0.5514     2.775
## 
## Concordance= 0.507  (se = 0.031 )
## Likelihood ratio test= 0.28  on 1 df,   p=0.6
## Wald test            = 0.27  on 1 df,   p=0.6
## Score (logrank) test = 0.27  on 1 df,   p=0.6
#       coef exp(coef) se(coef)     z Pr(>|z|)
# sex 0.2127    1.2370   0.4123 0.516    0.606    
# p=0.606 <--- No Association!!!
# the p-value is the key quantity to answer the question
# about statistical significance of the predictor
# Hazard ratio for sex is 1.24 suggesting that
# hazard increases by 24% among males vs females. 
##############################################
# Question #3:
# Based on the model fit in Question 2, what 
# would the Kaplan Meier figure stratified by 
# sex look like for this data set?
##############################################
# - The lack of association would be evidenced by overlapping curves 
# - Confirm by creating the KM plot ... 
mfit.sex <- survfit(Surv(tstop, status) ~ sex, data=cgd)
ggsurvplot(mfit.sex, data=cgd, ggtheme=theme_minimal(),
           risk.table = TRUE)
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

##############################################
# Question #4:
# Fit a Cox PH regression model that examines 
# the association between time to infection 
# and age. Is there evidence of an association?
# What does the reported hazard ratio estimate?
##############################################
mfit.age <- coxph(Surv(tstop, status) ~ age, data=cgd)
summary(mfit.age)
## Call:
## coxph(formula = Surv(tstop, status) ~ age, data = cgd)
## 
##   n= 128, number of events= 44 
## 
##         coef exp(coef) se(coef)      z Pr(>|z|)
## age -0.02121   0.97901  0.01682 -1.261    0.207
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## age     0.979      1.021    0.9473     1.012
## 
## Concordance= 0.57  (se = 0.048 )
## Likelihood ratio test= 1.69  on 1 df,   p=0.2
## Wald test            = 1.59  on 1 df,   p=0.2
## Score (logrank) test = 1.6  on 1 df,   p=0.2
#         coef exp(coef) se(coef)      z Pr(>|z|)
# age -0.02121   0.97901  0.01682 -1.261    0.207   

# p-value = 0.207 <--- No Association
# Hazard ratio for age is 0.979
# Interpretation is that the hazard decreases as 
# age increases.  For a unit increase in age, the 
# hazard will decrease by a multiplicative factor of
# 0.98.  We can also see this by examining that on the 
# log hazard scale the estimate is negative.

##############################################
# Question #5:
# Based on the model fit in Question 4, is it 
# possible to estimate the hazard ratio for a 
# 5 year change in age?  Is there evidence of 
# an association between time to infection and 
# a 5 year change in age?
##############################################
# multiply the raw coefficient by 5 and then
# exponentiate -- can do the same with the CIs
# this rescaling doesn't change statistical 
# significance of the variable in the model!
exp(5*-0.02121)
## [1] 0.8993797
# [1] 0.8993797
# Point Estimate of HR = 0.90
# For a 5 unit increase in age, the hazard 
# will decrease by a multiplicative factor of
# 0.90.
# NOTE that you can also calculate using model object
exp(5*mfit.age$coefficients)
##      age 
## 0.899374
# yields 0.899374
# For CI
exp(5*confint(mfit.age))
##         2.5 % 97.5 %
## age 0.7627288 1.0605
# yields
#         2.5 % 97.5 %
# age 0.7627288 1.0605
#
# HR = 0.90 [95% CI: 0.76, 1.06]
# End of Program
# Making KM plots look pretty -- 
# here are ggsurvplot options
# ggsurvplot(
# fit,                     # survfit object with calculated statistics.
# data = fit,  # data used to fit survival curves. 
#        risk.table = TRUE,       # show risk table.
#        pval = TRUE,             # show p-value of log-rank test.
#        conf.int = TRUE,         # show confidence intervals for 
#                                 # point estimaes of survival curves.
#        xlim = c(0,2000),        # present narrower X axis, but not affect survival estimates.
#        break.time.by = 500,     # break X axis in time intervals by 500.
#        ggtheme = theme_minimal(), # customize plot and risk table with a theme.
#        risk.table.y.text.col = T, # colour risk table text annotations.
#        risk.table.y.text = FALSE # show bars instead of names in text annotations
#                                 # in legend of risk table
#        risk.table.col = "strata", # Change risk table color by groups
#        linetype = "strata", # Change line type by groups
#        surv.median.line = "hv", # Specify median survival