# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Homework 2 ~
# ~ Question 1 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# These data are from a veteran's Administration lung cancer trial. Males
# with advanced inoperable lung cancer were randomized to either a standard
# or test therapy. The data include information on a number of covariates
# measuring different patient characteristics.
# Data Set: veteran
# Data Dictionary:
# trt: randomization status -- 1=standard therapy 2=test therapy
# celltype: 1=squamous, 2=smallcell, 3=adeno, 4=large
# time: survival time (from start of study to death), in days
# status: censoring status, 0=patient death not observed (so censored);
# 1=patient death was observed
# karno: Karnofsky performance score (quantifies cancer patients'
# general well-being and activities of daily life, 0 = Dead
# to 100 = Normal)
# diagtime: months from diagnosis to randomization, in months
# age: age of the patient in years
# prior: prior therapy 0=no, 10=yes
# load survival library for analysis functions
library(survival)
# load survminer package for plotting
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
# The dataset, "veteran", is part of the survival package
# we can directly access it because we just loaded the
# survival package above lets just assign it to the data
# frame name 'v.cancer'
v.cancer <- veteran
# lets change trt into a factor (trt.f) so it is
# easier to remember which arm is which
# we know to put standard as the first label because it is 1
# and test therapy is 2.
v.cancer$trt.f <- factor(v.cancer$trt, labels = c("standard", "test"))
# examine structure of data
str(v.cancer)
## 'data.frame': 137 obs. of 9 variables:
## $ trt : num 1 1 1 1 1 1 1 1 1 1 ...
## $ celltype: Factor w/ 4 levels "squamous","smallcell",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : num 72 411 228 126 118 10 82 110 314 100 ...
## $ status : num 1 1 1 1 1 1 1 1 1 0 ...
## $ karno : num 60 70 60 60 70 20 40 80 50 70 ...
## $ diagtime: num 7 5 3 9 11 5 10 29 18 6 ...
## $ age : num 69 64 38 63 65 49 69 68 43 70 ...
## $ prior : num 0 10 0 10 10 0 10 0 0 0 ...
## $ trt.f : Factor w/ 2 levels "standard","test": 1 1 1 1 1 1 1 1 1 1 ...
# summarize data
summary(v.cancer)
## trt celltype time status
## Min. :1.000 squamous :35 Min. : 1.0 Min. :0.0000
## 1st Qu.:1.000 smallcell:48 1st Qu.: 25.0 1st Qu.:1.0000
## Median :1.000 adeno :27 Median : 80.0 Median :1.0000
## Mean :1.496 large :27 Mean :121.6 Mean :0.9343
## 3rd Qu.:2.000 3rd Qu.:144.0 3rd Qu.:1.0000
## Max. :2.000 Max. :999.0 Max. :1.0000
## karno diagtime age prior trt.f
## Min. :10.00 Min. : 1.000 Min. :34.00 Min. : 0.00 standard:69
## 1st Qu.:40.00 1st Qu.: 3.000 1st Qu.:51.00 1st Qu.: 0.00 test :68
## Median :60.00 Median : 5.000 Median :62.00 Median : 0.00
## Mean :58.57 Mean : 8.774 Mean :58.31 Mean : 2.92
## 3rd Qu.:75.00 3rd Qu.:11.000 3rd Qu.:66.00 3rd Qu.:10.00
## Max. :99.00 Max. :87.000 Max. :81.00 Max. :10.00
# Create Kaplan Meier life table curves by treatment arm
fit.km <- survfit(Surv(time,status) ~ trt.f, data=v.cancer)
summary(fit.km,censor=FALSE)
## Call: survfit(formula = Surv(time, status) ~ trt.f, data = v.cancer)
##
## trt.f=standard
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 3 69 1 0.9855 0.0144 0.95771 1.000
## 4 68 1 0.9710 0.0202 0.93223 1.000
## 7 67 1 0.9565 0.0246 0.90959 1.000
## 8 66 2 0.9275 0.0312 0.86834 0.991
## 10 64 2 0.8986 0.0363 0.83006 0.973
## 11 62 1 0.8841 0.0385 0.81165 0.963
## 12 61 2 0.8551 0.0424 0.77592 0.942
## 13 59 1 0.8406 0.0441 0.75849 0.932
## 16 58 1 0.8261 0.0456 0.74132 0.921
## 18 57 2 0.7971 0.0484 0.70764 0.898
## 20 55 1 0.7826 0.0497 0.69109 0.886
## 21 54 1 0.7681 0.0508 0.67472 0.874
## 22 53 1 0.7536 0.0519 0.65851 0.862
## 27 51 1 0.7388 0.0529 0.64208 0.850
## 30 50 1 0.7241 0.0539 0.62580 0.838
## 31 49 1 0.7093 0.0548 0.60967 0.825
## 35 48 1 0.6945 0.0556 0.59368 0.812
## 42 47 1 0.6797 0.0563 0.57782 0.800
## 51 46 1 0.6650 0.0570 0.56209 0.787
## 52 45 1 0.6502 0.0576 0.54649 0.774
## 54 44 2 0.6206 0.0587 0.51565 0.747
## 56 42 1 0.6059 0.0591 0.50040 0.734
## 59 41 1 0.5911 0.0595 0.48526 0.720
## 63 40 1 0.5763 0.0598 0.47023 0.706
## 72 39 1 0.5615 0.0601 0.45530 0.693
## 82 38 1 0.5467 0.0603 0.44049 0.679
## 92 37 1 0.5320 0.0604 0.42577 0.665
## 95 36 1 0.5172 0.0605 0.41116 0.651
## 100 34 1 0.5020 0.0606 0.39615 0.636
## 103 32 1 0.4863 0.0607 0.38070 0.621
## 105 31 1 0.4706 0.0608 0.36537 0.606
## 110 30 1 0.4549 0.0607 0.35018 0.591
## 117 29 2 0.4235 0.0605 0.32017 0.560
## 118 27 1 0.4079 0.0602 0.30537 0.545
## 122 26 1 0.3922 0.0599 0.29069 0.529
## 126 24 1 0.3758 0.0596 0.27542 0.513
## 132 23 1 0.3595 0.0592 0.26031 0.496
## 139 22 1 0.3432 0.0587 0.24535 0.480
## 143 21 1 0.3268 0.0582 0.23057 0.463
## 144 20 1 0.3105 0.0575 0.21595 0.446
## 151 19 1 0.2941 0.0568 0.20151 0.429
## 153 18 1 0.2778 0.0559 0.18725 0.412
## 156 17 1 0.2614 0.0550 0.17317 0.395
## 162 16 2 0.2288 0.0527 0.14563 0.359
## 177 14 1 0.2124 0.0514 0.13218 0.341
## 200 12 1 0.1947 0.0501 0.11761 0.322
## 216 11 1 0.1770 0.0486 0.10340 0.303
## 228 10 1 0.1593 0.0468 0.08956 0.283
## 250 9 1 0.1416 0.0448 0.07614 0.263
## 260 8 1 0.1239 0.0426 0.06318 0.243
## 278 7 1 0.1062 0.0400 0.05076 0.222
## 287 6 1 0.0885 0.0371 0.03896 0.201
## 314 5 1 0.0708 0.0336 0.02793 0.180
## 384 4 1 0.0531 0.0295 0.01788 0.158
## 392 3 1 0.0354 0.0244 0.00917 0.137
## 411 2 1 0.0177 0.0175 0.00256 0.123
## 553 1 1 0.0000 NaN NA NA
##
## trt.f=test
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 68 2 0.9706 0.0205 0.93125 1.000
## 2 66 1 0.9559 0.0249 0.90830 1.000
## 7 65 2 0.9265 0.0317 0.86647 0.991
## 8 63 2 0.8971 0.0369 0.82766 0.972
## 13 61 1 0.8824 0.0391 0.80900 0.962
## 15 60 2 0.8529 0.0429 0.77278 0.941
## 18 58 1 0.8382 0.0447 0.75513 0.930
## 19 57 2 0.8088 0.0477 0.72056 0.908
## 20 55 1 0.7941 0.0490 0.70360 0.896
## 21 54 1 0.7794 0.0503 0.68684 0.884
## 24 53 2 0.7500 0.0525 0.65383 0.860
## 25 51 3 0.7059 0.0553 0.60548 0.823
## 29 48 1 0.6912 0.0560 0.58964 0.810
## 30 47 1 0.6765 0.0567 0.57394 0.797
## 31 46 1 0.6618 0.0574 0.55835 0.784
## 33 45 1 0.6471 0.0580 0.54289 0.771
## 36 44 1 0.6324 0.0585 0.52754 0.758
## 43 43 1 0.6176 0.0589 0.51230 0.745
## 44 42 1 0.6029 0.0593 0.49717 0.731
## 45 41 1 0.5882 0.0597 0.48216 0.718
## 48 40 1 0.5735 0.0600 0.46724 0.704
## 49 39 1 0.5588 0.0602 0.45244 0.690
## 51 38 2 0.5294 0.0605 0.42313 0.662
## 52 36 2 0.5000 0.0606 0.39423 0.634
## 53 34 1 0.4853 0.0606 0.37993 0.620
## 61 33 1 0.4706 0.0605 0.36573 0.606
## 73 32 1 0.4559 0.0604 0.35163 0.591
## 80 31 2 0.4265 0.0600 0.32373 0.562
## 84 28 1 0.4112 0.0597 0.30935 0.547
## 87 27 1 0.3960 0.0594 0.29509 0.531
## 90 25 1 0.3802 0.0591 0.28028 0.516
## 95 24 1 0.3643 0.0587 0.26560 0.500
## 99 23 2 0.3326 0.0578 0.23670 0.467
## 111 20 2 0.2994 0.0566 0.20673 0.434
## 112 18 1 0.2827 0.0558 0.19203 0.416
## 133 17 1 0.2661 0.0550 0.17754 0.399
## 140 16 1 0.2495 0.0540 0.16326 0.381
## 164 15 1 0.2329 0.0529 0.14920 0.363
## 186 14 1 0.2162 0.0517 0.13538 0.345
## 201 13 1 0.1996 0.0503 0.12181 0.327
## 231 12 1 0.1830 0.0488 0.10851 0.308
## 242 10 1 0.1647 0.0472 0.09389 0.289
## 283 9 1 0.1464 0.0454 0.07973 0.269
## 340 8 1 0.1281 0.0432 0.06609 0.248
## 357 7 1 0.1098 0.0407 0.05304 0.227
## 378 6 1 0.0915 0.0378 0.04067 0.206
## 389 5 1 0.0732 0.0344 0.02912 0.184
## 467 4 1 0.0549 0.0303 0.01861 0.162
## 587 3 1 0.0366 0.0251 0.00953 0.140
## 991 2 1 0.0183 0.0180 0.00265 0.126
## 999 1 1 0.0000 NaN NA NA
summary(fit.km)$table
## records n.max n.start events rmean se(rmean) median 0.95LCL
## trt.f=standard 69 69 69 64 123.9282 14.84352 103.0 59
## trt.f=test 68 68 68 64 142.0613 26.81071 52.5 44
## 0.95UCL
## trt.f=standard 132
## trt.f=test 95
# Create the Kaplan Meier plot that estimates
# the true survival curve. see this link for
# info on plot options:
# https://www.r-bloggers.com/survival-analysis-basics/
ggsurvplot(fit.km, data=v.cancer, ggtheme = theme_minimal(),
xlab="Time in Days",
break.time.by=50,
surv.median.line = "hv", # Specify median survival
ncensor.plot=TRUE,
risk.table = TRUE
)

# Compare treatment arms using log-rank test
survdiff(Surv(time,status) ~ trt.f, data=v.cancer)
## Call:
## survdiff(formula = Surv(time, status) ~ trt.f, data = v.cancer)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## trt.f=standard 69 64 64.5 0.00388 0.00823
## trt.f=test 68 64 63.5 0.00394 0.00823
##
## Chisq= 0 on 1 degrees of freedom, p= 0.9
# Perform a cox proportional hazards regression with
# randomization arm as the single covariate
mfit <- coxph(Surv(time, status) ~ trt.f, data=v.cancer)
# summary of regression model fit
summary(mfit)
## Call:
## coxph(formula = Surv(time, status) ~ trt.f, data = v.cancer)
##
## n= 137, number of events= 128
##
## coef exp(coef) se(coef) z Pr(>|z|)
## trt.ftest 0.01774 1.01790 0.18066 0.098 0.922
##
## exp(coef) exp(-coef) lower .95 upper .95
## trt.ftest 1.018 0.9824 0.7144 1.45
##
## Concordance= 0.525 (se = 0.026 )
## Likelihood ratio test= 0.01 on 1 df, p=0.9
## Wald test = 0.01 on 1 df, p=0.9
## Score (logrank) test = 0.01 on 1 df, p=0.9
# Question 2
# A randomized clinical trial was carried
# out to investigate the benefit of two surgical
# treatments A and B for patients with multivessel
# coronary artery disease. A total of 300 patients
# were randomized. All patients underwent the assigned
# surgical treatments and then were followed for
# up to 3 years. The primary outcome was the total
# number of days spent in the hospital during the
# follow-up period as it is likely that some patients
# will develop symptoms which require hospitalizations
# during the follow-up period. All patients in the
# study completed follow-up.
# Data Set: cadrct2
# Data Dictionary:
# (1) ID patient id
# (2) STATUS severity (mild, moderate, severe, very severe)
# (3) TREAT binary indicator of treatment arm
# (A = SURG TRT A, B = SURG TRT B)
# (4) DAYS total number of days spent in the hospital
# during the follow-up period
#
# (5) AGE age in years
# (6) SEX set at birth
# (7) EJECFRAC ejection fraction
# Load packages needed during analysis
# install.packages('MASS') # For glm.nb() function
library(MASS)
# install.packages('pscl') # For zeroinfl() function
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# install.packages("lmtest") # For lrtest() function
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Download and load the data file = cadrct2.RData
load(url("http://www.duke.edu/~sgrambow/crp241data/cadrct2.RData"))
# inspect data
# examine structure of data
str(cadrct2)
## 'data.frame': 300 obs. of 7 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ STATUS : chr "Moderate" "Moderate" "Moderate" "Moderate" ...
## $ TREAT : chr "A" "B" "B" "A" ...
## $ DAYS : int 14 57 0 1 0 0 4 9 0 4 ...
## $ AGE : int 71 51 74 70 69 69 58 55 62 57 ...
## $ SEX : chr "Male" "Male" "Male" "Female" ...
## $ EJECFRAC: int 69 55 49 61 70 71 65 66 66 70 ...
# summarize data
summary(cadrct2)
## ID STATUS TREAT DAYS
## Min. : 1.00 Length:300 Length:300 Min. : 0.00
## 1st Qu.: 75.75 Class :character Class :character 1st Qu.: 3.00
## Median :150.50 Mode :character Mode :character Median : 8.00
## Mean :150.50 Mean : 14.74
## 3rd Qu.:225.25 3rd Qu.: 19.00
## Max. :300.00 Max. :192.00
##
## AGE SEX EJECFRAC
## Min. :32.00 Length:300 Min. :32.00
## 1st Qu.:55.00 Class :character 1st Qu.:53.25
## Median :62.00 Mode :character Median :61.00
## Mean :61.61 Mean :61.21
## 3rd Qu.:69.00 3rd Qu.:70.00
## Max. :80.00 Max. :93.00
## NA's :6
# examine frequency table by treatment group
table(cadrct2$TREAT,cadrct2$DAYS)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 23 24 25
## A 32 1 11 9 12 8 4 3 2 4 5 4 0 5 4 2 3 4 2 2 3 0 3 3 1
## B 17 1 9 2 10 8 8 9 9 3 5 1 3 7 6 3 0 4 1 3 0 4 4 2 2
##
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 45 47 48 49 51 53 54 57
## A 0 0 2 1 0 2 1 1 1 1 0 0 0 0 0 1 1 0 0 0 1 1 0 1 1
## B 2 3 1 3 1 0 1 2 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 0 1
##
## 60 61 62 66 71 73 78 82 139 192
## A 1 0 1 3 0 1 0 1 0 0
## B 0 1 0 0 1 0 1 0 1 1
# Data Visualization
# - Visualize the outcome variable
barplot(table(cadrct2$DAYS),
ylab = 'Frequency',
xlab = 'Hospital Days During Follow-up')

# - Visualize by Treatment Arm
barplot(table(cadrct2$TREAT,cadrct2$DAYS),
ylab = 'Frequency',
xlab = 'Hospital Days During Follow-up',
legend=c("TRT_A","TRT_B"), # <--- R orders levels alpha-numerically!
col=c("darkblue","red"))

# Run poisson model and produce effect estimates
fit_poi <- glm(DAYS ~ TREAT, data=cadrct2, family='poisson')
# effect estimates natively on log scale
summary(fit_poi)
##
## Call:
## glm(formula = DAYS ~ TREAT, family = "poisson", data = cadrct2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.726 -3.674 -2.093 1.091 24.366
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.56958 0.02267 113.361 < 2e-16 ***
## TREATB 0.22713 0.03030 7.497 6.52e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 5730.9 on 299 degrees of freedom
## Residual deviance: 5674.3 on 298 degrees of freedom
## AIC: 6742.5
##
## Number of Fisher Scoring iterations: 5
# exponentiate to get rate ratios
exp(fit_poi$coefficients)
## (Intercept) TREATB
## 13.060403 1.254994
# explore evidence of over-dispersion
# - To get an idea if over/under-disperson is present,
# compute mean and variance of # of office visits
mean(cadrct2$DAYS)
## [1] 14.73667
var(cadrct2$DAYS)
## [1] 410.3351
# Test over-dispersion by comparing model
# fits between the standard Poisson and
# the Negative Binomial.
# We are testing whether negative binomial
# is better fit than Poisson
# - Fit a standard Poisson regression model
# we already have this from above -- fit_poi
# - Fit a Negative Binomial model that accounts for over-dispersion
nbin_fit <- glm.nb(DAYS ~ TREAT, data=cadrct2)
summary(nbin_fit)
##
## Call:
## glm.nb(formula = DAYS ~ TREAT, data = cadrct2, init.theta = 0.6469400427,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0572 -0.9119 -0.4718 0.2169 3.2303
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.5696 0.1043 24.626 <2e-16 ***
## TREATB 0.2271 0.1467 1.548 0.122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.6469) family taken to be 1)
##
## Null deviance: 356.96 on 299 degrees of freedom
## Residual deviance: 354.57 on 298 degrees of freedom
## AIC: 2208.7
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.6469
## Std. Err.: 0.0552
##
## 2 x log-likelihood: -2202.7320
lrtest(nbin_fit,fit_poi)
## Warning in modelUpdate(objects[[i - 1]], objects[[i]]): original model was of
## class "negbin", updated model is of class "glm"
## Likelihood ratio test
##
## Model 1: DAYS ~ TREAT
## Model 2: DAYS ~ TREAT
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 3 -1101.4
## 2 2 -3369.2 -1 4535.7 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test for Zero-Inflation
# - Fit a Negative Binomial regression model that
# accounts for over-dispersion but ignores the excess zeros
# this is just nbin_fit above
# - Fit a Zero-Inflated Negative Binomial regression model that
# accounts for both over-dispersion and the excess zeros
zi_nbin_fit <- zeroinfl(DAYS ~ TREAT, data = cadrct2, dist = "negbin")
summary(zi_nbin_fit)
##
## Call:
## zeroinfl(formula = DAYS ~ TREAT, data = cadrct2, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.8982 -0.6790 -0.4216 0.2526 9.6237
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.74179 0.09932 27.604 <2e-16 ***
## TREATB 0.11161 0.13368 0.835 0.404
## Log(theta) -0.05670 0.12434 -0.456 0.648
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6717 0.2948 -5.671 1.42e-08 ***
## TREATB -1.1700 0.6287 -1.861 0.0627 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 0.9449
## Number of iterations in BFGS optimization: 11
## Log-likelihood: -1094 on 5 Df
# - Test whether excess of 0's is statistically significant
# - Use the Raw p-value to make any conclusions about significance
# - the AIC and BIC are corrections sometimes used for comparisons
# - when models are more complex -- here we just have the one predictor
vuong(nbin_fit, zi_nbin_fit)
## Vuong Non-Nested Hypothesis Test-Statistic:
## (test-statistic is asymptotically distributed N(0,1) under the
## null that the models are indistinguishible)
## -------------------------------------------------------------
## Vuong z-statistic H_A p-value
## Raw -1.8860021 model2 > model1 0.029647
## AIC-corrected -1.3776387 model2 > model1 0.084157
## BIC-corrected -0.4362051 model2 > model1 0.331344
#
# END OF PROGRAM