# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Homework 2  ~
# ~   Question 1          ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~

# These data are from a veteran's Administration lung cancer trial.  Males 
# with advanced inoperable lung cancer were randomized to either a standard 
# or test therapy.  The data include information on a number of covariates
# measuring different patient characteristics.

# Data Set: veteran

# Data Dictionary: 
# trt:        randomization status -- 1=standard therapy 2=test therapy
# celltype:   1=squamous, 2=smallcell, 3=adeno, 4=large
# time:       survival time (from start of study to death), in days
# status:     censoring status, 0=patient death not observed (so censored);
#             1=patient death was observed
# karno:      Karnofsky performance score  (quantifies cancer patients' 
#             general well-being and activities of daily life, 0 = Dead 
#             to 100 = Normal)
# diagtime:   months from diagnosis to randomization, in months
# age:        age of the patient in years
# prior:      prior therapy 0=no, 10=yes
# load survival library for analysis functions
library(survival)

# load survminer package for plotting
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
# The dataset, "veteran", is part of the survival package
# we can directly access it because we just loaded the
# survival package above lets just assign it to the data 
# frame name 'v.cancer'

v.cancer <- veteran

# lets change trt into a factor (trt.f) so it is 
# easier to remember which arm is which
# we know to put standard as the first label because it is 1
# and test therapy is 2.
v.cancer$trt.f <- factor(v.cancer$trt, labels = c("standard", "test"))
# examine structure of data
str(v.cancer)
## 'data.frame':    137 obs. of  9 variables:
##  $ trt     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ celltype: Factor w/ 4 levels "squamous","smallcell",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time    : num  72 411 228 126 118 10 82 110 314 100 ...
##  $ status  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ karno   : num  60 70 60 60 70 20 40 80 50 70 ...
##  $ diagtime: num  7 5 3 9 11 5 10 29 18 6 ...
##  $ age     : num  69 64 38 63 65 49 69 68 43 70 ...
##  $ prior   : num  0 10 0 10 10 0 10 0 0 0 ...
##  $ trt.f   : Factor w/ 2 levels "standard","test": 1 1 1 1 1 1 1 1 1 1 ...
# summarize data
summary(v.cancer)
##       trt             celltype       time           status      
##  Min.   :1.000   squamous :35   Min.   :  1.0   Min.   :0.0000  
##  1st Qu.:1.000   smallcell:48   1st Qu.: 25.0   1st Qu.:1.0000  
##  Median :1.000   adeno    :27   Median : 80.0   Median :1.0000  
##  Mean   :1.496   large    :27   Mean   :121.6   Mean   :0.9343  
##  3rd Qu.:2.000                  3rd Qu.:144.0   3rd Qu.:1.0000  
##  Max.   :2.000                  Max.   :999.0   Max.   :1.0000  
##      karno          diagtime           age            prior            trt.f   
##  Min.   :10.00   Min.   : 1.000   Min.   :34.00   Min.   : 0.00   standard:69  
##  1st Qu.:40.00   1st Qu.: 3.000   1st Qu.:51.00   1st Qu.: 0.00   test    :68  
##  Median :60.00   Median : 5.000   Median :62.00   Median : 0.00                
##  Mean   :58.57   Mean   : 8.774   Mean   :58.31   Mean   : 2.92                
##  3rd Qu.:75.00   3rd Qu.:11.000   3rd Qu.:66.00   3rd Qu.:10.00                
##  Max.   :99.00   Max.   :87.000   Max.   :81.00   Max.   :10.00
# Create Kaplan Meier life table curves by treatment arm
fit.km <- survfit(Surv(time,status) ~ trt.f, data=v.cancer)
summary(fit.km,censor=FALSE)
## Call: survfit(formula = Surv(time, status) ~ trt.f, data = v.cancer)
## 
##                 trt.f=standard 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     3     69       1   0.9855  0.0144      0.95771        1.000
##     4     68       1   0.9710  0.0202      0.93223        1.000
##     7     67       1   0.9565  0.0246      0.90959        1.000
##     8     66       2   0.9275  0.0312      0.86834        0.991
##    10     64       2   0.8986  0.0363      0.83006        0.973
##    11     62       1   0.8841  0.0385      0.81165        0.963
##    12     61       2   0.8551  0.0424      0.77592        0.942
##    13     59       1   0.8406  0.0441      0.75849        0.932
##    16     58       1   0.8261  0.0456      0.74132        0.921
##    18     57       2   0.7971  0.0484      0.70764        0.898
##    20     55       1   0.7826  0.0497      0.69109        0.886
##    21     54       1   0.7681  0.0508      0.67472        0.874
##    22     53       1   0.7536  0.0519      0.65851        0.862
##    27     51       1   0.7388  0.0529      0.64208        0.850
##    30     50       1   0.7241  0.0539      0.62580        0.838
##    31     49       1   0.7093  0.0548      0.60967        0.825
##    35     48       1   0.6945  0.0556      0.59368        0.812
##    42     47       1   0.6797  0.0563      0.57782        0.800
##    51     46       1   0.6650  0.0570      0.56209        0.787
##    52     45       1   0.6502  0.0576      0.54649        0.774
##    54     44       2   0.6206  0.0587      0.51565        0.747
##    56     42       1   0.6059  0.0591      0.50040        0.734
##    59     41       1   0.5911  0.0595      0.48526        0.720
##    63     40       1   0.5763  0.0598      0.47023        0.706
##    72     39       1   0.5615  0.0601      0.45530        0.693
##    82     38       1   0.5467  0.0603      0.44049        0.679
##    92     37       1   0.5320  0.0604      0.42577        0.665
##    95     36       1   0.5172  0.0605      0.41116        0.651
##   100     34       1   0.5020  0.0606      0.39615        0.636
##   103     32       1   0.4863  0.0607      0.38070        0.621
##   105     31       1   0.4706  0.0608      0.36537        0.606
##   110     30       1   0.4549  0.0607      0.35018        0.591
##   117     29       2   0.4235  0.0605      0.32017        0.560
##   118     27       1   0.4079  0.0602      0.30537        0.545
##   122     26       1   0.3922  0.0599      0.29069        0.529
##   126     24       1   0.3758  0.0596      0.27542        0.513
##   132     23       1   0.3595  0.0592      0.26031        0.496
##   139     22       1   0.3432  0.0587      0.24535        0.480
##   143     21       1   0.3268  0.0582      0.23057        0.463
##   144     20       1   0.3105  0.0575      0.21595        0.446
##   151     19       1   0.2941  0.0568      0.20151        0.429
##   153     18       1   0.2778  0.0559      0.18725        0.412
##   156     17       1   0.2614  0.0550      0.17317        0.395
##   162     16       2   0.2288  0.0527      0.14563        0.359
##   177     14       1   0.2124  0.0514      0.13218        0.341
##   200     12       1   0.1947  0.0501      0.11761        0.322
##   216     11       1   0.1770  0.0486      0.10340        0.303
##   228     10       1   0.1593  0.0468      0.08956        0.283
##   250      9       1   0.1416  0.0448      0.07614        0.263
##   260      8       1   0.1239  0.0426      0.06318        0.243
##   278      7       1   0.1062  0.0400      0.05076        0.222
##   287      6       1   0.0885  0.0371      0.03896        0.201
##   314      5       1   0.0708  0.0336      0.02793        0.180
##   384      4       1   0.0531  0.0295      0.01788        0.158
##   392      3       1   0.0354  0.0244      0.00917        0.137
##   411      2       1   0.0177  0.0175      0.00256        0.123
##   553      1       1   0.0000     NaN           NA           NA
## 
##                 trt.f=test 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     68       2   0.9706  0.0205      0.93125        1.000
##     2     66       1   0.9559  0.0249      0.90830        1.000
##     7     65       2   0.9265  0.0317      0.86647        0.991
##     8     63       2   0.8971  0.0369      0.82766        0.972
##    13     61       1   0.8824  0.0391      0.80900        0.962
##    15     60       2   0.8529  0.0429      0.77278        0.941
##    18     58       1   0.8382  0.0447      0.75513        0.930
##    19     57       2   0.8088  0.0477      0.72056        0.908
##    20     55       1   0.7941  0.0490      0.70360        0.896
##    21     54       1   0.7794  0.0503      0.68684        0.884
##    24     53       2   0.7500  0.0525      0.65383        0.860
##    25     51       3   0.7059  0.0553      0.60548        0.823
##    29     48       1   0.6912  0.0560      0.58964        0.810
##    30     47       1   0.6765  0.0567      0.57394        0.797
##    31     46       1   0.6618  0.0574      0.55835        0.784
##    33     45       1   0.6471  0.0580      0.54289        0.771
##    36     44       1   0.6324  0.0585      0.52754        0.758
##    43     43       1   0.6176  0.0589      0.51230        0.745
##    44     42       1   0.6029  0.0593      0.49717        0.731
##    45     41       1   0.5882  0.0597      0.48216        0.718
##    48     40       1   0.5735  0.0600      0.46724        0.704
##    49     39       1   0.5588  0.0602      0.45244        0.690
##    51     38       2   0.5294  0.0605      0.42313        0.662
##    52     36       2   0.5000  0.0606      0.39423        0.634
##    53     34       1   0.4853  0.0606      0.37993        0.620
##    61     33       1   0.4706  0.0605      0.36573        0.606
##    73     32       1   0.4559  0.0604      0.35163        0.591
##    80     31       2   0.4265  0.0600      0.32373        0.562
##    84     28       1   0.4112  0.0597      0.30935        0.547
##    87     27       1   0.3960  0.0594      0.29509        0.531
##    90     25       1   0.3802  0.0591      0.28028        0.516
##    95     24       1   0.3643  0.0587      0.26560        0.500
##    99     23       2   0.3326  0.0578      0.23670        0.467
##   111     20       2   0.2994  0.0566      0.20673        0.434
##   112     18       1   0.2827  0.0558      0.19203        0.416
##   133     17       1   0.2661  0.0550      0.17754        0.399
##   140     16       1   0.2495  0.0540      0.16326        0.381
##   164     15       1   0.2329  0.0529      0.14920        0.363
##   186     14       1   0.2162  0.0517      0.13538        0.345
##   201     13       1   0.1996  0.0503      0.12181        0.327
##   231     12       1   0.1830  0.0488      0.10851        0.308
##   242     10       1   0.1647  0.0472      0.09389        0.289
##   283      9       1   0.1464  0.0454      0.07973        0.269
##   340      8       1   0.1281  0.0432      0.06609        0.248
##   357      7       1   0.1098  0.0407      0.05304        0.227
##   378      6       1   0.0915  0.0378      0.04067        0.206
##   389      5       1   0.0732  0.0344      0.02912        0.184
##   467      4       1   0.0549  0.0303      0.01861        0.162
##   587      3       1   0.0366  0.0251      0.00953        0.140
##   991      2       1   0.0183  0.0180      0.00265        0.126
##   999      1       1   0.0000     NaN           NA           NA
summary(fit.km)$table 
##                records n.max n.start events    rmean se(rmean) median 0.95LCL
## trt.f=standard      69    69      69     64 123.9282  14.84352  103.0      59
## trt.f=test          68    68      68     64 142.0613  26.81071   52.5      44
##                0.95UCL
## trt.f=standard     132
## trt.f=test          95
# Create the Kaplan Meier plot that estimates 
# the true survival curve. see this link for 
# info on plot options:
# https://www.r-bloggers.com/survival-analysis-basics/
ggsurvplot(fit.km, data=v.cancer, ggtheme = theme_minimal(),
           xlab="Time in Days",
           break.time.by=50,
           surv.median.line = "hv", # Specify median survival
           ncensor.plot=TRUE,
           risk.table = TRUE
           )

# Compare treatment arms using log-rank test
survdiff(Surv(time,status) ~ trt.f, data=v.cancer)
## Call:
## survdiff(formula = Surv(time, status) ~ trt.f, data = v.cancer)
## 
##                 N Observed Expected (O-E)^2/E (O-E)^2/V
## trt.f=standard 69       64     64.5   0.00388   0.00823
## trt.f=test     68       64     63.5   0.00394   0.00823
## 
##  Chisq= 0  on 1 degrees of freedom, p= 0.9
# Perform a cox proportional hazards regression with 
# randomization arm as the single covariate
mfit <- coxph(Surv(time, status) ~ trt.f, data=v.cancer)

# summary of regression model fit
summary(mfit)
## Call:
## coxph(formula = Surv(time, status) ~ trt.f, data = v.cancer)
## 
##   n= 137, number of events= 128 
## 
##              coef exp(coef) se(coef)     z Pr(>|z|)
## trt.ftest 0.01774   1.01790  0.18066 0.098    0.922
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## trt.ftest     1.018     0.9824    0.7144      1.45
## 
## Concordance= 0.525  (se = 0.026 )
## Likelihood ratio test= 0.01  on 1 df,   p=0.9
## Wald test            = 0.01  on 1 df,   p=0.9
## Score (logrank) test = 0.01  on 1 df,   p=0.9
# Question 2

# A randomized clinical trial was carried 
# out to investigate the benefit of two surgical 
# treatments A and B for patients with multivessel 
# coronary artery disease. A total of 300 patients 
# were randomized. All patients underwent the assigned 
# surgical treatments and then were followed for 
# up to 3 years.  The primary outcome was the total
# number of days spent in the hospital during the 
# follow-up period as it is likely that some patients 
# will develop symptoms which require hospitalizations 
# during the follow-up period. All patients in the 
# study completed follow-up. 

# Data Set: cadrct2

# Data Dictionary: 
# (1) ID         patient id 
# (2) STATUS     severity (mild, moderate, severe, very severe)
# (3) TREAT      binary indicator of treatment arm 
#                    (A = SURG TRT A, B = SURG TRT B)
# (4) DAYS       total number of days spent in the hospital 
#                    during the follow-up period 
#                    
# (5) AGE        age in years
# (6) SEX        set at birth
# (7) EJECFRAC   ejection fraction

# Load packages needed during analysis
# install.packages('MASS')          # For glm.nb() function
library(MASS)
# install.packages('pscl')          # For zeroinfl() function 
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# install.packages("lmtest")        # For lrtest() function
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Download and load the data file = cadrct2.RData
load(url("http://www.duke.edu/~sgrambow/crp241data/cadrct2.RData"))

# inspect data
# examine structure of data
str(cadrct2)
## 'data.frame':    300 obs. of  7 variables:
##  $ ID      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ STATUS  : chr  "Moderate" "Moderate" "Moderate" "Moderate" ...
##  $ TREAT   : chr  "A" "B" "B" "A" ...
##  $ DAYS    : int  14 57 0 1 0 0 4 9 0 4 ...
##  $ AGE     : int  71 51 74 70 69 69 58 55 62 57 ...
##  $ SEX     : chr  "Male" "Male" "Male" "Female" ...
##  $ EJECFRAC: int  69 55 49 61 70 71 65 66 66 70 ...
# summarize data
summary(cadrct2)
##        ID            STATUS             TREAT                DAYS       
##  Min.   :  1.00   Length:300         Length:300         Min.   :  0.00  
##  1st Qu.: 75.75   Class :character   Class :character   1st Qu.:  3.00  
##  Median :150.50   Mode  :character   Mode  :character   Median :  8.00  
##  Mean   :150.50                                         Mean   : 14.74  
##  3rd Qu.:225.25                                         3rd Qu.: 19.00  
##  Max.   :300.00                                         Max.   :192.00  
##                                                                         
##       AGE            SEX               EJECFRAC    
##  Min.   :32.00   Length:300         Min.   :32.00  
##  1st Qu.:55.00   Class :character   1st Qu.:53.25  
##  Median :62.00   Mode  :character   Median :61.00  
##  Mean   :61.61                      Mean   :61.21  
##  3rd Qu.:69.00                      3rd Qu.:70.00  
##  Max.   :80.00                      Max.   :93.00  
##                                     NA's   :6
# examine frequency table by treatment group
table(cadrct2$TREAT,cadrct2$DAYS)
##    
##      0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 23 24 25
##   A 32  1 11  9 12  8  4  3  2  4  5  4  0  5  4  2  3  4  2  2  3  0  3  3  1
##   B 17  1  9  2 10  8  8  9  9  3  5  1  3  7  6  3  0  4  1  3  0  4  4  2  2
##    
##     26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 45 47 48 49 51 53 54 57
##   A  0  0  2  1  0  2  1  1  1  1  0  0  0  0  0  1  1  0  0  0  1  1  0  1  1
##   B  2  3  1  3  1  0  1  2  1  1  1  1  1  1  1  0  0  1  1  1  0  0  1  0  1
##    
##     60 61 62 66 71 73 78 82 139 192
##   A  1  0  1  3  0  1  0  1   0   0
##   B  0  1  0  0  1  0  1  0   1   1
# Data Visualization

# - Visualize the outcome variable
barplot(table(cadrct2$DAYS),
        ylab = 'Frequency',
        xlab = 'Hospital Days During Follow-up')

# - Visualize by Treatment Arm
barplot(table(cadrct2$TREAT,cadrct2$DAYS),
        ylab = 'Frequency',
        xlab = 'Hospital Days During Follow-up',
        legend=c("TRT_A","TRT_B"),     # <--- R orders levels alpha-numerically!
        col=c("darkblue","red"))

# Run poisson model and produce effect estimates
fit_poi <- glm(DAYS ~ TREAT, data=cadrct2, family='poisson')

# effect estimates natively on log scale
summary(fit_poi)
## 
## Call:
## glm(formula = DAYS ~ TREAT, family = "poisson", data = cadrct2)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -5.726  -3.674  -2.093   1.091  24.366  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.56958    0.02267 113.361  < 2e-16 ***
## TREATB       0.22713    0.03030   7.497 6.52e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 5730.9  on 299  degrees of freedom
## Residual deviance: 5674.3  on 298  degrees of freedom
## AIC: 6742.5
## 
## Number of Fisher Scoring iterations: 5
# exponentiate to get rate ratios
exp(fit_poi$coefficients)
## (Intercept)      TREATB 
##   13.060403    1.254994
# explore evidence of over-dispersion

# - To get an idea if over/under-disperson is present, 
#   compute mean and variance of # of office visits  
mean(cadrct2$DAYS)
## [1] 14.73667
var(cadrct2$DAYS)
## [1] 410.3351
# Test over-dispersion by comparing model 
# fits between the standard Poisson and 
# the Negative Binomial. 
# We are testing whether negative binomial 
# is better fit than Poisson

# - Fit a standard Poisson regression model 
# we already have this from above -- fit_poi

# - Fit a Negative Binomial model that accounts for over-dispersion 
nbin_fit <- glm.nb(DAYS ~ TREAT, data=cadrct2)
summary(nbin_fit)
## 
## Call:
## glm.nb(formula = DAYS ~ TREAT, data = cadrct2, init.theta = 0.6469400427, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0572  -0.9119  -0.4718   0.2169   3.2303  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.5696     0.1043  24.626   <2e-16 ***
## TREATB        0.2271     0.1467   1.548    0.122    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.6469) family taken to be 1)
## 
##     Null deviance: 356.96  on 299  degrees of freedom
## Residual deviance: 354.57  on 298  degrees of freedom
## AIC: 2208.7
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.6469 
##           Std. Err.:  0.0552 
## 
##  2 x log-likelihood:  -2202.7320
lrtest(nbin_fit,fit_poi)
## Warning in modelUpdate(objects[[i - 1]], objects[[i]]): original model was of
## class "negbin", updated model is of class "glm"
## Likelihood ratio test
## 
## Model 1: DAYS ~ TREAT
## Model 2: DAYS ~ TREAT
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   3 -1101.4                         
## 2   2 -3369.2 -1 4535.7  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test for Zero-Inflation

# - Fit a Negative Binomial regression model that 
#   accounts for over-dispersion but ignores the excess zeros 
#   this is just nbin_fit above

# - Fit a Zero-Inflated Negative Binomial regression model that 
#   accounts for both over-dispersion and  the excess zeros 
zi_nbin_fit <- zeroinfl(DAYS ~ TREAT, data = cadrct2, dist = "negbin")
summary(zi_nbin_fit)
## 
## Call:
## zeroinfl(formula = DAYS ~ TREAT, data = cadrct2, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8982 -0.6790 -0.4216  0.2526  9.6237 
## 
## Count model coefficients (negbin with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.74179    0.09932  27.604   <2e-16 ***
## TREATB       0.11161    0.13368   0.835    0.404    
## Log(theta)  -0.05670    0.12434  -0.456    0.648    
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.6717     0.2948  -5.671 1.42e-08 ***
## TREATB       -1.1700     0.6287  -1.861   0.0627 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.9449 
## Number of iterations in BFGS optimization: 11 
## Log-likelihood: -1094 on 5 Df
# - Test whether excess of 0's is statistically significant 
# - Use the Raw p-value to make any conclusions about significance
# - the AIC and BIC are corrections sometimes used for comparisons
# - when models are more complex -- here we just have the one predictor
vuong(nbin_fit, zi_nbin_fit)
## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A  p-value
## Raw                  -1.8860021 model2 > model1 0.029647
## AIC-corrected        -1.3776387 model2 > model1 0.084157
## BIC-corrected        -0.4362051 model2 > model1 0.331344
#
# END OF PROGRAM