Load libraries

rm(list=ls())
knitr::opts_chunk$set(echo = TRUE)
set.seed(10) 
require(boot)
## Loading required package: boot
require(pscl)
## Loading required package: pscl
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
require(ggplot2)
## Loading required package: ggplot2
require(VGAM)
## Loading required package: VGAM
## Loading required package: stats4
## Loading required package: splines
## 
## Attaching package: 'VGAM'
## The following objects are masked from 'package:boot':
## 
##     logit, simplex
require(lmtest)
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'lmtest'
## The following object is masked from 'package:VGAM':
## 
##     lrtest

Prepare data

file <- "sample_person.csv"
data <- read.csv(file,header=TRUE)
rownames(data) <- data$ID
data <- data[,-1]
data$htn <- as.factor(data$htn)
data$diabetes <- as.factor(data$diabetes)
summary(data) # data looks OK
##  htn     diabetes      age              LOS       
##  0:807   0:695    Min.   : 85.00   Min.   : 0.00  
##  1:193   1:305    1st Qu.: 89.00   1st Qu.: 0.00  
##                   Median : 94.00   Median : 0.00  
##                   Mean   : 93.86   Mean   : 2.23  
##                   3rd Qu.: 98.00   3rd Qu.: 0.00  
##                   Max.   :103.00   Max.   :22.00
data$admit <- data$LOS > 0 # for convenience, create a flag for admitted (LOS > 0)
data$age.c <- data$age - mean(data$age) # center age for more interpretable modeling
admitted <- subset(data,data$admit==TRUE) # for convenience, create a subset of data for admitted patients

Explore distribution of length of stay

Histogram of length of stay suggests two populations:

Thus, this distribution has two features:

Conceptually, this might differ from a zero-inflated distribution where zero values might be observed in the ‘count’ population. One example that I read considered a population of people who either fished or did not fish - of those who did fish, some would still catch zero. The attribute of having fished or not is not observed. As I understand this data, I think that a hurdle model will be more appropriate than a zero inflated model (the two differ on whether zero-count observations are observed to possibly be part of the count distribution).

The binomial distribution for admission has probability of 20%.

The poisson distribution for length of stay given admission has expected value 11 and variance 11.5, corresponding to lambda = 11. Given that mean = variance, it will be appropriate to use a truncated poisson distribution (data are not over-dispersed). There is no logical offset for the distribution.

hist(data$LOS)

hist(admitted$LOS)

Explore bivariate associations

Graphs may be used to identify potential associations between the predictor variables and the two response conditions (admitted vs. not admitted, and length of stay for admitted patients).

Observations:

mosaicplot(table(data$admit,data$htn))

mosaicplot(table(data$admit,data$diabetes))

boxplot(age~admit,data=data)

boxplot(LOS~htn,data=subset(data,data$admit==TRUE))

boxplot(LOS~diabetes,data=subset(data,data$admit==TRUE))

boxplot(LOS~age,data=subset(data,data$admit==TRUE))

plot(LOS~age,data=subset(data,data$admit==TRUE))

cor(admitted$LOS,admitted$age)
## [1] 0.0790466

Create models

A few models are generated for comparison and for learning:

Models are then compared using the Vuong statistic (for non-nested models) or likelihood ratio test (nested). There is no significant difference between zero inflated and hurdle models, but the hurdle model remains the correct choice given model assumptions. Adding age to the admission component improves model fit. Fit did not improve after adding hypertension or diabetes to the count component.

bi1 <- glm(admit~age.c + htn + diabetes, family = "binomial",data=data)
po1 <- glm(LOS ~ age.c, data=data, family = "poisson")
po2 <- vglm(LOS ~ age.c, data=admitted, family = pospoisson()) 
zi1 <- zeroinfl(LOS ~ age.c | htn + diabetes, data=data, dist="poisson") 
hu1 <- hurdle(LOS ~ age.c | htn + diabetes,data=data,dist="poisson")
hu2 <- hurdle(LOS ~ age.c | htn + diabetes + age.c, data=data,dist="poisson")
hu3 <- hurdle(LOS ~ age.c + htn | htn + diabetes + age.c, data=data,dist="poisson")
hu4 <- hurdle(LOS ~ age.c + diabetes | htn + diabetes + age.c, data=data,dist="poisson")
vuong(po1,zi1) # Zero inflated model significantly better fit vs. poisson 
## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A    p-value
## Raw                   -29.35885 model2 > model1 < 2.22e-16
## AIC-corrected         -29.33000 model2 > model1 < 2.22e-16
## BIC-corrected         -29.25921 model2 > model1 < 2.22e-16
vuong(po1,hu1) # Hurdle model significantly better fit vs poisson
## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A    p-value
## Raw                   -29.35883 model2 > model1 < 2.22e-16
## AIC-corrected         -29.32998 model2 > model1 < 2.22e-16
## BIC-corrected         -29.25918 model2 > model1 < 2.22e-16
vuong(zi1,hu1) # No significant difference in model fit for hurdle vs. zero inflated
## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A p-value
## Raw                  -0.1649731 model2 > model1 0.43448
## AIC-corrected        -0.1649731 model2 > model1 0.43448
## BIC-corrected        -0.1649731 model2 > model1 0.43448
lmtest::lrtest(hu1,hu2) # Model fit significantly improved for model hu2 (including age as predictor for length of stay)
## Likelihood ratio test
## 
## Model 1: LOS ~ age.c | htn + diabetes
## Model 2: LOS ~ age.c | htn + diabetes + age.c
##   #Df  LogLik Df  Chisq Pr(>Chisq)  
## 1   5 -1023.8                       
## 2   6 -1021.5  1 4.6566    0.03093 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lmtest::lrtest(hu2,hu3)
## Likelihood ratio test
## 
## Model 1: LOS ~ age.c | htn + diabetes + age.c
## Model 2: LOS ~ age.c + htn | htn + diabetes + age.c
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   6 -1021.5                     
## 2   7 -1021.3  1 0.3952     0.5296
lmtest::lrtest(hu2,hu4)
## Likelihood ratio test
## 
## Model 1: LOS ~ age.c | htn + diabetes + age.c
## Model 2: LOS ~ age.c + diabetes | htn + diabetes + age.c
##   #Df  LogLik Df Chisq Pr(>Chisq)
## 1   6 -1021.5                    
## 2   7 -1021.5  1 0.003     0.9562

Bootstrap sampling for confidence intervals

Finally, as an adjunct to the work done up to this point I used a bootstrap analysis to estimate confidence intervals for the effect size. These were not appreciably different from the confidence intervals estimated from the observed data.

# Reference https://stats.idre.ucla.edu/r/dae/zip/
f <- function(data, i) {
  require(pscl) 
  m <- hurdle(LOS ~ age.c | htn + diabetes + age.c, data = data[i, ],
    start = list(count = c(2.40, 0.0047), zero = c(-1.64,0.37, 0.52,-0.033)))
  as.vector(t(do.call(rbind, coef(summary(m)))[, 1:2]))
} # define function for the bootstrap call - define model, provide starting values = modeled values (to reduce computational load), bind the observation and standard error
res <- boot(data, f, R = 1000, parallel = "snow", ncpus = 2) # Currently on a two-core machine

## basic parameter estimates with percentile and bias adjusted CIs
expparms <- t(sapply(c(1, 3, 5, 7, 9, 11), function(i) {
  out <- boot.ci(res, index = c(i, i + 1), type = c("perc", "bca"), h = exp)
  with(out, c(Est = t0, pLL = percent[4], pUL = percent[5],
     bcaLL = bca[4], bcaUL = bca[5]))
})) # Corrected label for upper limit and added an additional index to the expparms call

row.names(expparms) <- names(coef(hu2)) 
expparms
##                          Est        pLL        pUL      bcaLL      bcaUL
## count_(Intercept) 11.0776342 10.6258873 11.5624793 10.6289923 11.5647610
## count_age.c        1.0046623  0.9964441  1.0129406  0.9968199  1.0133725
## zero_(Intercept)   0.1938479  0.1526857  0.2367038  0.1527897  0.2367430
## zero_htn1          1.4544520  0.9836889  2.1220137  0.9879981  2.1357848
## zero_diabetes1     1.6896894  1.2019115  2.3238459  1.1927634  2.3130037
## zero_age.c         0.9675175  0.9378520  0.9974358  0.9374202  0.9969964
exp(confint(hu2))
##                        2.5 %     97.5 %
## count_(Intercept) 10.6234677 11.5512041
## count_age.c        0.9966698  1.0127187
## zero_(Intercept)   0.1561533  0.2406418
## zero_htn1          1.0022192  2.1107465
## zero_diabetes1     1.2230687  2.3343337
## zero_age.c         0.9388184  0.9970938
exp(coef(hu2))
## count_(Intercept)       count_age.c  zero_(Intercept)         zero_htn1 
##        11.0776281         1.0046622         0.1938479         1.4544520 
##    zero_diabetes1        zero_age.c 
##         1.6896894         0.9675175