1.Data preparation

Prepare data
file <- "sample_person.csv"
data <- read.csv(file,header=TRUE)
rownames(data) <- data$ID
data <- data[,-1]
data$htn <- as.factor(data$htn)
data$diabetes <- as.factor(data$diabetes)
summary(data) # data looks OK
##  htn     diabetes      age              LOS       
##  0:807   0:695    Min.   : 85.00   Min.   : 0.00  
##  1:193   1:305    1st Qu.: 89.00   1st Qu.: 0.00  
##                   Median : 94.00   Median : 0.00  
##                   Mean   : 93.86   Mean   : 2.23  
##                   3rd Qu.: 98.00   3rd Qu.: 0.00  
##                   Max.   :103.00   Max.   :22.00
data$admit <- data$LOS > 0 # for convenience, create a flag for admitted (LOS > 0)
data$age.c <- data$age - mean(data$age) # center age for more interpretable modeling
admitted <- subset(data,data$admit==TRUE) # for convenience, create a subset of data for admitted patients

2. Create models

A few models are generated for comparison and for learning:

Binomial model for admission, including centered age, hypertension and diabetes
Poisson distribution for length of stay including centered age
Truncated poisson distribution for length of stay including centered age, restricted to admitted patients only (data=admitted)
Zero inflation model for length of stay including centered age for count component, and hypertension and diabetes for admission component
Hurdle model for length of stay, same predictor variables
Hurdle model for length of stay, adding age to the admission component
Models are then compared using the Vuong statistic (for non-nested models) or likelihood ratio test (nested). There is no significant difference between zero inflated and hurdle models, but the hurdle model remains the correct choice given model assumptions. Adding age to the admission component improves model fit. Fit did not improve after adding hypertension or diabetes to the count component.

3. HURDLE Model

bi1 <- glm(admit~age.c + htn + diabetes, family = "binomial",data=data)
po1 <- glm(LOS ~ age.c, data=data, family = "poisson")
po2 <- vglm(LOS ~ age.c, data=admitted, family = pospoisson()) 
zi1 <- zeroinfl(LOS ~ age.c | htn + diabetes, data=data, dist="poisson") 
hu1 <- hurdle(LOS ~ age.c | htn + diabetes,data=data,dist="poisson")
hu2 <- hurdle(LOS ~ age.c | htn + diabetes + age.c, data=data,dist="poisson")
hu3 <- hurdle(LOS ~ age.c + htn | htn + diabetes + age.c, data=data,dist="poisson")
hu4 <- hurdle(LOS ~ age.c + diabetes | htn + diabetes + age.c, data=data,dist="poisson")
vuong(po1,zi1) # Zero inflated model significantly better fit vs. poisson 
## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A    p-value
## Raw                   -29.35885 model2 > model1 < 2.22e-16
## AIC-corrected         -29.33000 model2 > model1 < 2.22e-16
## BIC-corrected         -29.25921 model2 > model1 < 2.22e-16
vuong(po1,hu1) # Hurdle model significantly better fit vs poisson
## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A    p-value
## Raw                   -29.35883 model2 > model1 < 2.22e-16
## AIC-corrected         -29.32998 model2 > model1 < 2.22e-16
## BIC-corrected         -29.25918 model2 > model1 < 2.22e-16
vuong(zi1,hu1) # No significant difference in model fit for hurdle vs. zero inflated
## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A p-value
## Raw                  -0.1649731 model2 > model1 0.43448
## AIC-corrected        -0.1649731 model2 > model1 0.43448
## BIC-corrected        -0.1649731 model2 > model1 0.43448
lmtest::lrtest(hu1,hu2) # Model fit significantly improved for model hu2 (including age as predictor for length of stay)
## Likelihood ratio test
## 
## Model 1: LOS ~ age.c | htn + diabetes
## Model 2: LOS ~ age.c | htn + diabetes + age.c
##   #Df  LogLik Df  Chisq Pr(>Chisq)  
## 1   5 -1023.8                       
## 2   6 -1021.5  1 4.6566    0.03093 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lmtest::lrtest(hu2,hu3)
## Likelihood ratio test
## 
## Model 1: LOS ~ age.c | htn + diabetes + age.c
## Model 2: LOS ~ age.c + htn | htn + diabetes + age.c
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   6 -1021.5                     
## 2   7 -1021.3  1 0.3952     0.5296
lmtest::lrtest(hu2,hu4)
## Likelihood ratio test
## 
## Model 1: LOS ~ age.c | htn + diabetes + age.c
## Model 2: LOS ~ age.c + diabetes | htn + diabetes + age.c
##   #Df  LogLik Df Chisq Pr(>Chisq)
## 1   6 -1021.5                    
## 2   7 -1021.5  1 0.003     0.9562

4.Analysis

Bootstrap sampling for confidence intervals
Finally, as an adjunct to the work done up to this point I used a bootstrap analysis to estimate confidence intervals for the effect size. These were not appreciably different from the confidence intervals estimated from the observed data.

# Reference https://stats.idre.ucla.edu/r/dae/zip/
f <- function(data, i) {
  require(pscl) 
  m <- hurdle(LOS ~ age.c | htn + diabetes + age.c, data = data[i, ],
    start = list(count = c(2.40, 0.0047), zero = c(-1.64,0.37, 0.52,-0.033)))
  as.vector(t(do.call(rbind, coef(summary(m)))[, 1:2]))
} # define function for the bootstrap call - define model, provide starting values = modeled values (to reduce computational load), bind the observation and standard error
res <- boot(data, f, R = 1000, parallel = "snow", ncpus = 2) # Currently on a two-core machine

## basic parameter estimates with percentile and bias adjusted CIs
expparms <- t(sapply(c(1, 3, 5, 7, 9, 11), function(i) {
  out <- boot.ci(res, index = c(i, i + 1), type = c("perc", "bca"), h = exp)
  with(out, c(Est = t0, pLL = percent[4], pUL = percent[5],
     bcaLL = bca[4], bcaUL = bca[5]))
})) # Corrected label for upper limit and added an additional index to the expparms call

row.names(expparms) <- names(coef(hu2)) 
expparms
##                          Est        pLL        pUL      bcaLL      bcaUL
## count_(Intercept) 11.0776342 10.6258873 11.5624793 10.6289923 11.5647610
## count_age.c        1.0046623  0.9964441  1.0129406  0.9968199  1.0133725
## zero_(Intercept)   0.1938479  0.1526857  0.2367038  0.1527897  0.2367430
## zero_htn1          1.4544520  0.9836889  2.1220137  0.9879981  2.1357848
## zero_diabetes1     1.6896894  1.2019115  2.3238459  1.1927634  2.3130037
## zero_age.c         0.9675175  0.9378520  0.9974358  0.9374202  0.9969964
exp(confint(hu2))
##                        2.5 %     97.5 %
## count_(Intercept) 10.6234677 11.5512041
## count_age.c        0.9966698  1.0127187
## zero_(Intercept)   0.1561533  0.2406418
## zero_htn1          1.0022192  2.1107465
## zero_diabetes1     1.2230687  2.3343337
## zero_age.c         0.9388184  0.9970938
exp(coef(hu2))
## count_(Intercept)       count_age.c  zero_(Intercept)         zero_htn1 
##        11.0776281         1.0046622         0.1938479         1.4544520 
##    zero_diabetes1        zero_age.c 
##         1.6896894         0.9675175