The Poisson distribution provides a natural likelihood for count data. For an example of Poisson regression, we’ll use the badhealth data set from the COUNT package in R.

library("COUNT")
## Loading required package: msme
## Loading required package: MASS
## Loading required package: lattice
## Loading required package: sandwich
data("badhealth")
?badhealth
head(badhealth)
##   numvisit badh age
## 1       30    0  58
## 2       20    0  54
## 3       16    0  44
## 4       20    0  57
## 5       15    0  33
## 6       15    0  28
any(is.na(badhealth))
## [1] FALSE
# Visualizations
hist(badhealth$numvisit, breaks=20)

plot(jitter(log(numvisit)) ~ jitter(age), data=badhealth, subset=badh==0, xlab="age", ylab="log(visits)")
points(jitter(log(numvisit)) ~ jitter(age), data=badhealth, subset=badh==1, col="red")

Model

It appears that both age and bad health are related to the number of doctor visits. We should include model terms for both variables. If we believe the age/visits relationship is different between healthy and non-healthy populations, we should also include an interaction term. We will fit the full model here and leave it to you to compare it with the simpler additive model.

library("rjags")
## Loading required package: coda
## Linked to JAGS 4.3.2
## Loaded modules: basemod,bugs
mod_string = " model {
    for (i in 1:length(numvisit)) {
        numvisit[i] ~ dpois(lam[i])
        log(lam[i]) = int + b_badh*badh[i] + b_age*age[i] + b_intx*age[i]*badh[i]
    }
    
    int ~ dnorm(0.0, 1.0/1e6)
    b_badh ~ dnorm(0.0, 1.0/1e4)
    b_age ~ dnorm(0.0, 1.0/1e4)
    b_intx ~ dnorm(0.0, 1.0/1e4)
} "

set.seed(102)

data_jags = as.list(badhealth)

params = c("int", "b_badh", "b_age", "b_intx")

mod = jags.model(textConnection(mod_string), data=data_jags, n.chains=3)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 1127
##    Unobserved stochastic nodes: 4
##    Total graph size: 3665
## 
## Initializing model
update(mod, 1e3)

mod_sim = coda.samples(model=mod,
                        variable.names=params,
                        n.iter=5e3)
mod_csim = as.mcmc(do.call(rbind, mod_sim))

## convergence diagnostics
plot(mod_sim)

gelman.diag(mod_sim)
## Potential scale reduction factors:
## 
##        Point est. Upper C.I.
## b_age        1.00       1.01
## b_badh       1.02       1.06
## b_intx       1.02       1.05
## int          1.00       1.01
## 
## Multivariate psrf
## 
## 1.02
autocorr.diag(mod_sim)
##            b_age    b_badh    b_intx       int
## Lag 0  1.0000000 1.0000000 1.0000000 1.0000000
## Lag 1  0.9599399 0.9660635 0.9678120 0.9548237
## Lag 5  0.8476570 0.8706301 0.8770659 0.8409390
## Lag 10 0.7284381 0.7682832 0.7785764 0.7228002
## Lag 50 0.1879316 0.2949686 0.2989565 0.1855142
autocorr.plot(mod_sim)

effectiveSize(mod_sim)
##    b_age   b_badh   b_intx      int 
## 228.0818 197.0197 187.8654 234.1566
## compute DIC
dic = dic.samples(mod, n.iter=1e3)
dic
## Mean deviance:  5630 
## penalty 3.955 
## Penalized deviance: 5634

Model checking

To get a general idea of the model’s performance, we can look at predicted values and residuals as usual. Don’t forget that we must apply the inverse of the link function to get predictions for \(\lambda\).

X = as.matrix(badhealth[,-1])            # drop the response column (numvisit)
X = cbind(X, with(badhealth, badh*age))  # append the interaction badh*age
head(X)
##      badh age  
## [1,]    0  58 0
## [2,]    0  54 0
## [3,]    0  44 0
## [4,]    0  57 0
## [5,]    0  33 0
## [6,]    0  28 0
pmed_coef = apply(mod_csim, 2, median)
llam_hat = pmed_coef["int"] + X %*% pmed_coef[c("b_badh", "b_age", "b_intx")]
lam_hat = exp(llam_hat)

hist(lam_hat)

resid = badhealth$numvisit - lam_hat
plot(resid) # the data were ordered

plot(lam_hat, badhealth$numvisit)
abline(0.0, 1.0)

plot(lam_hat[which(badhealth$badh==0)], resid[which(badhealth$badh==0)], xlim=c(0, 8), ylab="residuals", xlab=expression(hat(lambda)), ylim=range(resid))
points(lam_hat[which(badhealth$badh==1)], resid[which(badhealth$badh==1)], col="red")

It is not surprising that the variability increases with higher predicted values, since in the Poisson distribution the variance equals the mean. However, observations predicted to have about \(2\) visits should have variance about \(2\), and observations predicted to have about \(6\) visits should have variance about \(6\).

var(resid[which(badhealth$badh==0)])
## [1] 7.02254
var(resid[which(badhealth$badh==1)])
## [1] 41.19617

Clearly this is not the case with these data. This indicates that either the model fits poorly (meaning the covariates don’t explain enough of the variability in the data), or the data are “overdispersed” for the Poisson likelihood we have chosen. This is a common issue with count data. If the data are more variable than the Poisson likelihood would suggest, a good alternative is the negative binomial distribution, which we will not pursue here.
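
Although we won't pursue it in this lesson, for reference, one common way to accommodate the extra variability in JAGS is to attach a gamma-distributed multiplicative effect to each rate, which marginally yields a negative binomial likelihood. The sketch below is only an illustration; in particular, the dgamma(0.01, 0.01) prior on the dispersion parameter r is an assumption, not something specified in the lesson.

mod_nb_string = " model {
    for (i in 1:length(numvisit)) {
        numvisit[i] ~ dpois(lam[i]*eps[i])
        eps[i] ~ dgamma(r, r)   # mean 1, variance 1/r; marginally a negative binomial
        log(lam[i]) = int + b_badh*badh[i] + b_age*age[i] + b_intx*age[i]*badh[i]
    }
    
    int ~ dnorm(0.0, 1.0/1e6)
    b_badh ~ dnorm(0.0, 1.0/1e4)
    b_age ~ dnorm(0.0, 1.0/1e4)
    b_intx ~ dnorm(0.0, 1.0/1e4)
    r ~ dgamma(0.01, 0.01)   # dispersion: Var(numvisit[i]) = lam[i] + lam[i]^2/r
} "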

Results

Assuming the model fit is adequate, we can interpret the results.

summary(mod_sim)
## 
## Iterations = 2001:7000
## Thinning interval = 1 
## Number of chains = 3 
## Sample size per chain = 5000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##             Mean       SD  Naive SE Time-series SE
## b_age   0.008461 0.002073 1.693e-05      0.0001385
## b_badh  1.567785 0.184000 1.502e-03      0.0133528
## b_intx -0.010896 0.004265 3.482e-05      0.0003209
## int     0.347898 0.080716 6.590e-04      0.0053300
## 
## 2. Quantiles for each variable:
## 
##             2.5%      25%       50%       75%     97.5%
## b_age   0.004371  0.00708  0.008476  0.009875  0.012580
## b_badh  1.198800  1.44669  1.565278  1.690130  1.931918
## b_intx -0.019352 -0.01373 -0.010781 -0.008107 -0.002292
## int     0.187831  0.29282  0.347602  0.401801  0.505817

The intercept is not necessarily interpretable here because it corresponds to a healthy 0-year-old, whereas the youngest person in the data set is 20 years old.

For healthy individuals, it appears that age has a positive association with number of doctor visits. Clearly, bad health is associated with an increase in expected number of visits. The interaction coefficient is interpreted as an adjustment to the age coefficient for people in bad health. Hence, for people with bad health, age is essentially unassociated with number of visits.
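
As a quick side check (not part of the original lesson), we can look at the posterior of the age slope for the bad-health group, b_age + b_intx, whose posterior mean from the summary above is about 0.0085 - 0.0109, roughly -0.002, and at the expected number of visits implied for a healthy 20-year-old:

# age slope among those in bad health: b_age + b_intx
slope_bad = mod_csim[,"b_age"] + mod_csim[,"b_intx"]
mean(slope_bad)
quantile(slope_bad, c(0.025, 0.975))

# implied expected visits for a healthy 20-year-old (posterior medians, inverse link)
exp(pmed_coef["int"] + 20*pmed_coef["b_age"])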

Predictive distributions

Let’s say we have two people aged 35, one in good health and the other in poor health. What is the posterior probability that the individual with poor health will have more doctor visits? This goes beyond the posterior probabilities we have calculated comparing expected responses in previous lessons. Here we will create Monte Carlo samples for the responses themselves. This is done by taking the Monte Carlo samples of the model parameters, and for each of those, drawing a sample from the likelihood.

Let’s walk through this.

First, we need the \(x\) values for each individual. We’ll say the healthy one is Person 1 and the unhealthy one is Person 2. Their \(x\) values are:

x1 = c(0, 35, 0) # good health
x2 = c(1, 35, 35) # bad health

The posterior samples of the model parameters are stored in mod_csim:

head(mod_csim)
## Markov Chain Monte Carlo (MCMC) output:
## Start = 1 
## End = 7 
## Thinning interval = 1 
##            b_age   b_badh      b_intx       int
## [1,] 0.005976906 1.716440 -0.01404186 0.4397225
## [2,] 0.006239267 1.706479 -0.01324466 0.4505588
## [3,] 0.006053286 1.733207 -0.01305659 0.4102716
## [4,] 0.007390254 1.684388 -0.01243699 0.3974524
## [5,] 0.007327578 1.689241 -0.01309554 0.3501475
## [6,] 0.007494724 1.678602 -0.01285265 0.3894632
## [7,] 0.007240224 1.679091 -0.01376702 0.3940488

With these in hand, we compute the linear part of the predictor:

loglam1 = mod_csim[,"int"] + mod_csim[,c(2,1,3)] %*% x1  # columns 2,1,3 are b_badh, b_age, b_intx
loglam2 = mod_csim[,"int"] + mod_csim[,c(2,1,3)] %*% x2

Next, we apply the inverse link:

lam1 = exp(loglam1)
lam2 = exp(loglam2)

The final step is to use these samples of the \(\lambda\) parameter for each individual to simulate actual numbers of doctor visits from the likelihood:

(n_sim = length(lam1))
## [1] 15000
y1 = rpois(n=n_sim, lambda=lam1)
y2 = rpois(n=n_sim, lambda=lam2)

plot(table(factor(y1, levels=0:18))/n_sim, pch=2, ylab="posterior prob.", xlab="visits")
points(table(y2+0.1)/n_sim, col="red")

Finally, we can answer the original question: What is the probability that the person with poor health will have more doctor visits than the person with good health?

mean(y2 > y1)
## [1] 0.9187333

Because we used our posterior samples for the model parameters in our simulation, this posterior predictive distribution on the number of visits for these two new individuals naturally takes into account our uncertainty in the model estimates. This is a more honest/realistic distribution than we would get if we had fixed the model parameters at their MLE or posterior means and simulated data for the new individuals.
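
For contrast, here is a plug-in version of the same calculation (a side sketch, not from the lesson): fix the parameters at their posterior means and simulate from the likelihood only. Because it ignores our uncertainty about the parameters, its spread reflects only Poisson variability:

# plug-in alternative: fix parameters at their posterior means
post_means = apply(mod_csim, 2, mean)
lam1_plug = exp(post_means["int"] + sum(post_means[c("b_badh","b_age","b_intx")] * x1))
lam2_plug = exp(post_means["int"] + sum(post_means[c("b_badh","b_age","b_intx")] * x2))
y1_plug = rpois(n_sim, lam1_plug)
y2_plug = rpois(n_sim, lam2_plug)
mean(y2_plug > y1_plug)   # compare with the posterior predictive answer above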

Quiz Questions

exp(1.5+(-.3*.8)+(1.0)*(1.2))
## [1] 11.70481
library("rjags")

mod_string = " model {
    for (i in 1:length(numvisit)) {
        numvisit[i] ~ dpois(lam[i])
        log(lam[i]) = int + b_badh*badh[i] + b_age*age[i] 
    }
    
    int ~ dnorm(0.0, 1.0/1e6)
    b_badh ~ dnorm(0.0, 1.0/1e4)
    b_age ~ dnorm(0.0, 1.0/1e4)
} "

set.seed(102)

data_jags = as.list(badhealth)

params = c("int", "b_badh", "b_age")

mod = jags.model(textConnection(mod_string), data=data_jags, n.chains=3)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 1127
##    Unobserved stochastic nodes: 3
##    Total graph size: 3587
## 
## Initializing model
update(mod, 1e3)

mod_sim = coda.samples(model=mod,
                        variable.names=params,
                        n.iter=5e3)
mod_csim = as.mcmc(do.call(rbind, mod_sim))

## convergence diagnostics
plot(mod_sim)

gelman.diag(mod_sim)
## Potential scale reduction factors:
## 
##        Point est. Upper C.I.
## b_age           1       1.01
## b_badh          1       1.01
## int             1       1.01
## 
## Multivariate psrf
## 
## 1
autocorr.diag(mod_sim)
##             b_age      b_badh        int
## Lag 0  1.00000000 1.000000000 1.00000000
## Lag 1  0.94697285 0.478417753 0.94478068
## Lag 5  0.78562735 0.050001163 0.78167953
## Lag 10 0.62015175 0.014702884 0.61761924
## Lag 50 0.02067701 0.005194179 0.02338487
autocorr.plot(mod_sim)

effectiveSize(mod_sim)
##     b_age    b_badh       int 
##  356.0197 5039.8301  360.3745
## compute DIC
dic = dic.samples(mod, n.iter=1e3)
dic
## Mean deviance:  5636 
## penalty 2.883 
## Penalized deviance: 5638

In the previous course, we briefly discussed Poisson processes. The mean of a Poisson distribution can be thought of as the rate at which the events we count are occurring. Hence, it is natural to imagine that if we observe for twice as long, we would expect to count about twice as many events (assuming the rate is steady). If \(t\) is the amount of time that we observe, and \(\lambda\) is the rate of events per unit of time, then the expected number of events is \(t\lambda\) and the distribution of the number of events in this time interval is \(Poisson(t\lambda)\).

Suppose that a retail store receives an average of 15 customer calls per hour and that the calls approximately follow a Poisson process. If we monitor calls for two hours, what is the probability that there will be fewer than 22 calls in this time period?

ppois(21, 30, lower.tail=TRUE)  # P(X <= 21) with mean 2 hours * 15 calls/hour = 30
## [1] 0.0544434

On average, this retailer receives 0.01 calls per customer per day. They notice, however, that one particular group of customers tends to call more frequently.

To test this, they select 90 days to monitor 224 customers, 24 of whom belong to this group (call it group 2). Not all customers had accounts for the full 90-day period, but we do know how many of the 90 days each was active. We also have the age of each customer, the group to which the customer belongs, and how many calls the customer placed during the period they were active. The data are attached as callers.csv.

Try plotting some of the variables to understand some of the relationships. If one of the variables is categorical, a box plot is a good choice.

Which of the following plots would be most useful to the retailer to informally explore their hypothesis that customers from group 2 call at a higher rate than the other customers?

## set R's working directory to the same directory
## as this file, or use the full path to the file.
dat = read.csv(file="callers.csv", header=TRUE)
head(dat)
##   calls days_active isgroup2 age
## 1     2          32        0  27
## 2     4          81        0  32
## 3     0          41        0  22
## 4     1          36        0  28
## 5     0          55        0  31
## 6     0          25        0  33
boxplot(calls/days_active~isgroup2, data=dat)

Fit the model in JAGS using \(N(0,10^2)\) priors for the intercept and both coefficients. Be sure to check for MCMC convergence and examine the residuals. Also don’t forget to multiply lam_hat by days_active to obtain the model’s predicted mean number of calls.

What is the posterior probability that \(\beta_2\), the coefficient for the indicator isgroup2, is \(>0\)?

library("rjags")

mod2_string = " model {

    for(i in 1:length(calls)) {
        calls[i] ~ dpois(days_active[i] * lam[i])
        log(lam[i]) = b0 + b[1]*age[i] + b[2]*isgroup2[i]
    }
    b0 ~ dnorm(0.0, 1/1e2)
    for (j in 1:2) {
      b[j] ~ dnorm(0.0, 1.0/1e2)
    }
} "

set.seed(102)

data_jags = as.list(dat)

params = c("b0","b")

mod2 = jags.model(textConnection(mod2_string), data=data_jags, n.chains=3)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 224
##    Unobserved stochastic nodes: 3
##    Total graph size: 1218
## 
## Initializing model
update(mod2, 1e3)

mod2_sim = coda.samples(model=mod2,
                        variable.names=params,
                        n.iter=5e3)
mod2_csim = as.mcmc(do.call(rbind, mod2_sim))

## convergence diagnostics
plot(mod2_sim)

gelman.diag(mod2_sim)
## Potential scale reduction factors:
## 
##      Point est. Upper C.I.
## b[1]       1.02       1.05
## b[2]       1.00       1.00
## b0         1.02       1.06
## 
## Multivariate psrf
## 
## 1.01
autocorr.diag(mod2_sim)
##             b[1]          b[2]        b0
## Lag 0  1.0000000  1.0000000000 1.0000000
## Lag 1  0.9779128  0.4900055705 0.9782768
## Lag 5  0.9084320  0.0369643653 0.9103901
## Lag 10 0.8316516 -0.0006136139 0.8328731
## Lag 50 0.4113010 -0.0194587312 0.4140422
autocorr.plot(mod2_sim)

effectiveSize(mod2_sim)
##      b[1]      b[2]        b0 
##  141.0317 4945.3435  135.3503
## compute DIC
dic = dic.samples(mod2, n.iter=1e3)
dic
## Mean deviance:  479.5 
## penalty 2.766 
## Penalized deviance: 482.3
# posterior-mean coefficients and fitted mean call counts (for the residual check)
pmod2_coef = apply(mod2_csim, 2, mean)

X = as.matrix(dat[,-c(1,2)])
X = X[,c(2,1)]

llam_hat3 = pmod2_coef['b0'] + X %*% pmod2_coef[-3]
lam_hat3 = exp(llam_hat3)*dat$days_active  # apply the inverse link, then scale by days active
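
# quick residual check (a sketch; not part of the original quiz solution)
resid3 = dat$calls - lam_hat3
plot(lam_hat3, resid3, xlab="predicted number of calls", ylab="residual")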

# posterior probability that b[2], the coefficient for isgroup2, is greater than 0;
# b[2] is the second column of mod2_csim:
mean(mod2_csim[,2] > 0)
## [1] 1