# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 Day 7 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Number of Births Example:
# A study was performed to describe the frequency
# of births. Researchers were interested in determining
# whether the number of births to women living in rural
# versus urban areas differed. Researchers were
# able to recruit 49 women to participate in the
# study. Working with the study participants and
# their physicians, research retrospectively collected
# the number births over a 3 or 6 year period.
# Data Set: births
# Data Dictionary:
# (1) ID Subject identifier
# (2) NBirth Number of births
# (3) Rural Location of home (1 = Rural area; 0 = Urban area)
# (4) SurvPeriod Length of retrospective medical review (years)
# (2) Number of Physician Office Visits Example:
# Data obtained from the US National Medical Expenditure
# Survey (NMES) for 2010-2011 data 4406 individuals,
# aged 66 and over, who are covered by Medicare.
# Researchers wanted to examine the relationship
# between demand for medical care as captured by the
# number of physician visits in a year's time and
# the covariates available from the NMES database
# for the patients.
# Data Set: NMES
# Data Dictionary:
# (1) ofp Number of physician office visits 2010-2011
# (2) hosp Number of hospital stays
# (3) health Self-perceived health status
# (poor vs. average vs. excellent)
# (4) numchron Number of chronic conditions
# (5) gender Sex (female vs. male)
# (6) school Number of years of education
# (7) privins Private insurance indicator (yes vs. no)
# load the data files
load(url("http://www.duke.edu/~sgrambow/crp241data/births.RData"))
load(url("http://www.duke.edu/~sgrambow/crp241data/NMES.RData"))
# Packages to load:
# Only install these packages if you have NOT
# already installed them
# install.packages('car') # For qqPlot() function
library(car)
## Loading required package: carData
# install.packages('MASS') # For glm.nb() function
library(MASS)
# install.packages('pscl') # For zeroinfl() function
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# install.packages("lmtest") # For lrtest() function
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Why doesnt linear regression work well
# for count outcomes?
# - Log-transforming count outcome does NOT result
# in the normality assumption being met
# - This is true for small max potential count outcomes
lm.fit1 <- glm(log(NBirth+0.1) ~ Rural, data=births,
family='gaussian')
summary(lm.fit1)
##
## Call:
## glm(formula = log(NBirth + 0.1) ~ Rural, family = "gaussian",
## data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1939 -1.6452 0.2040 0.8507 1.7888
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.6574 0.2789 -2.357 0.0226 *
## Rural 0.5487 0.3828 1.433 0.1584
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.788742)
##
## Null deviance: 87.745 on 48 degrees of freedom
## Residual deviance: 84.071 on 47 degrees of freedom
## AIC: 171.51
##
## Number of Fisher Scoring iterations: 2
qqPlot(lm.fit1$residuals)

## 26 4
## 13 15
# - This is true for large max potential count outcomes
lm.fit2 <- glm(log(ofp+0.1) ~ hosp + health + numchron +
gender + school + privins,
data=NMES, family='gaussian')
summary(lm.fit2)
##
## Call:
## glm(formula = log(ofp + 0.1) ~ hosp + health + numchron + gender +
## school + privins, family = "gaussian", data = NMES)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.4881 -0.5386 0.3768 0.9939 3.8637
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.256421 0.079146 -3.240 0.00120 **
## hosp 0.248154 0.031357 7.914 3.13e-15 ***
## healthpoor 0.208306 0.073837 2.821 0.00481 **
## healthexcellent -0.370052 0.085710 -4.317 1.61e-05 ***
## numchron 0.306816 0.018186 16.871 < 2e-16 ***
## gendermale -0.252614 0.045989 -5.493 4.18e-08 ***
## school 0.039348 0.006444 6.106 1.11e-09 ***
## privinsyes 0.490977 0.057591 8.525 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2.236614)
##
## Null deviance: 11453.7 on 4405 degrees of freedom
## Residual deviance: 9836.6 on 4398 degrees of freedom
## AIC: 16060
##
## Number of Fisher Scoring iterations: 2
qqPlot(lm.fit2$residuals)

## [1] 3735 2903
# What to do if exposure period is not constant?
# - Ignoring different exposure periods
fit1 <- glm(NBirth ~ Rural, data=births, family='poisson')
summary(fit1)
##
## Call:
## glm(formula = NBirth ~ Rural, family = "poisson", data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6870 -1.3513 -0.3749 0.4555 1.7646
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.09097 0.21822 -0.417 0.677
## Rural 0.44379 0.27321 1.624 0.104
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 56.033 on 48 degrees of freedom
## Residual deviance: 53.306 on 47 degrees of freedom
## AIC: 138.05
##
## Number of Fisher Scoring iterations: 5
# Including an offset term to account for
# different exposure periods
fit2 <- glm(NBirth ~ offset(log(SurvPeriod)) + Rural,
data=births, family='poisson')
summary(fit2)
##
## Call:
## glm(formula = NBirth ~ offset(log(SurvPeriod)) + Rural, family = "poisson",
## data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6870 -1.3513 -0.3749 0.4555 1.7646
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.8827 0.2182 -8.628 < 2e-16 ***
## Rural 1.1369 0.2732 4.161 3.16e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 71.563 on 48 degrees of freedom
## Residual deviance: 53.306 on 47 degrees of freedom
## AIC: 138.05
##
## Number of Fisher Scoring iterations: 5
# What to do if the count data is over/under-
# dispersed?
# To get an idea if over/under-disperson
# is present, compute mean and variance
# of # of office visits and compare
mean(NMES$ofp)
## [1] 5.774399
var(NMES$ofp)
## [1] 45.68712
# - Fit a standard Poisson regression model
fit.NMES.poisson = glm(ofp ~ numchron , data=NMES,
family="poisson")
summary(fit.NMES.poisson)
##
## Call:
## glm(formula = ofp ~ numchron, family = "poisson", data = NMES)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.3140 -2.1822 -0.7042 0.8486 17.6593
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.409165 0.010124 139.20 <2e-16 ***
## numchron 0.197902 0.004065 48.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 26943 on 4405 degrees of freedom
## Residual deviance: 24767 on 4404 degrees of freedom
## AIC: 37547
##
## Number of Fisher Scoring iterations: 5
# - Fit a Negative Binomial regression model
# that accounts for over-dispersion
fit.NMES.negbin <- glm.nb(ofp ~ numchron, data = NMES)
summary(fit.NMES.negbin)
##
## Call:
## glm.nb(formula = ofp ~ numchron, data = NMES, init.theta = 1.102561779,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6139 -0.9636 -0.2929 0.2728 4.6616
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.36530 0.02422 56.37 <2e-16 ***
## numchron 0.22309 0.01147 19.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.1026) family taken to be 1)
##
## Null deviance: 5404.3 on 4405 degrees of freedom
## Residual deviance: 5041.5 on 4404 degrees of freedom
## AIC: 24644
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.1026
## Std. Err.: 0.0298
##
## 2 x log-likelihood: -24638.2890
# - Test whether over-dispersion is statistically
# significant by comparing model fits
# Tests whether full model (neg binomial) is
# better fit than reduced model (poisson)
lrtest(fit.NMES.negbin,fit.NMES.poisson)
## Warning in modelUpdate(objects[[i - 1]], objects[[i]]): original model was of
## class "negbin", updated model is of class "glm"
## Likelihood ratio test
##
## Model 1: ofp ~ numchron
## Model 2: ofp ~ numchron
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 3 -12319
## 2 2 -18772 -1 12905 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Can also directly estimate dispersion parameter using
# what is known as a quasi-poisson model
fit.NMES.quasip = glm(ofp ~ numchron ,
data=NMES, family="quasipoisson")
summary(fit.NMES.quasip)
##
## Call:
## glm(formula = ofp ~ numchron, family = "quasipoisson", data = NMES)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.3140 -2.1822 -0.7042 0.8486 17.6593
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.40917 0.02755 51.15 <2e-16 ***
## numchron 0.19790 0.01106 17.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 7.405364)
##
## Null deviance: 26943 on 4405 degrees of freedom
## Residual deviance: 24767 on 4404 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
# yields dispersion parameter = 7.405364
# If truly Poisson this should be close to 1
# so it is overdispersed.
# What to do if there are excess 0's in the
# count outcome?
# Fit a Negative Binomial regression model
# that accounts for over-dispersion but
# ignores the excess zeros
# This is the model we fit above
fit.NMES.negbin <- glm.nb(ofp ~ numchron, data = NMES)
summary(fit.NMES.negbin)
##
## Call:
## glm.nb(formula = ofp ~ numchron, data = NMES, init.theta = 1.102561779,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6139 -0.9636 -0.2929 0.2728 4.6616
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.36530 0.02422 56.37 <2e-16 ***
## numchron 0.22309 0.01147 19.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.1026) family taken to be 1)
##
## Null deviance: 5404.3 on 4405 degrees of freedom
## Residual deviance: 5041.5 on 4404 degrees of freedom
## AIC: 24644
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.1026
## Std. Err.: 0.0298
##
## 2 x log-likelihood: -24638.2890
# Fit a Zero-Inflated Negative Binomial regression
# model that accounts for both over-dispersion
# and the excess zeros
zi.fit.NMES.negbin <- zeroinfl(ofp ~ numchron, data = NMES,
dist = "negbin")
summary(zi.fit.NMES.negbin)
##
## Call:
## zeroinfl(formula = ofp ~ numchron, data = NMES, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.1024 -0.7048 -0.2481 0.3309 13.5029
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.48875 0.02708 54.985 < 2e-16 ***
## numchron 0.18055 0.01196 15.099 < 2e-16 ***
## Log(theta) 0.26186 0.03485 7.515 5.71e-14 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.5460 0.1236 -12.504 < 2e-16 ***
## numchron -1.7219 0.2983 -5.772 7.82e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 1.2993
## Number of iterations in BFGS optimization: 18
## Log-likelihood: -1.227e+04 on 5 Df
# - Test whether excess of 0's is statistically
# significant by comparing model fits
# Tests whether full model (zero-inflated neg binomial)
# is better fit than reduced model (neg binomial)
vuong(fit.NMES.negbin, zi.fit.NMES.negbin)
## Vuong Non-Nested Hypothesis Test-Statistic:
## (test-statistic is asymptotically distributed N(0,1) under the
## null that the models are indistinguishible)
## -------------------------------------------------------------
## Vuong z-statistic H_A p-value
## Raw -4.735478 model2 > model1 1.0927e-06
## AIC-corrected -4.546087 model2 > model1 2.7326e-06
## BIC-corrected -3.940916 model2 > model1 4.0586e-05
# End of Program