# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 Day 7 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Number of Births Example: 
# A study was performed to describe the frequency 
# of births. Researchers were interested in determining 
# whether the number of births to women living  in rural 
# versus urban areas differed. Researchers were 
# able to recruit 49 women to participate in the 
# study. Working with the study participants and 
# their physicians, research retrospectively collected 
# the number births over a 3 or 6 year period.
# Data Set: births

# Data Dictionary: 
# (1) ID          Subject identifier 
# (2) NBirth      Number of births 
# (3) Rural       Location of home (1 = Rural area; 0 = Urban area)
# (4) SurvPeriod  Length of retrospective medical review (years) 
# (2) Number of Physician Office Visits Example:
# Data obtained from the US National Medical Expenditure
# Survey (NMES) for 2010-2011 data 4406 individuals, 
# aged 66 and over, who are covered by Medicare. 
# Researchers wanted to examine the relationship
# between demand for medical care as captured by the 
# number of physician visits in a year's time and 
# the covariates available from the NMES database 
# for the patients. 
# Data Set: NMES

# Data Dictionary: 
# (1) ofp           Number of physician office visits 2010-2011
# (2) hosp          Number of hospital stays
# (3) health        Self-perceived health status 
#                    (poor vs. average vs. excellent)
# (4) numchron      Number of chronic conditions
# (5) gender        Sex (female vs. male)
# (6) school        Number of years of education
# (7) privins       Private insurance indicator (yes vs. no)

# load the data files 
load(url("http://www.duke.edu/~sgrambow/crp241data/births.RData"))

load(url("http://www.duke.edu/~sgrambow/crp241data/NMES.RData"))

# Packages to load: 
# Only install these packages if you have NOT 
# already installed them
# install.packages('car')       # For qqPlot() function
library(car)
## Loading required package: carData
# install.packages('MASS')      # For glm.nb() function
library(MASS)
# install.packages('pscl')      # For zeroinfl() function 
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# install.packages("lmtest")    # For lrtest() function
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Why doesnt linear regression work well 
# for count outcomes?

# - Log-transforming count outcome does NOT result 
#   in the normality assumption being met

# - This is true for small max potential count outcomes 
lm.fit1 <- glm(log(NBirth+0.1) ~ Rural, data=births, 
               family='gaussian')
summary(lm.fit1)
## 
## Call:
## glm(formula = log(NBirth + 0.1) ~ Rural, family = "gaussian", 
##     data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1939  -1.6452   0.2040   0.8507   1.7888  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  -0.6574     0.2789  -2.357   0.0226 *
## Rural         0.5487     0.3828   1.433   0.1584  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.788742)
## 
##     Null deviance: 87.745  on 48  degrees of freedom
## Residual deviance: 84.071  on 47  degrees of freedom
## AIC: 171.51
## 
## Number of Fisher Scoring iterations: 2
qqPlot(lm.fit1$residuals)

## 26  4 
## 13 15
# - This is true for large max potential count outcomes 
lm.fit2 <- glm(log(ofp+0.1) ~ hosp + health + numchron + 
                 gender + school + privins, 
              data=NMES, family='gaussian')
summary(lm.fit2)
## 
## Call:
## glm(formula = log(ofp + 0.1) ~ hosp + health + numchron + gender + 
##     school + privins, family = "gaussian", data = NMES)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.4881  -0.5386   0.3768   0.9939   3.8637  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.256421   0.079146  -3.240  0.00120 ** 
## hosp             0.248154   0.031357   7.914 3.13e-15 ***
## healthpoor       0.208306   0.073837   2.821  0.00481 ** 
## healthexcellent -0.370052   0.085710  -4.317 1.61e-05 ***
## numchron         0.306816   0.018186  16.871  < 2e-16 ***
## gendermale      -0.252614   0.045989  -5.493 4.18e-08 ***
## school           0.039348   0.006444   6.106 1.11e-09 ***
## privinsyes       0.490977   0.057591   8.525  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 2.236614)
## 
##     Null deviance: 11453.7  on 4405  degrees of freedom
## Residual deviance:  9836.6  on 4398  degrees of freedom
## AIC: 16060
## 
## Number of Fisher Scoring iterations: 2
qqPlot(lm.fit2$residuals)

## [1] 3735 2903
# What to do if exposure period is not constant?

# - Ignoring different exposure periods
fit1 <- glm(NBirth ~ Rural, data=births, family='poisson')
summary(fit1)
## 
## Call:
## glm(formula = NBirth ~ Rural, family = "poisson", data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6870  -1.3513  -0.3749   0.4555   1.7646  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.09097    0.21822  -0.417    0.677
## Rural        0.44379    0.27321   1.624    0.104
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 56.033  on 48  degrees of freedom
## Residual deviance: 53.306  on 47  degrees of freedom
## AIC: 138.05
## 
## Number of Fisher Scoring iterations: 5
# Including an offset term to account for 
# different exposure periods
fit2 <- glm(NBirth ~ offset(log(SurvPeriod)) + Rural, 
            data=births, family='poisson')
summary(fit2)
## 
## Call:
## glm(formula = NBirth ~ offset(log(SurvPeriod)) + Rural, family = "poisson", 
##     data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6870  -1.3513  -0.3749   0.4555   1.7646  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.8827     0.2182  -8.628  < 2e-16 ***
## Rural         1.1369     0.2732   4.161 3.16e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 71.563  on 48  degrees of freedom
## Residual deviance: 53.306  on 47  degrees of freedom
## AIC: 138.05
## 
## Number of Fisher Scoring iterations: 5
# What to do if the count data is over/under-
# dispersed?

# To get an idea if over/under-disperson 
# is present, compute mean and variance 
# of # of office visits and compare 
mean(NMES$ofp)
## [1] 5.774399
var(NMES$ofp)
## [1] 45.68712
# - Fit a standard Poisson regression model 
fit.NMES.poisson = glm(ofp ~ numchron , data=NMES, 
               family="poisson")
summary(fit.NMES.poisson)
## 
## Call:
## glm(formula = ofp ~ numchron, family = "poisson", data = NMES)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.3140  -2.1822  -0.7042   0.8486  17.6593  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 1.409165   0.010124  139.20   <2e-16 ***
## numchron    0.197902   0.004065   48.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 26943  on 4405  degrees of freedom
## Residual deviance: 24767  on 4404  degrees of freedom
## AIC: 37547
## 
## Number of Fisher Scoring iterations: 5
# - Fit a Negative Binomial regression model 
# that accounts for over-dispersion 
fit.NMES.negbin <- glm.nb(ofp ~ numchron, data = NMES)
summary(fit.NMES.negbin)
## 
## Call:
## glm.nb(formula = ofp ~ numchron, data = NMES, init.theta = 1.102561779, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6139  -0.9636  -0.2929   0.2728   4.6616  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.36530    0.02422   56.37   <2e-16 ***
## numchron     0.22309    0.01147   19.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.1026) family taken to be 1)
## 
##     Null deviance: 5404.3  on 4405  degrees of freedom
## Residual deviance: 5041.5  on 4404  degrees of freedom
## AIC: 24644
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.1026 
##           Std. Err.:  0.0298 
## 
##  2 x log-likelihood:  -24638.2890
# - Test whether over-dispersion is statistically 
# significant by comparing model fits 
# Tests whether full model (neg binomial) is 
# better fit than reduced model (poisson)
lrtest(fit.NMES.negbin,fit.NMES.poisson)
## Warning in modelUpdate(objects[[i - 1]], objects[[i]]): original model was of
## class "negbin", updated model is of class "glm"
## Likelihood ratio test
## 
## Model 1: ofp ~ numchron
## Model 2: ofp ~ numchron
##   #Df LogLik Df Chisq Pr(>Chisq)    
## 1   3 -12319                        
## 2   2 -18772 -1 12905  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Can also directly estimate dispersion parameter using
# what is known as a quasi-poisson model
fit.NMES.quasip = glm(ofp ~ numchron , 
                      data=NMES, family="quasipoisson")
summary(fit.NMES.quasip)
## 
## Call:
## glm(formula = ofp ~ numchron, family = "quasipoisson", data = NMES)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.3140  -2.1822  -0.7042   0.8486  17.6593  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.40917    0.02755   51.15   <2e-16 ***
## numchron     0.19790    0.01106   17.89   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 7.405364)
## 
##     Null deviance: 26943  on 4405  degrees of freedom
## Residual deviance: 24767  on 4404  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
# yields dispersion parameter = 7.405364
# If truly Poisson this should be close to 1
# so it is overdispersed.
# What to do if there are excess 0's in the 
# count outcome?

# Fit a Negative Binomial regression model 
# that accounts for over-dispersion but 
# ignores the excess zeros 
# This is the model we fit above
fit.NMES.negbin <- glm.nb(ofp ~ numchron, data = NMES)
summary(fit.NMES.negbin)
## 
## Call:
## glm.nb(formula = ofp ~ numchron, data = NMES, init.theta = 1.102561779, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6139  -0.9636  -0.2929   0.2728   4.6616  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.36530    0.02422   56.37   <2e-16 ***
## numchron     0.22309    0.01147   19.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.1026) family taken to be 1)
## 
##     Null deviance: 5404.3  on 4405  degrees of freedom
## Residual deviance: 5041.5  on 4404  degrees of freedom
## AIC: 24644
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.1026 
##           Std. Err.:  0.0298 
## 
##  2 x log-likelihood:  -24638.2890
# Fit a Zero-Inflated Negative Binomial regression 
# model that accounts for both over-dispersion 
# and the excess zeros 
zi.fit.NMES.negbin <- zeroinfl(ofp ~ numchron, data = NMES, 
                               dist = "negbin")
summary(zi.fit.NMES.negbin)
## 
## Call:
## zeroinfl(formula = ofp ~ numchron, data = NMES, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1024 -0.7048 -0.2481  0.3309 13.5029 
## 
## Count model coefficients (negbin with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.48875    0.02708  54.985  < 2e-16 ***
## numchron     0.18055    0.01196  15.099  < 2e-16 ***
## Log(theta)   0.26186    0.03485   7.515 5.71e-14 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.5460     0.1236 -12.504  < 2e-16 ***
## numchron     -1.7219     0.2983  -5.772 7.82e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 1.2993 
## Number of iterations in BFGS optimization: 18 
## Log-likelihood: -1.227e+04 on 5 Df
# - Test whether excess of 0's is statistically 
# significant by comparing model fits
# Tests whether full model (zero-inflated neg binomial) 
# is better fit than reduced model (neg binomial)
vuong(fit.NMES.negbin, zi.fit.NMES.negbin)
## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A    p-value
## Raw                   -4.735478 model2 > model1 1.0927e-06
## AIC-corrected         -4.546087 model2 > model1 2.7326e-06
## BIC-corrected         -3.940916 model2 > model1 4.0586e-05
# End of Program