Analysis of Aggregate Count Data

This example continues the coverage of the use of count data models. Instead of using individual survey data, in this example, I use truly aggregate counts. The data consist of county-level counts of deaths and population totals for US counties between the years 1999-2010. These data come from the CDC Wonder Compressed Mortality File Public Use Data Link.

setwd("~/Google Drive//dem7263/data")
library(spdep)

## Loading required package: sp
## Loading required package: Matrix

library(maptools)

## Checking rgeos availability: TRUE

library(car)
library(lmtest)

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(aod)
library(RColorBrewer)
library(MASS)
dat<-readShapePoly("usdata_mort.shp")

dat$deaths<-as.numeric(as.character(dat$Deaths))
dat$population<-as.numeric(as.character(dat$Population))
dat$mortrate<-1000*(dat$deaths/dat$poptot)
spplot(dat, "mortrate", at=quantile(dat$mortrate[is.finite(dat$mortrate)]), col.regions=brewer.pal(n = 6, name = "Blues"))

dat$offset<-dat$poptot+1

Binomial and Poisson models

Here are the basic Binomial (y,n) model and the basic Poisson models. The correction of the p-values in the Poisson model comes from Prof. German Rodrıguez’s notes Link.

#for the poisson I make an expected value, E_i = r* n_i, r = sum y_i / sum n_i
dat$E<-dat$poptot*(sum(dat$deaths)/sum(dat$poptot))
fit.blm<-glm(cbind(deaths, population)~pblack_1+phisp+ppersonspo+ppopmarrie+DISSWB, data=dat, family=binomial)
fit.plm<-glm(deaths~offset(log(population))+pblack_1+phisp+ppersonspo+ppopmarrie+DISSWB, data=dat@data, family=poisson)
summary(fit.blm)

## 
## Call:
## glm(formula = cbind(deaths, population) ~ pblack_1 + phisp + 
##     ppersonspo + ppopmarrie + DISSWB, family = binomial, data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -119.054    -3.156     2.113     6.522    81.251  
## 
## Coefficients:
##              Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -5.842410   0.004782 -1221.70   <2e-16 ***
## pblack_1    -0.143494   0.002858   -50.21   <2e-16 ***
## phisp       -1.133323   0.002308  -491.10   <2e-16 ***
## ppersonspo   2.398058   0.006243   384.11   <2e-16 ***
## ppopmarrie   0.994592   0.006657   149.41   <2e-16 ***
## DISSWB       0.520289   0.002401   216.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 816361  on 3066  degrees of freedom
## Residual deviance: 438450  on 3061  degrees of freedom
## AIC: 466298
## 
## Number of Fisher Scoring iterations: 4

summary(fit.plm)

## 
## Call:
## glm(formula = deaths ~ offset(log(population)) + pblack_1 + phisp + 
##     ppersonspo + ppopmarrie + DISSWB, family = poisson, data = dat@data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -119.484    -3.166     2.129     6.560    81.651  
## 
## Coefficients:
##              Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -5.842025   0.004760 -1227.31   <2e-16 ***
## pblack_1    -0.142869   0.002844   -50.23   <2e-16 ***
## phisp       -1.133162   0.002299  -492.98   <2e-16 ***
## ppersonspo   2.395249   0.006207   385.87   <2e-16 ***
## ppopmarrie   0.994743   0.006627   150.11   <2e-16 ***
## DISSWB       0.519935   0.002390   217.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 823082  on 3066  degrees of freedom
## Residual deviance: 442135  on 3061  degrees of freedom
## AIC: 470013
## 
## Number of Fisher Scoring iterations: 4

#This should be 1 if the poisson model fits ok. This should equal 1.
fit.plm$deviance/fit.plm$df.residual

## [1] 144.4413

#it's not 1

#correct the se's by hand using the deviance x^2 / residual df
phi<-fit.plm$deviance/(length(dat$deaths)-length(coef(fit.plm)))
sumsp<-summary(fit.plm)$coef
newt<-sumsp[,1]/(sumsp[,2]*sqrt(phi))
newp<-2 * pnorm(-abs(newt))
data.frame(oldt=sumsp[,3],newt=newt, oldp=sumsp[,4],newp=round(newp,5))

##                    oldt       newt oldp  newp
## (Intercept) -1227.31443 -102.11985    0 0e+00
## pblack_1      -50.22838   -4.17930    0 3e-05
## phisp        -492.97649  -41.01857    0 0e+00
## ppersonspo    385.86749   32.10647    0 0e+00
## ppopmarrie    150.11157   12.49017    0 0e+00
## DISSWB        217.51131   18.09823    0 0e+00

Which doesn’t look like much, but all of the original p-values for the Poisson model assuming no overdispersion were all <2e-16, or 0.

Other Count models

Here are the Negative Binomial and Beta-Binomial models. The NB model is in the MASS library and the Beta-binomial is in aod.

fit.nblm<-glm.nb(deaths~offset(log(population))+pblack_1+phisp+ppersonspo+ppopmarrie+DISSWB, data=dat@data)
summary(fit.nblm)

## 
## Call:
## glm.nb(formula = deaths ~ offset(log(population)) + pblack_1 + 
##     phisp + ppersonspo + ppopmarrie + DISSWB, data = dat@data, 
##     init.theta = 20.81806751, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.9449  -0.6270  -0.0123   0.5665   4.0116  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.85588    0.06544 -89.480  < 2e-16 ***
## pblack_1    -0.00821    0.03893  -0.211    0.833    
## phisp       -0.80949    0.03576 -22.638  < 2e-16 ***
## ppersonspo   1.86635    0.07401  25.218  < 2e-16 ***
## ppopmarrie   1.52625    0.09315  16.385  < 2e-16 ***
## DISSWB       0.18916    0.02788   6.785 1.16e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(20.8181) family taken to be 1)
## 
##     Null deviance: 4167.0  on 3066  degrees of freedom
## Residual deviance: 3137.7  on 3061  degrees of freedom
## AIC: 44064
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  20.818 
##           Std. Err.:  0.550 
## 
##  2 x log-likelihood:  -44050.245

fit.bblm<-betabin(cbind(deaths, population)~pblack_1+phisp+ppersonspo+ppopmarrie+DISSWB,random=~1, data=dat@data)
summary(fit.bblm)

## Beta-binomial model
## -------------------
## betabin(formula = cbind(deaths, population) ~ pblack_1 + phisp + 
##     ppersonspo + ppopmarrie + DISSWB, random = ~1, data = dat@data)
## 
## Convergence was obtained after 552 iterations.
## 
## Fixed-effect coefficients:
##               Estimate Std. Error    z value Pr(> |z|)
## (Intercept) -6.019e+00  6.267e-02 -9.604e+01 0.000e+00
## pblack_1     1.516e-01  3.640e-02  4.166e+00 3.102e-05
## phisp       -8.248e-01  3.858e-02 -2.138e+01 0.000e+00
## ppersonspo   1.731e+00  6.542e-02  2.646e+01 0.000e+00
## ppopmarrie   1.775e+00  8.911e-02  1.992e+01 0.000e+00
## DISSWB       2.213e-01  2.692e-02  8.219e+00 2.220e-16
## 
## Overdispersion coefficients:
##                  Estimate Std. Error z value Pr(> z)
## phi.(Intercept) 4.601e-04      2e-13   0e+00   1e+00
## 
## Log-likelihood statistics
##    Log-lik      nbpar    df res.   Deviance        AIC       AICc 
## -2.197e+04          7       3060   1.61e+04  4.395e+04  4.395e+04

Examine the AICs of each model:

stats::AIC(fit.blm)

## [1] 466297.8

stats::AIC(fit.plm)

## [1] 470013.4

stats::AIC(fit.nblm)

## [1] 44064.25

AIC(fit.bblm)@istats$AIC

## [1] 43952.26

The beta-binomial model appears to fit the data best, with and AIC of 4.39522610^{4}.

Map the fitted rates

dat$fitted<-1000*predict(fit.bblm, type="response")
plot(mortrate~fitted, dat)

spplot(dat, "fitted", at=quantile(dat$fitted), col.regions=brewer.pal(n = 6, name = "Blues"), main="Fitted Rates")

Analysis of Aggregate Count Data

coreysparks

February 23, 2015

Binomial and Poisson models

Other Count models