This example continues the coverage of the use of count data models. Instead of using individual survey data, in this example, I use truly aggregate counts. The data consist of county-level counts of deaths and population totals for US counties between the years 1999-2010. These data come from the CDC Wonder Compressed Mortality File Public Use Data Link.
setwd("~/Google Drive//dem7263/data")
library(spdep)
## Loading required package: sp
## Loading required package: Matrix
library(maptools)
## Checking rgeos availability: TRUE
library(car)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(aod)
library(RColorBrewer)
library(MASS)
dat<-readShapePoly("usdata_mort.shp")
dat$deaths<-as.numeric(as.character(dat$Deaths))
dat$population<-as.numeric(as.character(dat$Population))
dat$mortrate<-1000*(dat$deaths/dat$poptot)
spplot(dat, "mortrate", at=quantile(dat$mortrate[is.finite(dat$mortrate)]), col.regions=brewer.pal(n = 6, name = "Blues"))
dat$offset<-dat$poptot+1
Here are the basic Binomial (y,n) model and the basic Poisson models. The correction of the p-values in the Poisson model comes from Prof. German Rodrıguez’s notes Link.
#for the poisson I make an expected value, E_i = r* n_i, r = sum y_i / sum n_i
dat$E<-dat$poptot*(sum(dat$deaths)/sum(dat$poptot))
fit.blm<-glm(cbind(deaths, population)~pblack_1+phisp+ppersonspo+ppopmarrie+DISSWB, data=dat, family=binomial)
fit.plm<-glm(deaths~offset(log(population))+pblack_1+phisp+ppersonspo+ppopmarrie+DISSWB, data=dat@data, family=poisson)
summary(fit.blm)
##
## Call:
## glm(formula = cbind(deaths, population) ~ pblack_1 + phisp +
## ppersonspo + ppopmarrie + DISSWB, family = binomial, data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -119.054 -3.156 2.113 6.522 81.251
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.842410 0.004782 -1221.70 <2e-16 ***
## pblack_1 -0.143494 0.002858 -50.21 <2e-16 ***
## phisp -1.133323 0.002308 -491.10 <2e-16 ***
## ppersonspo 2.398058 0.006243 384.11 <2e-16 ***
## ppopmarrie 0.994592 0.006657 149.41 <2e-16 ***
## DISSWB 0.520289 0.002401 216.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 816361 on 3066 degrees of freedom
## Residual deviance: 438450 on 3061 degrees of freedom
## AIC: 466298
##
## Number of Fisher Scoring iterations: 4
summary(fit.plm)
##
## Call:
## glm(formula = deaths ~ offset(log(population)) + pblack_1 + phisp +
## ppersonspo + ppopmarrie + DISSWB, family = poisson, data = dat@data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -119.484 -3.166 2.129 6.560 81.651
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.842025 0.004760 -1227.31 <2e-16 ***
## pblack_1 -0.142869 0.002844 -50.23 <2e-16 ***
## phisp -1.133162 0.002299 -492.98 <2e-16 ***
## ppersonspo 2.395249 0.006207 385.87 <2e-16 ***
## ppopmarrie 0.994743 0.006627 150.11 <2e-16 ***
## DISSWB 0.519935 0.002390 217.51 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 823082 on 3066 degrees of freedom
## Residual deviance: 442135 on 3061 degrees of freedom
## AIC: 470013
##
## Number of Fisher Scoring iterations: 4
#This should be 1 if the poisson model fits ok. This should equal 1.
fit.plm$deviance/fit.plm$df.residual
## [1] 144.4413
#it's not 1
#correct the se's by hand using the deviance x^2 / residual df
phi<-fit.plm$deviance/(length(dat$deaths)-length(coef(fit.plm)))
sumsp<-summary(fit.plm)$coef
newt<-sumsp[,1]/(sumsp[,2]*sqrt(phi))
newp<-2 * pnorm(-abs(newt))
data.frame(oldt=sumsp[,3],newt=newt, oldp=sumsp[,4],newp=round(newp,5))
## oldt newt oldp newp
## (Intercept) -1227.31443 -102.11985 0 0e+00
## pblack_1 -50.22838 -4.17930 0 3e-05
## phisp -492.97649 -41.01857 0 0e+00
## ppersonspo 385.86749 32.10647 0 0e+00
## ppopmarrie 150.11157 12.49017 0 0e+00
## DISSWB 217.51131 18.09823 0 0e+00
Which doesn’t look like much, but all of the original p-values for the Poisson model assuming no overdispersion were all <2e-16, or 0.
Here are the Negative Binomial and Beta-Binomial models. The NB model is in the MASS library and the Beta-binomial is in aod.
fit.nblm<-glm.nb(deaths~offset(log(population))+pblack_1+phisp+ppersonspo+ppopmarrie+DISSWB, data=dat@data)
summary(fit.nblm)
##
## Call:
## glm.nb(formula = deaths ~ offset(log(population)) + pblack_1 +
## phisp + ppersonspo + ppopmarrie + DISSWB, data = dat@data,
## init.theta = 20.81806751, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.9449 -0.6270 -0.0123 0.5665 4.0116
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.85588 0.06544 -89.480 < 2e-16 ***
## pblack_1 -0.00821 0.03893 -0.211 0.833
## phisp -0.80949 0.03576 -22.638 < 2e-16 ***
## ppersonspo 1.86635 0.07401 25.218 < 2e-16 ***
## ppopmarrie 1.52625 0.09315 16.385 < 2e-16 ***
## DISSWB 0.18916 0.02788 6.785 1.16e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(20.8181) family taken to be 1)
##
## Null deviance: 4167.0 on 3066 degrees of freedom
## Residual deviance: 3137.7 on 3061 degrees of freedom
## AIC: 44064
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 20.818
## Std. Err.: 0.550
##
## 2 x log-likelihood: -44050.245
fit.bblm<-betabin(cbind(deaths, population)~pblack_1+phisp+ppersonspo+ppopmarrie+DISSWB,random=~1, data=dat@data)
summary(fit.bblm)
## Beta-binomial model
## -------------------
## betabin(formula = cbind(deaths, population) ~ pblack_1 + phisp +
## ppersonspo + ppopmarrie + DISSWB, random = ~1, data = dat@data)
##
## Convergence was obtained after 552 iterations.
##
## Fixed-effect coefficients:
## Estimate Std. Error z value Pr(> |z|)
## (Intercept) -6.019e+00 6.267e-02 -9.604e+01 0.000e+00
## pblack_1 1.516e-01 3.640e-02 4.166e+00 3.102e-05
## phisp -8.248e-01 3.858e-02 -2.138e+01 0.000e+00
## ppersonspo 1.731e+00 6.542e-02 2.646e+01 0.000e+00
## ppopmarrie 1.775e+00 8.911e-02 1.992e+01 0.000e+00
## DISSWB 2.213e-01 2.692e-02 8.219e+00 2.220e-16
##
## Overdispersion coefficients:
## Estimate Std. Error z value Pr(> z)
## phi.(Intercept) 4.601e-04 2e-13 0e+00 1e+00
##
## Log-likelihood statistics
## Log-lik nbpar df res. Deviance AIC AICc
## -2.197e+04 7 3060 1.61e+04 4.395e+04 4.395e+04
Examine the AICs of each model:
stats::AIC(fit.blm)
## [1] 466297.8
stats::AIC(fit.plm)
## [1] 470013.4
stats::AIC(fit.nblm)
## [1] 44064.25
AIC(fit.bblm)@istats$AIC
## [1] 43952.26
The beta-binomial model appears to fit the data best, with and AIC of 4.39522610^{4}.
Map the fitted rates
dat$fitted<-1000*predict(fit.bblm, type="response")
plot(mortrate~fitted, dat)
spplot(dat, "fitted", at=quantile(dat$fitted), col.regions=brewer.pal(n = 6, name = "Blues"), main="Fitted Rates")