This example will cover the use of R functions for fitting count data models to complex survey data and to aggregate data at the county level. Specifically, we focus on the Poisson and Negative Binomial models to individual level survey data as well as the Binomial model for aggregate data.

For this example I am using 2014 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link

We will also use data from the CDC Compressed mortality file on mortality in the US.

Poisson model for counts

For the Poisson model, you observe some count of events. The Poisson distribution has only one parameter, \((\lambda)\), which is the average count (y). We have several ways of modeling the Poisson count:

Pure count If each area has the same risk set. This means each observation is exposed to the “process” for the same amount of time, this is the exposure level. If observations do not all have the same level of exposure, then we may express the count as a Rate. If we do this, we need to include an offset term in your model to incorporate unequal risk. I.e. people who are exposed to a disease for a year are more likely to experience the disease than someone who is exposed for a week.

\[log(y)= X' \beta + log(n)\]

, where n is the period of risk. This is called the offset term in the model.

Standardized ratio incorporate differential exposure as an expected count

\[log(y)= X' \beta + log(E)\].

In Epidemiology, the ratio \(\frac{y}{E}\) is referred to as the standardized mortality (or incidence) ratio.

Interpreting the model parameters

All interpretation of parameters is done on a log scale, so

\[exp(\beta) = \text{ % change in the mean}\] , or % change in the rate, for a 1 unit change in X. All testing is done in the same manner.

#load brfss
library(car)
library(stargazer)
library(survey)
library(sjPlot)
library(ggplot2)
library(pander)
library(knitr)
load("brfss_14.Rdata")

#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss14)
head(nams, n=10)
##  [1] "dispcode" "genhlth"  "physhlth" "menthlth" "poorhlth" "hlthpln1"
##  [7] "persdoc2" "medcost"  "checkup1" "exerany2"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
set.seed(1234)
newnames<-gsub(pattern = "x_",replacement =  "",x =  nams)
names(brfss14)<-tolower(newnames)

#samps<-sample(1:nrow(brfss14), size = 10000, replace = F)
#brfss14<-brfss14[samps,]

Our outcome here is the number of days the respondent reported poor physical health in the past month: Q: Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?

brfss14$healthdays<-recode(brfss14$physhlth, recodes = "88=0; 77=NA; 99=NA")
hist(brfss14$healthdays)

summary(brfss14$healthdays)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   4.018   3.000  30.000    5205

Other variables:

#race/ethnicity
brfss14$black<-recode(brfss14$racegr3, recodes="2=1; 9=NA; else=0")
brfss14$white<-recode(brfss14$racegr3, recodes="1=1; 9=NA; else=0")
brfss14$other<-recode(brfss14$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss14$hispanic<-recode(brfss14$racegr3, recodes="5=1; 9=NA; else=0")
brfss14$race_eth<-recode(brfss14$racegr3, recodes="1='nhwhite'; 2='nh black'; 3='nh other';
                         4='nh multirace'; 5='hispanic'; else=NA", as.factor.result = T)
brfss14$race_eth<-relevel(brfss14$race_eth, ref = "nhwhite")
#insurance
brfss14$ins<-ifelse(brfss14$hlthpln1==1,1,0)

#income grouping
brfss14$inc<-ifelse(brfss14$incomg==9, NA, brfss14$incomg)

#education level
brfss14$educ<-recode(brfss14$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad';
                     5='3somecol'; 6='4colgrad';9=NA", as.factor.result=T)
#brfss14$educ<-relevel(brfss14$educ, ref='0Prim')

#employment
brfss14$employ<-recode(brfss14$employ, recodes="1:2='Employed'; 2:6='nilf';
                       7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss14$employ<-relevel(brfss14$employ, ref='Employed')

#marital status
brfss14$marst<-recode(brfss14$marital, recodes="1='married'; 2='divorced'; 3='widowed';
                      4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss14$marst<-relevel(brfss14$marst, ref='married')

#Age cut into intervals
brfss14$agec<-cut(brfss14$age80, breaks=c(0,24,39,59,79,99), include.lowest = T)

#Health behaviors
#insurance
brfss14$ins<-recode(brfss14$hlthpln1, recodes = "1=1; 2=0; else=NA" )

#smoking currently
brfss14$smoke<-recode(brfss14$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor.result=T)
brfss14$smoke<-relevel(brfss14$smoke, ref = "NeverSmoked")

Analysis

First, we will subset our data to have complete cases for our variables in our model and make our survey design object

#Here I keep complete cases on my key variables,
#just for speed (the suvey procedures can run for a long time)
brfss14<-brfss14[is.na(brfss14$healthdays)==F&
                     is.na(brfss14$mmsawt)==F&
                     is.na(brfss14$race_eth)==F&
                     is.na(brfss14$educ)==F,]
#FOR THE BRFSS, R GAVE ME A WARNING AND I NEEDED TO ADD:
#YOU MAY NOT NEED TO DO THIS!!!!
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, 
               weights=~mmsawt,
               data = brfss14[is.na(brfss14$mmsawt)==F,])

Poisson regression example

To fit a Poisson GLM to survey data in R, we use the svyglm function in the survey library.

#First I do some simple descriptives
svyhist(~healthdays, des)

svyby(~healthdays, ~race_eth+educ, des, svymean, na.rm=T)
##                           race_eth     educ healthdays         se
## nhwhite.0Prim              nhwhite    0Prim   8.964694 0.51376072
## hispanic.0Prim            hispanic    0Prim   5.755794 0.31605801
## nh black.0Prim            nh black    0Prim   6.241583 0.78322527
## nh multirace.0Prim    nh multirace    0Prim  12.686603 2.64075124
## nh other.0Prim            nh other    0Prim   4.769822 1.14737267
## nhwhite.1somehs            nhwhite  1somehs   6.939912 0.24905798
## hispanic.1somehs          hispanic  1somehs   4.693626 0.27034961
## nh black.1somehs          nh black  1somehs   6.857590 0.44924210
## nh multirace.1somehs  nh multirace  1somehs   6.734925 1.45497768
## nh other.1somehs          nh other  1somehs   4.532884 0.82616391
## nhwhite.2hsgrad            nhwhite  2hsgrad   4.426525 0.07648507
## hispanic.2hsgrad          hispanic  2hsgrad   3.642164 0.17992383
## nh black.2hsgrad          nh black  2hsgrad   4.323454 0.18894780
## nh multirace.2hsgrad  nh multirace  2hsgrad   5.220206 0.55896377
## nh other.2hsgrad          nh other  2hsgrad   2.592871 0.27204302
## nhwhite.3somecol           nhwhite 3somecol   3.878466 0.06557816
## hispanic.3somecol         hispanic 3somecol   3.361901 0.17429979
## nh black.3somecol         nh black 3somecol   3.885501 0.17541101
## nh multirace.3somecol nh multirace 3somecol   4.348112 0.43977679
## nh other.3somecol         nh other 3somecol   3.438963 0.40633895
## nhwhite.4colgrad           nhwhite 4colgrad   2.296173 0.03672220
## hispanic.4colgrad         hispanic 4colgrad   2.791628 0.14122366
## nh black.4colgrad         nh black 4colgrad   2.098004 0.10213285
## nh multirace.4colgrad nh multirace 4colgrad   2.590086 0.26855922
## nh other.4colgrad         nh other 4colgrad   1.741376 0.14246498
svyby(~healthdays, ~agec, des, svymean, na.rm=T)
##            agec healthdays         se
## [0,24]   [0,24]   2.049956 0.07356781
## (24,39] (24,39]   2.621159 0.06208670
## (39,59] (39,59]   4.155285 0.05950617
## (59,79] (59,79]   5.091556 0.07498158
## (79,99] (79,99]   5.641812 0.16514600
#Poisson glm fit to survey data
fit1<-svyglm(healthdays~factor(race_eth)+factor(educ)+factor(agec), design=des, family=poisson)
summary(fit1)
## 
## Call:
## svyglm(formula = healthdays ~ factor(race_eth) + factor(educ) + 
##     factor(agec), design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss14[is.na(brfss14$mmsawt) == 
##     F, ])
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.13137    0.05775  19.589  < 2e-16 ***
## factor(race_eth)hispanic     -0.04738    0.02919  -1.623  0.10453    
## factor(race_eth)nh black      0.03339    0.02695   1.239  0.21531    
## factor(race_eth)nh multirace  0.27726    0.05858   4.733 2.21e-06 ***
## factor(race_eth)nh other     -0.15820    0.05819  -2.719  0.00655 ** 
## factor(educ)1somehs           0.02358    0.04874   0.484  0.62850    
## factor(educ)2hsgrad          -0.36288    0.04484  -8.092 5.89e-16 ***
## factor(educ)3somecol         -0.44642    0.04516  -9.885  < 2e-16 ***
## factor(educ)4colgrad         -0.99916    0.04513 -22.140  < 2e-16 ***
## factor(agec)(24,39]           0.34668    0.04328   8.011 1.15e-15 ***
## factor(agec)(39,59]           0.79046    0.03871  20.418  < 2e-16 ***
## factor(agec)(59,79]           0.97350    0.03931  24.764  < 2e-16 ***
## factor(agec)(79,99]           0.99512    0.04692  21.209  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 16.45125)
## 
## Number of Fisher Scoring iterations: 7
#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)
##     factor(race_eth)hispanic     factor(race_eth)nh black 
##                        0.954                        1.034 
## factor(race_eth)nh multirace     factor(race_eth)nh other 
##                        1.320                        0.854 
##          factor(educ)1somehs          factor(educ)2hsgrad 
##                        1.024                        0.696 
##         factor(educ)3somecol         factor(educ)4colgrad 
##                        0.640                        0.368 
##          factor(agec)(24,39]          factor(agec)(39,59] 
##                        1.414                        2.204 
##          factor(agec)(59,79]          factor(agec)(79,99] 
##                        2.647                        2.705

So, we interpret this as follows. NH Multirace respondents had higher numbers of poor health days than NH Whites, while NH Others had a lower number of poor health days. As education increases, the number of poor health days decreases. Also, as age increase, the number of poor health days increases.

In terms of the risk ratios \(exp(\beta)\) NH multirace respondents had 32% higher number of days when their health was poor, compared to NH whites. In practice, this translates into : 4.9008343 days for NH multirace, and 3.7127533 for NH whites.

Overdispersion

When using the Poisson GLM, you often run into overdispersion * What’s overdispersion? For the Poisson distribution, the mean and the variance are functions of one another (variance = mean for Poisson). So when you have more variability than you expect in your data, you have overdispersion. This basically says your data don’t fit your model, and is a problem because overdispersion leads to standard errors for our model parameters that are too small. But, we can fit other models that don’t make such assumptions, or allow there to be more variability.

An easy check on this is to compare the residual deviance to the residual degrees of freedom. They ratio should be 1 if the model fits the data.

NOTE

The svyglm() function includes a scaling term for overdispersion, so this is already taken into account. But if you have data that aren’t a complex survey, we can measure this ourselves using the residual deviance.

fit2<-glm(healthdays~factor(race_eth)+factor(educ)+factor(agec), data=brfss14, family=poisson)
summary(fit2)
## 
## Call:
## glm(formula = healthdays ~ factor(race_eth) + factor(educ) + 
##     factor(agec), family = poisson, data = brfss14)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.0654  -2.9851  -2.3241  -0.4635  11.6175  
## 
## Coefficients:
##                               Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)                   1.156496   0.008332  138.798  < 2e-16 ***
## factor(race_eth)hispanic      0.052632   0.003743   14.062  < 2e-16 ***
## factor(race_eth)nh black      0.101563   0.003379   30.061  < 2e-16 ***
## factor(race_eth)nh multirace  0.364852   0.007453   48.951  < 2e-16 ***
## factor(race_eth)nh other      0.061359   0.006134   10.002  < 2e-16 ***
## factor(educ)1somehs          -0.025025   0.006236   -4.013 5.99e-05 ***
## factor(educ)2hsgrad          -0.387541   0.005497  -70.499  < 2e-16 ***
## factor(educ)3somecol         -0.463713   0.005544  -83.637  < 2e-16 ***
## factor(educ)4colgrad         -0.964326   0.005607 -171.987  < 2e-16 ***
## factor(agec)(24,39]           0.311156   0.007030   44.264  < 2e-16 ***
## factor(agec)(39,59]           0.801326   0.006446  124.307  < 2e-16 ***
## factor(agec)(59,79]           0.933111   0.006416  145.441  < 2e-16 ***
## factor(agec)(79,99]           1.030352   0.007000  147.183  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2794795  on 232162  degrees of freedom
## Residual deviance: 2643018  on 232150  degrees of freedom
## AIC: 2948869
## 
## Number of Fisher Scoring iterations: 6
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 3.374161

The deviance can also be a test of model fit, using a chi square distribution, with degrees of freedom equal to the residual d.f. (n-p):

1-pchisq(fit2$deviance, df = fit2$df.residual)
## [1] 0

So, this p value is 0, which means the model does not fit the data.

Modeling Overdispersion via a “Quasi” distribution

For the Poisson , we can fit a “quasi” distribution that adds an extra parameter to allow the mean-variance relationship to not be constant. For Poisson we get:

\[var(Y) = \lambda * \phi\], instead of \[var(Y) = λ\]

This allows us to include a rough proxy for a “dispersion parameter” for the distribution. Naturally this is fixed at 1 for basic models, and estimated in the quasi models, we can look to see if is much bigger than 1. If overdispersion is present and not accounted for you could identify a relationship as being significant when it is not!

fit3<-glm(healthdays~factor(race_eth)+factor(educ)+factor(agec), data=brfss14, family=quasipoisson)
summary(fit3)
## 
## Call:
## glm(formula = healthdays ~ factor(race_eth) + factor(educ) + 
##     factor(agec), family = quasipoisson, data = brfss14)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.0654  -2.9851  -2.3241  -0.4635  11.6175  
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.15650    0.03447  33.550  < 2e-16 ***
## factor(race_eth)hispanic      0.05263    0.01548   3.399 0.000676 ***
## factor(race_eth)nh black      0.10156    0.01398   7.266 3.71e-13 ***
## factor(race_eth)nh multirace  0.36485    0.03084  11.832  < 2e-16 ***
## factor(race_eth)nh other      0.06136    0.02538   2.418 0.015616 *  
## factor(educ)1somehs          -0.02503    0.02580  -0.970 0.332027    
## factor(educ)2hsgrad          -0.38754    0.02274 -17.041  < 2e-16 ***
## factor(educ)3somecol         -0.46371    0.02294 -20.217  < 2e-16 ***
## factor(educ)4colgrad         -0.96433    0.02320 -41.573  < 2e-16 ***
## factor(agec)(24,39]           0.31116    0.02908  10.699  < 2e-16 ***
## factor(agec)(39,59]           0.80133    0.02667  30.047  < 2e-16 ***
## factor(agec)(59,79]           0.93311    0.02654  35.156  < 2e-16 ***
## factor(agec)(79,99]           1.03035    0.02896  35.577  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 17.11496)
## 
##     Null deviance: 2794795  on 232162  degrees of freedom
## Residual deviance: 2643018  on 232150  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 6

Other count models - Negative binomial

  • Of course, we could just fit other distributional models to our data, popular choices are:

  • Negative binomial – Effectively adds a shape parameter to Poisson

Now, R will not fit negative binomial models using survey design, so, we will fit them using sample weights only, then calculate the robust standard errors. We standardize the weights to equal the sample size, as opposed to the population size by dividing each person’s weight by the mean weight.

#First, I define a function to get the clustered, or robust standard errors.
#This function effectively controls for the within-strata homogeneity when
#calculateing the se's for the betas. 

#I stole this from: http://drewdimmery.com/robust-ses-in-r/
#and http://people.su.se/~ma/clustering.pdf
#I also added a correction to use this with the hurdle and zero-inflated models
#This is how stata gets robust se's

clx2 <-   function(fm, dfcw,  cluster){
    # R-codes (www.r-project.org) for computing
    # clustered-standard errors. Mahmood Arai, Jan 26, 2008.
    
    # The arguments of the function are:
    # fitted model, cluster1
    # You need to install libraries `sandwich' and `lmtest'
    
    # reweighting the var-cov matrix for the within model
    require(sandwich);require(lmtest)
    if(class(fm)=="zeroinfl"|class(fm)=="hurdle") {
    M <- length(unique(cluster))   
    N <- length(cluster)           
    K <- dim(fm$vcov)[1]        #here is the rank from the zero inflated fits             
    dfc <- (M/(M-1))*((N-1)/(N-K))  
    uj  <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum, na.rm=T));
    vcovCL <- dfc[1]*sandwich(fm, meat=crossprod(uj)/N)*dfcw #fix a length problem in dfc
    list(summary=coeftest(fm, vcovCL))}
    else if(class(fm)!="zeroinfl"){
    M <- length(unique(cluster))
    N <- length(cluster)
    K <- fm$rank
    dfc <- (M/(M - 1)) * ((N - 1)/(N - K))
    uj <- apply(estfun(fm), 2, function(x) tapply(x, cluster, sum, na.rm=T));
    rcse.cov <- dfc * sandwich(fm, meat = crossprod(uj)/N)
    rcse.se <- coeftest(fm, rcse.cov)
    return(list( rcse.se))}
}

Now, i’ll illustrate that this works:

#Fit poisson, and compare it to the fit from the survey design
#Fit the Poisson GLM
brfss14$wts<-brfss14$mmsawt/mean(brfss14$mmsawt, na.rm=T)
fit.pois<-glm(healthdays~factor(race_eth)+factor(educ)+factor(agec),
              data=brfss14[is.na(brfss14$healthdays)==F,],
              weights=wts, family=poisson)

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
clx2(fit.pois,cluster = brfss14$ststr[is.na(brfss14$healthdays)==F])
## Warning in if (class(fm) == "zeroinfl" | class(fm) == "hurdle") {: the
## condition has length > 1 and only the first element will be used
## Warning in if (class(fm) != "zeroinfl") {: the condition has length > 1 and
## only the first element will be used
## [[1]]
## 
## z test of coefficients:
## 
##                               Estimate Std. Error  z value  Pr(>|z|)    
## (Intercept)                   1.131367   0.061645  18.3528 < 2.2e-16 ***
## factor(race_eth)hispanic     -0.047376   0.041344  -1.1459  0.251843    
## factor(race_eth)nh black      0.033391   0.032419   1.0300  0.303020    
## factor(race_eth)nh multirace  0.277259   0.050204   5.5226 3.340e-08 ***
## factor(race_eth)nh other     -0.158203   0.050401  -3.1389  0.001696 ** 
## factor(educ)1somehs           0.023583   0.045174   0.5220  0.601643    
## factor(educ)2hsgrad          -0.362876   0.049478  -7.3340 2.233e-13 ***
## factor(educ)3somecol         -0.446416   0.046773  -9.5444 < 2.2e-16 ***
## factor(educ)4colgrad         -0.999160   0.047772 -20.9152 < 2.2e-16 ***
## factor(agec)(24,39]           0.346683   0.044956   7.7117 1.242e-14 ***
## factor(agec)(39,59]           0.790456   0.036060  21.9205 < 2.2e-16 ***
## factor(agec)(59,79]           0.973500   0.046823  20.7911 < 2.2e-16 ***
## factor(agec)(79,99]           0.995117   0.045684  21.7825 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(fit.pois, vcov=vcovHC(fit.pois, type="HC1",cluster="ststr"))
## 
## z test of coefficients:
## 
##                               Estimate Std. Error  z value  Pr(>|z|)    
## (Intercept)                   1.131367   0.057771  19.5835 < 2.2e-16 ***
## factor(race_eth)hispanic     -0.047376   0.029195  -1.6227  0.104651    
## factor(race_eth)nh black      0.033391   0.026957   1.2387  0.215468    
## factor(race_eth)nh multirace  0.277259   0.058562   4.7344 2.197e-06 ***
## factor(race_eth)nh other     -0.158203   0.058195  -2.7185  0.006558 ** 
## factor(educ)1somehs           0.023583   0.048754   0.4837  0.628593    
## factor(educ)2hsgrad          -0.362876   0.044853  -8.0904 5.947e-16 ***
## factor(educ)3somecol         -0.446416   0.045173  -9.8824 < 2.2e-16 ***
## factor(educ)4colgrad         -0.999160   0.045134 -22.1376 < 2.2e-16 ***
## factor(agec)(24,39]           0.346683   0.043277   8.0108 1.140e-15 ***
## factor(agec)(39,59]           0.790456   0.038711  20.4194 < 2.2e-16 ***
## factor(agec)(59,79]           0.973500   0.039314  24.7621 < 2.2e-16 ***
## factor(agec)(79,99]           0.995117   0.046918  21.2099 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#same as survey!

summary(fit1)
## 
## Call:
## svyglm(formula = healthdays ~ factor(race_eth) + factor(educ) + 
##     factor(agec), design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss14[is.na(brfss14$mmsawt) == 
##     F, ])
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.13137    0.05775  19.589  < 2e-16 ***
## factor(race_eth)hispanic     -0.04738    0.02919  -1.623  0.10453    
## factor(race_eth)nh black      0.03339    0.02695   1.239  0.21531    
## factor(race_eth)nh multirace  0.27726    0.05858   4.733 2.21e-06 ***
## factor(race_eth)nh other     -0.15820    0.05819  -2.719  0.00655 ** 
## factor(educ)1somehs           0.02358    0.04874   0.484  0.62850    
## factor(educ)2hsgrad          -0.36288    0.04484  -8.092 5.89e-16 ***
## factor(educ)3somecol         -0.44642    0.04516  -9.885  < 2e-16 ***
## factor(educ)4colgrad         -0.99916    0.04513 -22.140  < 2e-16 ***
## factor(agec)(24,39]           0.34668    0.04328   8.011 1.15e-15 ***
## factor(agec)(39,59]           0.79046    0.03871  20.418  < 2e-16 ***
## factor(agec)(59,79]           0.97350    0.03931  24.764  < 2e-16 ***
## factor(agec)(79,99]           0.99512    0.04692  21.209  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 16.45125)
## 
## Number of Fisher Scoring iterations: 7
#Fit the Negative Binomial GLM
library(MASS)
fit.nb2<-glm.nb(healthdays~factor(race_eth)+factor(educ)+factor(agec),
              data=brfss14[is.na(brfss14$healthdays)==F,],
              weights=wts)
clx2(fit.nb2,cluster = brfss14$ststr[is.na(brfss14$healthdays)==F])
## Warning in if (class(fm) == "zeroinfl" | class(fm) == "hurdle") {: the
## condition has length > 1 and only the first element will be used

## Warning in if (class(fm) == "zeroinfl" | class(fm) == "hurdle") {: the
## condition has length > 1 and only the first element will be used
## [[1]]
## 
## z test of coefficients:
## 
##                               Estimate Std. Error  z value  Pr(>|z|)    
## (Intercept)                   1.100408   0.061313  17.9475 < 2.2e-16 ***
## factor(race_eth)hispanic     -0.005741   0.041301  -0.1390 0.8894481    
## factor(race_eth)nh black      0.010638   0.034923   0.3046 0.7606515    
## factor(race_eth)nh multirace  0.225431   0.047871   4.7092 2.487e-06 ***
## factor(race_eth)nh other     -0.158556   0.048087  -3.2973 0.0009763 ***
## factor(educ)1somehs           0.049368   0.045236   1.0913 0.2751300    
## factor(educ)2hsgrad          -0.324384   0.049383  -6.5688 5.073e-11 ***
## factor(educ)3somecol         -0.397868   0.046042  -8.6414 < 2.2e-16 ***
## factor(educ)4colgrad         -0.949576   0.049284 -19.2673 < 2.2e-16 ***
## factor(agec)(24,39]           0.344512   0.045096   7.6395 2.180e-14 ***
## factor(agec)(39,59]           0.751957   0.036505  20.5988 < 2.2e-16 ***
## factor(agec)(59,79]           0.969254   0.048517  19.9774 < 2.2e-16 ***
## factor(agec)(79,99]           1.045166   0.042801  24.4192 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
## 
## z test of coefficients:
## 
##                               Estimate Std. Error  z value  Pr(>|z|)    
## (Intercept)                   1.100408   0.059259  18.5696 < 2.2e-16 ***
## factor(race_eth)hispanic     -0.005741   0.030555  -0.1879 0.8509624    
## factor(race_eth)nh black      0.010638   0.028818   0.3692 0.7120098    
## factor(race_eth)nh multirace  0.225431   0.057988   3.8876 0.0001013 ***
## factor(race_eth)nh other     -0.158556   0.055907  -2.8361 0.0045673 ** 
## factor(educ)1somehs           0.049368   0.052666   0.9374 0.3485631    
## factor(educ)2hsgrad          -0.324384   0.048377  -6.7053 2.009e-11 ***
## factor(educ)3somecol         -0.397868   0.048732  -8.1644 3.230e-16 ***
## factor(educ)4colgrad         -0.949576   0.048701 -19.4981 < 2.2e-16 ***
## factor(agec)(24,39]           0.344512   0.041360   8.3297 < 2.2e-16 ***
## factor(agec)(39,59]           0.751957   0.036910  20.3727 < 2.2e-16 ***
## factor(agec)(59,79]           0.969254   0.037536  25.8219 < 2.2e-16 ***
## factor(agec)(79,99]           1.045166   0.043797  23.8637 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Aggregate data

Typically when we get data aggregate data, they have a spatial component, meaning they are aggregated to some geography

  • Tract, block, county, ZIP code, etc.

  • These are typically either raw counts

    • Count of deaths, crimes, births, people with some characteristic (eg poverty) > numerator
  • Hopefully with a population at risk > denominator, or rates that someone has calculated

    • Mortality rates from NCHS or some census rate

GLM’s are a class of statistical models that link the mean of the distribution to a linear combination of covariates by some form of link function.

\[g(u) = g(E(Y)) = X ' \beta\]

Where g() is the link function. The link function for a binomial distribution is typically the logit, or:

\[g(u) = log(\frac{\pi}{1- \pi})\]

Modeling options

  • Inherently, we want to see what influences the rate in our areas

    • This is for aggregate data
  • So we want regression models for our area-based counts that properly model the independent variable

  • You have a few options – Binomial, Poisson, Normal, etc

#I subset to only US counties this time, for the sake of computational time.
spdat<-readOGR("/Users/ozd504//Google Drive/classes/dem7263/class17/data/usdata_mort.shp")
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/ozd504//Google Drive/classes/dem7263/class17/data/usdata_mort.shp", layer: "usdata_mort"
## with 3067 features
## It has 175 fields
## Integer64 fields read as strings:  pubassist SSI socsec poptot CountyCode
#spdat<-spdat[substr(spdat$COFIPS,1,2)=="48",]
spdat$population<-as.numeric(as.character(spdat$Population))
spdat$deaths<-as.numeric(as.character(spdat$Deaths))
#Create a k=4 nearest neighbor set
us.nb4<-knearneigh(coordinates(spdat), k=4)
us.nb4<-knn2nb(us.nb4)
us.nb4<-make.sym.nb(us.nb4)
us.wt4<-nb2listw(us.nb4, style="W")

Binomial model for counts

To use the Binomial model, you have two options Your observations are 1/0, we have done this before. This says that each area has the event or not OR Your observations are a combination of y and n, where y is a count of some event which has happened to some portion of the population at risk, n. For either of these, you are modeling the probability of observing an event, \(\pi\) or really the logit of the probability: \[log(\frac{\pi}{1- \pi}) = X' \beta\] * The good thing is that you have already done this! * All interpretation of parameters is the same as logistic regression

\[exp(\beta)\]

  • All model testing is the same too!
spdat$std<-fitted(lm(I(deaths/population)~ppopkids+ppop65plus+ppopwrknga-1, spdat))
fit.bin.us<-glm(cbind(deaths, population)~scale(ppersonspo)+scale(p65plus)+scale(pblack_1)+scale(phisp)+I(RUCC>=7), spdat, family = binomial)
summary(fit.bin.us)
## 
## Call:
## glm(formula = cbind(deaths, population) ~ scale(ppersonspo) + 
##     scale(p65plus) + scale(pblack_1) + scale(phisp) + I(RUCC >= 
##     7), family = binomial, data = spdat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -79.567   -1.739    0.970    3.895   42.784  
## 
## Coefficients:
##                     Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)       -4.6538250  0.0004283 -10865.054   <2e-16 ***
## scale(ppersonspo)  0.1001728  0.0004527    221.298   <2e-16 ***
## scale(p65plus)     0.1835270  0.0003114    589.301   <2e-16 ***
## scale(pblack_1)    0.0001039  0.0003830      0.271    0.786    
## scale(phisp)      -0.0905159  0.0003067   -295.087   <2e-16 ***
## I(RUCC >= 7)TRUE   0.0296354  0.0011100     26.697   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 816361  on 3066  degrees of freedom
## Residual deviance: 167335  on 3061  degrees of freedom
## AIC: 195182
## 
## Number of Fisher Scoring iterations: 4
round(exp(coef(fit.bin.us)), 3)
##       (Intercept) scale(ppersonspo)    scale(p65plus)   scale(pblack_1) 
##             0.010             1.105             1.201             1.000 
##      scale(phisp)  I(RUCC >= 7)TRUE 
##             0.913             1.030
lmtest::coeftest(fit.bin.us, vcov.=vcovHC(fit.bin.us, type="HC1"))
## 
## z test of coefficients:
## 
##                      Estimate  Std. Error   z value  Pr(>|z|)    
## (Intercept)       -4.65382502  0.00705478 -659.6697 < 2.2e-16 ***
## scale(ppersonspo)  0.10017275  0.00823192   12.1688 < 2.2e-16 ***
## scale(p65plus)     0.18352698  0.00925747   19.8247 < 2.2e-16 ***
## scale(pblack_1)    0.00010392  0.00557622    0.0186  0.985131    
## scale(phisp)      -0.09051587  0.00609788  -14.8438 < 2.2e-16 ***
## I(RUCC >= 7)TRUE   0.02963539  0.01032582    2.8700  0.004104 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
1-fit.bin.us$deviance/fit.bin.us$null.deviance
## [1] 0.7950239
#This should be the crude death rate, per 1,000 persons
1000 * mean(fitted(fit.bin.us))
## [1] 9.819147
hist(1000*fitted(fit.bin.us), main="US Crude Death Rate Distribution from Binomial Model")

spdat$fittedbin<-1000*fitted(fit.bin.us)
spplot(spdat, "fittedbin", at=quantile(spdat$fittedbin), col.regions=brewer.pal(5, "Blues"))

Poisson model for counts

For the Poisson model, you observe some count of events in each area (crimes, deaths, etc), and you also have some population at risk in each area. This is generally not the same for each area, but could be, you want a Poisson rate \((\lambda)\), which is really the average count (y). We have several ways of modeling the Poisson count:

Pure count If each area has the same risk set, Rate, include an offset term in your model to incorporate unequal risk

\[log(y)= X' \beta + log(n)\]

, where n is the population at risk in each area. This is called the offset term in the model. Standardized ratio incorporate differential exposure as an expected count

\[log(y)= X' \beta + log(E)\].

Again, all interpretation of parameters is done on a log scale, so

\[exp(\beta) = \text{ % change in the mean}\] , or % change in the rate, for a 1 unit change in X. All testing is done in the same manner.

#Form the expected count
spdat$E<-spdat$population * (sum(spdat$deaths)/sum(spdat$population))
fit.poi.us<-glm(deaths~offset(log(E))+scale(ppersonspo)+scale(p65plus)+scale(pblack_1)+scale(phisp)+I(RUCC>=7), spdat, family = poisson)
summary(fit.poi.us)
## 
## Call:
## glm(formula = deaths ~ offset(log(E)) + scale(ppersonspo) + scale(p65plus) + 
##     scale(pblack_1) + scale(phisp) + I(RUCC >= 7), family = poisson, 
##     data = spdat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -79.884   -1.752    0.991    3.908   43.043  
## 
## Coefficients:
##                     Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)        1.602e-01  4.262e-04  375.777   <2e-16 ***
## scale(ppersonspo)  1.002e-01  4.505e-04  222.432   <2e-16 ***
## scale(p65plus)     1.832e-01  3.093e-04  592.466   <2e-16 ***
## scale(pblack_1)    5.043e-05  3.812e-04    0.132    0.895    
## scale(phisp)      -9.061e-02  3.055e-04 -296.555   <2e-16 ***
## I(RUCC >= 7)TRUE   2.976e-02  1.104e-03   26.953   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 823082  on 3066  degrees of freedom
## Residual deviance: 168839  on 3061  degrees of freedom
## AIC: 196718
## 
## Number of Fisher Scoring iterations: 4
lmtest::coeftest(fit.poi.us, vcov.=vcovHC(fit.poi.us, type="HC1"))
## 
## z test of coefficients:
## 
##                      Estimate  Std. Error  z value  Pr(>|z|)    
## (Intercept)        1.6015e-01  7.0463e-03  22.7282 < 2.2e-16 ***
## scale(ppersonspo)  1.0021e-01  8.2205e-03  12.1900 < 2.2e-16 ***
## scale(p65plus)     1.8324e-01  9.2344e-03  19.8437 < 2.2e-16 ***
## scale(pblack_1)    5.0431e-05  5.5724e-03   0.0091  0.992779    
## scale(phisp)      -9.0609e-02  6.0992e-03 -14.8560 < 2.2e-16 ***
## I(RUCC >= 7)TRUE   2.9764e-02  1.0322e-02   2.8836  0.003932 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
1-fit.poi.us$deviance/fit.poi.us$null.deviance
## [1] 0.7948695
scale<-sqrt(fit.poi.us$deviance/fit.poi.us$df.residual)
scale
## [1] 7.426855
1-pchisq(fit.poi.us$deviance, df = fit.poi.us$df.residual)
## [1] 0
round(exp(coef(fit.poi.us)), 3)
##       (Intercept) scale(ppersonspo)    scale(p65plus)   scale(pblack_1) 
##             1.174             1.105             1.201             1.000 
##      scale(phisp)  I(RUCC >= 7)TRUE 
##             0.913             1.030
hist(fitted(fit.poi.us)/spdat$E,main="US SMR Distribution from Poisson Model" )

Other 2 - parameter distributions

  • Of course, we could just fit other distributional models to our data, popular choices are:
  • Normal – y has mean and variance
  • Negative binomial – Effectively adds a shape parameter to Poisson
library(MASS)
fit.nb.us<-glm.nb(deaths~offset(log(E))+scale(ppersonspo)+scale(p65plus)+scale(pblack_1)+scale(phisp)+I(RUCC>=7), spdat)
summary(fit.nb.us)
## 
## Call:
## glm.nb(formula = deaths ~ offset(log(E)) + scale(ppersonspo) + 
##     scale(p65plus) + scale(pblack_1) + scale(phisp) + I(RUCC >= 
##     7), data = spdat, init.theta = 65.9681801, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.4349  -0.5437   0.0729   0.6057   5.6327  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        0.205725   0.003305  62.244  < 2e-16 ***
## scale(ppersonspo)  0.085633   0.002926  29.268  < 2e-16 ***
## scale(p65plus)     0.214550   0.002607  82.298  < 2e-16 ***
## scale(pblack_1)    0.008369   0.002777   3.013  0.00259 ** 
## scale(phisp)      -0.059229   0.002601 -22.769  < 2e-16 ***
## I(RUCC >= 7)TRUE  -0.025314   0.005338  -4.742 2.11e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(65.9682) family taken to be 1)
## 
##     Null deviance: 12374.2  on 3066  degrees of freedom
## Residual deviance:  3164.4  on 3061  degrees of freedom
## AIC: 40724
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  65.97 
##           Std. Err.:  1.85 
## 
##  2 x log-likelihood:  -40709.96
 1-fit.nb.us$deviance/fit.nb.us$null.deviance
## [1] 0.7442778
lmtest::coeftest(fit.nb.us, vcov.=vcovHC(fit.nb.us, type="HC5"))
## 
## z test of coefficients:
## 
##                     Estimate Std. Error  z value  Pr(>|z|)    
## (Intercept)        0.2057253  0.0034945  58.8709 < 2.2e-16 ***
## scale(ppersonspo)  0.0856332  0.0035158  24.3569 < 2.2e-16 ***
## scale(p65plus)     0.2145501  0.0043119  49.7578 < 2.2e-16 ***
## scale(pblack_1)    0.0083686  0.0027369   3.0576  0.002231 ** 
## scale(phisp)      -0.0592291  0.0034669 -17.0841 < 2.2e-16 ***
## I(RUCC >= 7)TRUE  -0.0253141  0.0062090  -4.0770 4.562e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
round(exp(coef(fit.nb.us)), 3)
##       (Intercept) scale(ppersonspo)    scale(p65plus)   scale(pblack_1) 
##             1.228             1.089             1.239             1.008 
##      scale(phisp)  I(RUCC >= 7)TRUE 
##             0.942             0.975
hist(fitted(fit.nb.us)/spdat$E,main="US SMR Distribution from Negative Binomial Model" )