## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
library(knitr)
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, tidy = FALSE,
               echo = TRUE, fig.width = 7, fig.height = 7)
options(width = 116, scipen = 10)
setwd("~/Documents/HSPH/useR.at.HSPH/")
Negative binomial regression is an alternative to Poisson regression when the count outcome is overdispersed, i.e., \( \mathrm{Var}(Y_i) > E(Y_i) \).
library(foreign)  # read.dta() imports the Stata data set
dat <- read.dta("http://www.ats.ucla.edu/stat/stata/dae/nb_data.dta")
## Convert program type and student id to factors
dat <- within(dat, {
    prog <- factor(prog, levels = 1:3, labels = c("General", "Academic", "Vocational"))
    id <- factor(id)
})
summary(dat)
       id         gender        math        daysabs            prog
 1001   :  1   female:160   Min.   : 1.0   Min.   : 0.00   General   : 40
 1002   :  1   male  :154   1st Qu.:28.0   1st Qu.: 1.00   Academic  :167
 1003   :  1                Median :48.0   Median : 4.00   Vocational:107
 1004   :  1                Mean   :48.3   Mean   : 5.96
 1005   :  1                3rd Qu.:70.0   3rd Qu.: 8.00
 1006   :  1                Max.   :99.0   Max.   :35.00
 (Other):308
head(dat)
    id gender math daysabs     prog
1 1001   male   63       4 Academic
2 1002   male   27       4 Academic
3 1003 female   20       2 Academic
4 1004 female   16       3 Academic
5 1005 female    2       3 Academic
6 1006 female   71      13 Academic
library(ggplot2)
## Histograms of days absent by program (margins = TRUE adds a panel pooling all programs)
ggplot(data = dat, aes(x = daysabs, fill = prog)) +
    geom_histogram(binwidth = 1) +
    facet_grid(prog ~ ., margins = TRUE, scales = "free")
## Mean and SD of days absent within each program
with(dat, tapply(daysabs, prog, function(x) {
sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))
General Academic Vocational
"M (SD) = 10.65 (8.20)" "M (SD) = 6.93 (7.45)" "M (SD) = 2.67 (3.73)"
## Poisson regression of days absent on math score and program type
model.poisson <- glm(daysabs ~ math + prog, family = poisson, data = dat)
summary(model.poisson)
Call:
glm(formula = daysabs ~ math + prog, family = poisson, data = dat)
Deviance Residuals:
Min 1Q Median 3Q Max
-4.260 -2.204 -0.919 0.651 7.423
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.651974 0.060736 43.66 < 2e-16 ***
math -0.006808 0.000931 -7.31 2.6e-13 ***
progAcademic -0.439897 0.056672 -7.76 8.3e-15 ***
progVocational -1.281364 0.077886 -16.45 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 2217.7 on 313 degrees of freedom
Residual deviance: 1774.0 on 310 degrees of freedom
AIC: 2665
Number of Fisher Scoring iterations: 5
Goodness of fit for the Poisson model: the residual deviance (1774) is far larger than the residual degrees of freedom (310), so the deviance goodness-of-fit test strongly rejects the Poisson model, consistent with overdispersion.
list(residual.deviance = deviance(model.poisson),
     residual.degrees.of.freedom = df.residual(model.poisson),
     chisq.p.value = pchisq(deviance(model.poisson), df.residual(model.poisson),
                            lower.tail = FALSE))
$residual.deviance
[1] 1774
$residual.degrees.of.freedom
[1] 310
$chisq.p.value
[1] 2.302e-203
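A related informal diagnostic is the Pearson dispersion statistic, the Pearson chi-square divided by the residual degrees of freedom; values well above 1 point to overdispersion. A minimal sketch using the fitted Poisson model (not part of the original output):
## Pearson dispersion statistic; values much greater than 1 suggest overdispersion
sum(residuals(model.poisson, type = "pearson")^2) / df.residual(model.poisson)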
library(MASS)  # glm.nb() for negative binomial regression
model.negative.binomial <- glm.nb(daysabs ~ math + prog, data = dat)
summary(model.negative.binomial)
Call:
glm.nb(formula = daysabs ~ math + prog, data = dat, init.theta = 1.032713156,
link = log)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.155 -1.019 -0.369 0.229 2.527
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.61527 0.19746 13.24 < 2e-16 ***
math -0.00599 0.00251 -2.39 0.017 *
progAcademic -0.44076 0.18261 -2.41 0.016 *
progVocational -1.27865 0.20072 -6.37 0.00000000019 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Negative Binomial(1.033) family taken to be 1)
Null deviance: 427.54 on 313 degrees of freedom
Residual deviance: 358.52 on 310 degrees of freedom
AIC: 1741
Number of Fisher Scoring iterations: 1
Theta: 1.033
Std. Err.: 0.106
2 x log-likelihood: -1731.258
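For interpretation on the count scale, the negative binomial coefficients can be exponentiated to give incidence rate ratios. A hedged sketch (confint() uses profile likelihood and may take a moment; this is not part of the original output):
## Incidence rate ratios with profile-likelihood 95% confidence intervals
exp(cbind(IRR = coef(model.negative.binomial), confint(model.negative.binomial)))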
Goodness of fit for the negative binomial model: the fit is much improved, although the deviance test still suggests some departure from the model (p ≈ 0.03).
list(residual.deviance = deviance(model.negative.binomial),
     residual.degrees.of.freedom = df.residual(model.negative.binomial),
     chisq.p.value = pchisq(deviance(model.negative.binomial), df.residual(model.negative.binomial),
                            lower.tail = FALSE))
$residual.deviance
[1] 358.5
$residual.degrees.of.freedom
[1] 310
$chisq.p.value
[1] 0.02994
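Because the Poisson model corresponds to the negative binomial model with the dispersion parameter theta pushed to infinity, the two fits can also be compared with a likelihood-ratio test; since theta sits on the boundary under the null, the usual chi-square(1) p-value is conservative. A minimal sketch, assuming both models fitted above:
## Likelihood-ratio test of Poisson (restricted) vs. negative binomial (unrestricted);
## the boundary issue makes this chi-square(1) p-value conservative
lr.stat <- 2 * (logLik(model.negative.binomial) - logLik(model.poisson))
pchisq(as.numeric(lr.stat), df = 1, lower.tail = FALSE)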