library(tidyverse)
library(MASS)
library(knitr)
daysAbs <- read.csv("https://raw.githubusercontent.com/RWorkshop/workshopdatasets/master/negbin.csv")
daysAbs <- daysAbs %>% mutate(prog=as.factor(prog))
head(daysAbs) %>% kable()
X | id | gender | math | daysabs | prog |
---|---|---|---|---|---|
1 | 1001 | male | 63 | 4 | Academic |
2 | 1002 | male | 27 | 4 | Academic |
3 | 1003 | female | 20 | 2 | Academic |
4 | 1004 | female | 16 | 3 | Academic |
5 | 1005 | female | 2 | 3 | Academic |
6 | 1006 | female | 71 | 13 | Academic |
summary(daysAbs)
## X id gender math
## Min. : 1.00 Min. :1001 Length:314 Min. : 1.00
## 1st Qu.: 79.25 1st Qu.:1079 Class :character 1st Qu.:28.00
## Median :157.50 Median :1158 Mode :character Median :48.00
## Mean :157.50 Mean :1576 Mean :48.27
## 3rd Qu.:235.75 3rd Qu.:2078 3rd Qu.:70.00
## Max. :314.00 Max. :2157 Max. :99.00
## daysabs prog
## Min. : 0.000 Academic :167
## 1st Qu.: 1.000 General : 40
## Median : 4.000 Vocational:107
## Mean : 5.955
## 3rd Qu.: 8.000
## Max. :35.000
# Checking Dispersion
daysAbs %>% group_by(prog) %>%
summarize(MEAN = mean(daysabs),
VAR = var(daysabs),
SD = sd(daysabs))
## # A tibble: 3 x 4
## prog MEAN VAR SD
## <fct> <dbl> <dbl> <dbl>
## 1 Academic 6.93 55.4 7.45
## 2 General 10.6 67.3 8.20
## 3 Vocational 2.67 13.9 3.73
# Old Code - reporting tool
with(daysAbs, tapply(daysabs, prog, function(x) {
sprintf("Mean (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))
## Academic General
## "Mean (SD) = 6.93 (7.45)" "Mean (SD) = 10.65 (8.20)"
## Vocational
## "Mean (SD) = 2.67 (3.73)"
# Comparing Groups
# Overlaid histograms
ggplot(daysAbs, aes(x=daysabs, fill=prog)) +
geom_histogram(binwidth=.5, alpha=.5, position="identity")
# Interleaved histograms
ggplot(daysAbs, aes(x=daysabs, fill=prog)) +
geom_histogram(binwidth=.5, position="dodge")
# Density plots
ggplot(daysAbs, aes(x=daysabs, colour=prog)) +
geom_density(lwd=1.5) +
theme_bw()
# Density plots with semi-transparent fill
ggplot(daysAbs, aes(x=daysabs, fill=prog)) + geom_density(alpha=.3)
summary(m1 <- glm.nb(daysabs ~ math + prog, data = daysAbs))
##
## Call:
## glm.nb(formula = daysabs ~ math + prog, data = daysAbs, init.theta = 1.032713156,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1547 -1.0192 -0.3694 0.2285 2.5273
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.174505 0.133411 16.299 < 2e-16 ***
## math -0.005993 0.002505 -2.392 0.0167 *
## progGeneral 0.440760 0.182610 2.414 0.0158 *
## progVocational -0.837891 0.144087 -5.815 6.06e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.0327) family taken to be 1)
##
## Null deviance: 427.54 on 313 degrees of freedom
## Residual deviance: 358.52 on 310 degrees of freedom
## AIC: 1741.3
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.033
## Std. Err.: 0.106
##
## 2 x log-likelihood: -1731.258
AIC(m1)
## [1] 1741.258
## math only
m2 <- update(m1, . ~ . - prog)
summary(m2)
##
## Call:
## glm.nb(formula = daysabs ~ math, data = daysAbs, init.theta = 0.8558564931,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0601 -1.1140 -0.3448 0.2499 2.3074
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.24663 0.13916 16.145 < 2e-16 ***
## math -0.01034 0.00259 -3.991 6.58e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.8559) family taken to be 1)
##
## Null deviance: 375.05 on 313 degrees of freedom
## Residual deviance: 357.90 on 312 degrees of freedom
## AIC: 1782.3
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.8559
## Std. Err.: 0.0829
##
## 2 x log-likelihood: -1776.3060
AIC(m2)
## [1] 1782.306
Favour the model that uses both “math” and “prog”
anova(m1, m2)
## Likelihood ratio tests of Negative Binomial Models
##
## Response: daysabs
## Model theta Resid. df 2 x log-lik. Test df LR stat.
## 1 math 0.8558565 312 -1776.306
## 2 math + prog 1.0327132 310 -1731.258 1 vs 2 2 45.04798
## Pr(Chi)
## 1
## 2 1.65179e-10
m3 <- glm(daysabs ~ math + prog, family = "poisson", data = daysAbs)
summary(m3)
##
## Call:
## glm(formula = daysabs ~ math + prog, family = "poisson", data = daysAbs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2597 -2.2038 -0.9193 0.6511 7.4233
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.212076 0.046292 47.785 < 2e-16 ***
## math -0.006808 0.000931 -7.313 2.62e-13 ***
## progGeneral 0.439897 0.056672 7.762 8.35e-15 ***
## progVocational -0.841467 0.067888 -12.395 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2217.7 on 313 degrees of freedom
## Residual deviance: 1774.0 on 310 degrees of freedom
## AIC: 2665.3
##
## Number of Fisher Scoring iterations: 5
AIC(m3)
## [1] 2665.285
#models are of different object classes, so LRT is harder to implement
pchisq(2 * (logLik(m1) - logLik(m3)), df = 1, lower.tail = FALSE)
## 'log Lik.' 2.157298e-203 (df=5)
(est <- cbind(Estimate = coef(m1), confint(m1)))
## Waiting for profiling to be done...
## Estimate 2.5 % 97.5 %
## (Intercept) 2.174505434 1.92112452 2.437222770
## math -0.005992988 -0.01090086 -0.001066615
## progGeneral 0.440760012 0.09264348 0.810065863
## progVocational -0.837890709 -1.12381766 -0.550113017
exp(est)
## Estimate 2.5 % 97.5 %
## (Intercept) 8.7978329 6.8286331 11.4412217
## math 0.9940249 0.9891583 0.9989340
## progGeneral 1.5538877 1.0970705 2.2480560
## progVocational 0.4326221 0.3250365 0.5768846
newdata1 <- data.frame(math = fivenum(daysAbs$math)[c(1,3,5)],
prog = factor(1:3, levels = 1:3,
labels = levels(daysAbs$prog)))
predict(m1, newdata1)
## 1 2 3
## 2.1685124 2.3276020 0.7433089
newdata1$phat <- predict(m1, newdata1, type = "response")
newdata1
## math prog phat
## 1 1 Academic 8.745265
## 2 48 General 10.253325
## 3 99 Vocational 2.102882
newdata2 <- data.frame(
math = rep(seq(from = min(daysAbs$math),
to = max(daysAbs$math), length.out = 100), 3),
prog = factor(rep(1:3, each = 100), levels = 1:3, labels =
levels(daysAbs$prog)))
newdata2 <- cbind(newdata2, predict(m1, newdata2, type = "link", se.fit=TRUE))
newdata2 <- within(newdata2, {
DaysAbsent <- exp(fit)
LL <- exp(fit - 1.96 * se.fit)
UL <- exp(fit + 1.96 * se.fit)
})
ggplot(newdata2, aes(math, DaysAbsent)) +
geom_ribbon(aes(ymin = LL, ymax = UL, fill = prog), alpha = .25) +
geom_line(aes(colour = prog), size = 2) +
labs(x = "Math Score", y = "Predicted Days Absent")