Negative Binomial Regression

library(tidyverse)
library(MASS)
library(knitr)

Number of Days Absent

daysAbs <- read.csv("https://raw.githubusercontent.com/RWorkshop/workshopdatasets/master/negbin.csv")
daysAbs <- daysAbs %>% mutate(prog=as.factor(prog))
head(daysAbs) %>% kable()

X	id	gender	math	daysabs	prog
1	1001	male	63	4	Academic
2	1002	male	27	4	Academic
3	1003	female	20	2	Academic
4	1004	female	16	3	Academic
5	1005	female	2	3	Academic
6	1006	female	71	13	Academic

summary(daysAbs)

##        X                id          gender               math      
##  Min.   :  1.00   Min.   :1001   Length:314         Min.   : 1.00  
##  1st Qu.: 79.25   1st Qu.:1079   Class :character   1st Qu.:28.00  
##  Median :157.50   Median :1158   Mode  :character   Median :48.00  
##  Mean   :157.50   Mean   :1576                      Mean   :48.27  
##  3rd Qu.:235.75   3rd Qu.:2078                      3rd Qu.:70.00  
##  Max.   :314.00   Max.   :2157                      Max.   :99.00  
##     daysabs               prog    
##  Min.   : 0.000   Academic  :167  
##  1st Qu.: 1.000   General   : 40  
##  Median : 4.000   Vocational:107  
##  Mean   : 5.955                   
##  3rd Qu.: 8.000                   
##  Max.   :35.000

Checking Dispersion

# Checking Dispersion

daysAbs %>% group_by(prog) %>% 
  summarize(MEAN  = mean(daysabs),
            VAR = var(daysabs),
            SD = sd(daysabs))

## # A tibble: 3 x 4
##   prog        MEAN   VAR    SD
##   <fct>      <dbl> <dbl> <dbl>
## 1 Academic    6.93  55.4  7.45
## 2 General    10.6   67.3  8.20
## 3 Vocational  2.67  13.9  3.73

# Old Code - reporting tool

with(daysAbs, tapply(daysabs, prog, function(x) {
    sprintf("Mean (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))

##                   Academic                    General 
##  "Mean (SD) = 6.93 (7.45)" "Mean (SD) = 10.65 (8.20)" 
##                 Vocational 
##  "Mean (SD) = 2.67 (3.73)"

Comparing Groups

# Comparing Groups

# Overlaid histograms
ggplot(daysAbs, aes(x=daysabs, fill=prog)) +
    geom_histogram(binwidth=.5, alpha=.5, position="identity")

# Interleaved histograms
ggplot(daysAbs, aes(x=daysabs, fill=prog)) +
    geom_histogram(binwidth=.5, position="dodge")

# Density plots
ggplot(daysAbs, aes(x=daysabs, colour=prog)) + 
  geom_density(lwd=1.5) + 
  theme_bw()

# Density plots with semi-transparent fill
ggplot(daysAbs, aes(x=daysabs, fill=prog)) + geom_density(alpha=.3)

summary(m1 <- glm.nb(daysabs ~ math + prog, data = daysAbs))

## 
## Call:
## glm.nb(formula = daysabs ~ math + prog, data = daysAbs, init.theta = 1.032713156, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1547  -1.0192  -0.3694   0.2285   2.5273  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     2.174505   0.133411  16.299  < 2e-16 ***
## math           -0.005993   0.002505  -2.392   0.0167 *  
## progGeneral     0.440760   0.182610   2.414   0.0158 *  
## progVocational -0.837891   0.144087  -5.815 6.06e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.0327) family taken to be 1)
## 
##     Null deviance: 427.54  on 313  degrees of freedom
## Residual deviance: 358.52  on 310  degrees of freedom
## AIC: 1741.3
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.033 
##           Std. Err.:  0.106 
## 
##  2 x log-likelihood:  -1731.258

AIC(m1)

## [1] 1741.258

Updated Model

## math only
m2 <- update(m1, . ~ . - prog)
summary(m2)

## 
## Call:
## glm.nb(formula = daysabs ~ math, data = daysAbs, init.theta = 0.8558564931, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0601  -1.1140  -0.3448   0.2499   2.3074  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.24663    0.13916  16.145  < 2e-16 ***
## math        -0.01034    0.00259  -3.991 6.58e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.8559) family taken to be 1)
## 
##     Null deviance: 375.05  on 313  degrees of freedom
## Residual deviance: 357.90  on 312  degrees of freedom
## AIC: 1782.3
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.8559 
##           Std. Err.:  0.0829 
## 
##  2 x log-likelihood:  -1776.3060

AIC(m2)

## [1] 1782.306

Likelihood Ratio Test

Favour the model that uses both “math” and “prog”

anova(m1, m2)

## Likelihood ratio tests of Negative Binomial Models
## 
## Response: daysabs
##         Model     theta Resid. df    2 x log-lik.   Test    df LR stat.
## 1        math 0.8558565       312       -1776.306                      
## 2 math + prog 1.0327132       310       -1731.258 1 vs 2     2 45.04798
##       Pr(Chi)
## 1            
## 2 1.65179e-10

Poisson Regression

m3 <- glm(daysabs ~ math + prog, family = "poisson", data = daysAbs)
summary(m3)

## 
## Call:
## glm(formula = daysabs ~ math + prog, family = "poisson", data = daysAbs)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.2597  -2.2038  -0.9193   0.6511   7.4233  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     2.212076   0.046292  47.785  < 2e-16 ***
## math           -0.006808   0.000931  -7.313 2.62e-13 ***
## progGeneral     0.439897   0.056672   7.762 8.35e-15 ***
## progVocational -0.841467   0.067888 -12.395  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2217.7  on 313  degrees of freedom
## Residual deviance: 1774.0  on 310  degrees of freedom
## AIC: 2665.3
## 
## Number of Fisher Scoring iterations: 5

AIC(m3)

## [1] 2665.285

#models are of different object classes, so LRT is harder to implement

pchisq(2 * (logLik(m1) - logLik(m3)), df = 1, lower.tail = FALSE)

## 'log Lik.' 2.157298e-203 (df=5)

Model Estimates

(est <- cbind(Estimate = coef(m1), confint(m1)))

## Waiting for profiling to be done...

##                    Estimate       2.5 %       97.5 %
## (Intercept)     2.174505434  1.92112452  2.437222770
## math           -0.005992988 -0.01090086 -0.001066615
## progGeneral     0.440760012  0.09264348  0.810065863
## progVocational -0.837890709 -1.12381766 -0.550113017

exp(est)

##                 Estimate     2.5 %     97.5 %
## (Intercept)    8.7978329 6.8286331 11.4412217
## math           0.9940249 0.9891583  0.9989340
## progGeneral    1.5538877 1.0970705  2.2480560
## progVocational 0.4326221 0.3250365  0.5768846

Contrived Data Set

newdata1 <- data.frame(math = fivenum(daysAbs$math)[c(1,3,5)], 
               prog = factor(1:3, levels = 1:3, 
                             labels = levels(daysAbs$prog)))

predict(m1, newdata1)

##         1         2         3 
## 2.1685124 2.3276020 0.7433089

newdata1$phat <- predict(m1, newdata1, type = "response")
newdata1

##   math       prog      phat
## 1    1   Academic  8.745265
## 2   48    General 10.253325
## 3   99 Vocational  2.102882

Another Prediction

newdata2 <- data.frame(
  math = rep(seq(from = min(daysAbs$math), 
                 to = max(daysAbs$math), length.out = 100), 3),
  prog = factor(rep(1:3, each = 100), levels = 1:3, labels =
  levels(daysAbs$prog)))

newdata2 <- cbind(newdata2, predict(m1, newdata2, type = "link", se.fit=TRUE))
newdata2 <- within(newdata2, {
  DaysAbsent <- exp(fit)
  LL <- exp(fit - 1.96 * se.fit)
  UL <- exp(fit + 1.96 * se.fit)
})

ggplot(newdata2, aes(math, DaysAbsent)) +
  geom_ribbon(aes(ymin = LL, ymax = UL, fill = prog), alpha = .25) +
  geom_line(aes(colour = prog), size = 2) +
  labs(x = "Math Score", y = "Predicted Days Absent")