1 input data

dtahw2 <- read.csv("days_absent.csv")
head(dtahw2)
##      ID Gender  Program Math Days
## 1 S1001   male Academic   63    4
## 2 S1002   male Academic   27    4
## 3 S1003 female Academic   20    2
## 4 S1004 female Academic   16    3
## 5 S1005 female Academic    2    3
## 6 S1006 female Academic   71   13
str(dtahw2)
## 'data.frame':    314 obs. of  5 variables:
##  $ ID     : chr  "S1001" "S1002" "S1003" "S1004" ...
##  $ Gender : chr  "male" "male" "female" "female" ...
##  $ Program: chr  "Academic" "Academic" "Academic" "Academic" ...
##  $ Math   : int  63 27 20 16 2 71 63 3 51 49 ...
##  $ Days   : int  4 4 2 3 3 13 11 7 10 9 ...

2 Days absent from school

library(ggplot2)
ggplot(dtahw2, 
       aes(Days)) +
 geom_histogram(binwidth = 1, fill=8, col=1)+
 facet_grid(Gender ~ Program) +
 labs(x="Days absent", 
      y="Frequency") +
 theme_minimal()+
 theme(legend.position = c(.95, .9))

3 means and variance

knitr::kable(aggregate(cbind(Days, Math)~ Gender+Program, data=dtahw2, FUN=mean))
Gender Program Days Math
female Academic 7.759036 43.69880
male Academic 6.119048 41.50000
female General 12.363636 44.40909
male General 8.555556 45.33333
female Vocational 2.709091 58.00000
male Vocational 2.634615 58.84615
knitr::kable(aggregate(cbind(Days, Math)~ Gender+Program, data=dtahw2, FUN=var))
Gender Program Days Math
female Academic 68.55098 586.7252
male Academic 41.81698 689.8193
female General 84.33766 804.5390
male General 41.67320 923.5294
female Vocational 10.28418 512.4815
male Vocational 18.07956 364.9170

4 Effects of Program, gender, and math

ggplot(dtahw2, 
       aes(Math, Days, 
           color=Gender)) +
 stat_smooth(method="glm",
             formula=y ~ x,
             method.args=list(family=poisson),
             size=rel(.5)) +
 geom_point(pch=20, alpha=.5) +
 facet_grid(. ~ Program) +
 labs(y="Days absent", 
      x="Math score") +
 theme_minimal()+
 theme(legend.position = c(.95, .9))

5 Parameter estimates

sjPlot::tab_model(m0 <- glm(Days ~ Program*Gender*Math, data=dtahw2, 
                            family=poisson(link=log)), show.se=T, show.r2=F, show.obs=F)
  Days
Predictors Incidence Rate Ratios std. Error CI p
(Intercept) 10.43 0.08 8.94 – 12.14 <0.001
Program [General] 1.47 0.13 1.13 – 1.91 0.004
Program [Vocational] 0.23 0.25 0.14 – 0.36 <0.001
Gender [male] 0.85 0.11 0.69 – 1.06 0.146
Math 0.99 0.00 0.99 – 1.00 <0.001
Program [General] *
Gender [male]
0.97 0.20 0.65 – 1.43 0.863
Program [Vocational] *
Gender [male]
1.59 0.37 0.76 – 3.27 0.214
Program [General] * Math 1.00 0.00 1.00 – 1.01 0.479
Program [Vocational] *
Math
1.01 0.00 1.00 – 1.02 0.019
Gender [male] * Math 1.00 0.00 0.99 – 1.00 0.274
(Program [General]
Gender [male])
Math
1.00 0.00 0.99 – 1.01 0.692
(Program [Vocational]
Gender [male])
Math
1.00 0.01 0.98 – 1.01 0.634

6 Goodness of fit and overdispersion

#
1 - pchisq(deviance(m0), df.residual(m0))
## [1] 0
#
performance::check_overdispersion(m0)
## # Overdispersion test
## 
##        dispersion ratio =    6.492
##   Pearson's Chi-Squared = 1960.446
##                 p-value =  < 0.001
## Overdispersion detected.
#
plot(log(fitted(m0)), log((dtahw2$Days-fitted(m0))^2), 
     xlab=expression(hat(mu)),
     ylab=expression((y-hat(mu))^2))
grid()
abline(0, 1, col=2)

7 Dispersion parameter

knitr::kable(as.data.frame(summary(m0, dispersion=6.492)$coef))
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.3450136 0.1989867 11.7847779 0.0000000
ProgramGeneral 0.3873043 0.3411743 1.1352096 0.2562875
ProgramVocational -1.4878389 0.6257345 -2.3777477 0.0174187
Gendermale -0.1588403 0.2785265 -0.5702878 0.5684825
Math -0.0071045 0.0043252 -1.6425893 0.1004679
ProgramGeneral:Gendermale -0.0347250 0.5144980 -0.0674929 0.9461893
ProgramVocational:Gendermale 0.4617659 0.9469746 0.4876222 0.6258175
ProgramGeneral:Math 0.0019809 0.0071360 0.2775861 0.7813301
ProgramVocational:Math 0.0094842 0.0103312 0.9180121 0.3586125
Gendermale:Math -0.0026914 0.0062654 -0.4295662 0.6675112
ProgramGeneral:Gendermale:Math -0.0017088 0.0110050 -0.1552787 0.8766016
ProgramVocational:Gendermale:Math -0.0029735 0.0159262 -0.1867067 0.8518906

8 Testing for terms in the model with dispersion

drop1(m0, test="F")
## Warning in drop1.glm(m0, test = "F"): F test assumes 'quasipoisson' family
## Single term deletions
## 
## Model:
## Days ~ Program * Gender * Math
##                     Df Deviance    AIC F value Pr(>F)
## <none>                   1730.4 2637.7               
## Program:Gender:Math  2   1730.7 2634.0  0.0274  0.973

9 Selecting a quasipoisson model

sjPlot::tab_model(m1 <- update(m0, . ~ . - Gender:Math:Program, family=quasipoisson), 
                  show.se=T, show.r2=F, show.obs=F)
  Days
Predictors Incidence Rate Ratios std. Error CI p
(Intercept) 10.27 0.18 7.08 – 14.59 <0.001
Program [General] 1.52 0.28 0.86 – 2.63 0.144
Program [Vocational] 0.24 0.50 0.09 – 0.61 0.005
Gender [male] 0.88 0.24 0.55 – 1.40 0.586
Math 0.99 0.00 0.99 – 1.00 0.084
Program [General] *
Gender [male]
0.91 0.30 0.50 – 1.61 0.738
Program [Vocational] *
Gender [male]
1.36 0.35 0.68 – 2.71 0.386
Program [General] * Math 1.00 0.01 0.99 – 1.01 0.819
Program [Vocational] *
Math
1.01 0.01 0.99 – 1.02 0.296
Gender [male] * Math 1.00 0.00 0.99 – 1.01 0.469

10 Drop all possible single terms to a model

drop1(m1, test='F')
## Single term deletions
## 
## Model:
## Days ~ Program + Gender + Math + Program:Gender + Program:Math + 
##     Gender:Math
##                Df Deviance F value Pr(>F)
## <none>              1730.7               
## Program:Gender  2   1737.5  0.5943 0.5526
## Program:Math    2   1737.9  0.6333 0.5315
## Gender:Math     1   1734.1  0.5952 0.4410

11 Final quasipoisson model

sjPlot::tab_model(m2 <- update(m1, . ~ . - Math:Program, family=quasipoisson), show.se=T, show.r2=F, show.obs=F)
  Days
Predictors Incidence Rate Ratios std. Error CI p
(Intercept) 9.71 0.16 6.99 – 13.30 <0.001
Program [General] 1.60 0.18 1.10 – 2.27 0.012
Program [Vocational] 0.38 0.24 0.23 – 0.59 <0.001
Gender [male] 0.89 0.24 0.56 – 1.41 0.628
Math 0.99 0.00 0.99 – 1.00 0.097
Program [General] *
Gender [male]
0.90 0.30 0.50 – 1.60 0.722
Program [Vocational] *
Gender [male]
1.35 0.35 0.68 – 2.67 0.386
Gender [male] * Math 1.00 0.00 0.99 – 1.01 0.440

12 Overdispersion fixed?

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
dta_m2 <- broom::augment(m2)
dta_m2 <- dta_m2 %>% mutate(yhat=predict(m2, type="response"),
                            residSqr=residuals(m2, type="response")^2)
ggplot(data=dta_m2, 
       aes(x=yhat, 
           y=residSqr)) +
  geom_point() +
  geom_abline(intercept=0, slope=1) +
  geom_abline(intercept=0, 
              slope=summary(m2)$dispersion,
              color="green") +
  stat_smooth(method="loess",
              formula=y ~ x,
              se=FALSE) +
  labs(x="Fitted values",
       y="Squared residuals") +
  theme_minimal()

# Outliers

library(faraway)
halfnorm(rstudent(m2))
grid()
abline(0, 1)