1 input data

dtahw2 <- read.csv("days_absent.csv")
head(dtahw2)

##      ID Gender  Program Math Days
## 1 S1001   male Academic   63    4
## 2 S1002   male Academic   27    4
## 3 S1003 female Academic   20    2
## 4 S1004 female Academic   16    3
## 5 S1005 female Academic    2    3
## 6 S1006 female Academic   71   13

str(dtahw2)

## 'data.frame':    314 obs. of  5 variables:
##  $ ID     : chr  "S1001" "S1002" "S1003" "S1004" ...
##  $ Gender : chr  "male" "male" "female" "female" ...
##  $ Program: chr  "Academic" "Academic" "Academic" "Academic" ...
##  $ Math   : int  63 27 20 16 2 71 63 3 51 49 ...
##  $ Days   : int  4 4 2 3 3 13 11 7 10 9 ...

2 Days absent from school

library(ggplot2)
ggplot(dtahw2, 
       aes(Days)) +
 geom_histogram(binwidth = 1, fill=8, col=1)+
 facet_grid(Gender ~ Program) +
 labs(x="Days absent", 
      y="Frequency") +
 theme_minimal()+
 theme(legend.position = c(.95, .9))

3 means and variance

knitr::kable(aggregate(cbind(Days, Math)~ Gender+Program, data=dtahw2, FUN=mean))

Gender	Program	Days	Math
female	Academic	7.759036	43.69880
male	Academic	6.119048	41.50000
female	General	12.363636	44.40909
male	General	8.555556	45.33333
female	Vocational	2.709091	58.00000
male	Vocational	2.634615	58.84615

knitr::kable(aggregate(cbind(Days, Math)~ Gender+Program, data=dtahw2, FUN=var))

Gender	Program	Days	Math
female	Academic	68.55098	586.7252
male	Academic	41.81698	689.8193
female	General	84.33766	804.5390
male	General	41.67320	923.5294
female	Vocational	10.28418	512.4815
male	Vocational	18.07956	364.9170

4 Effects of Program, gender, and math

ggplot(dtahw2, 
       aes(Math, Days, 
           color=Gender)) +
 stat_smooth(method="glm",
             formula=y ~ x,
             method.args=list(family=poisson),
             size=rel(.5)) +
 geom_point(pch=20, alpha=.5) +
 facet_grid(. ~ Program) +
 labs(y="Days absent", 
      x="Math score") +
 theme_minimal()+
 theme(legend.position = c(.95, .9))

5 Parameter estimates

sjPlot::tab_model(m0 <- glm(Days ~ Program*Gender*Math, data=dtahw2, 
                            family=poisson(link=log)), show.se=T, show.r2=F, show.obs=F)

	Days
Predictors	Incidence Rate Ratios	std. Error	CI	p
(Intercept)	10.43	0.08	8.94 – 12.14	<0.001
Program [General]	1.47	0.13	1.13 – 1.91	0.004
Program [Vocational]	0.23	0.25	0.14 – 0.36	<0.001
Gender [male]	0.85	0.11	0.69 – 1.06	0.146
Math	0.99	0.00	0.99 – 1.00	<0.001
Program [General] * Gender [male]	0.97	0.20	0.65 – 1.43	0.863
Program [Vocational] * Gender [male]	1.59	0.37	0.76 – 3.27	0.214
Program [General] * Math	1.00	0.00	1.00 – 1.01	0.479
Program [Vocational] * Math	1.01	0.00	1.00 – 1.02	0.019
Gender [male] * Math	1.00	0.00	0.99 – 1.00	0.274
(Program [General] Gender [male]) Math	1.00	0.00	0.99 – 1.01	0.692
(Program [Vocational] Gender [male]) Math	1.00	0.01	0.98 – 1.01	0.634

6 Goodness of fit and overdispersion

#
1 - pchisq(deviance(m0), df.residual(m0))

## [1] 0

#
performance::check_overdispersion(m0)

## # Overdispersion test
## 
##        dispersion ratio =    6.492
##   Pearson's Chi-Squared = 1960.446
##                 p-value =  < 0.001

## Overdispersion detected.

#
plot(log(fitted(m0)), log((dtahw2$Days-fitted(m0))^2), 
     xlab=expression(hat(mu)),
     ylab=expression((y-hat(mu))^2))
grid()
abline(0, 1, col=2)

7 Dispersion parameter

knitr::kable(as.data.frame(summary(m0, dispersion=6.492)$coef))

	Estimate	Std. Error	z value	Pr(>\|z\|)
(Intercept)	2.3450136	0.1989867	11.7847779	0.0000000
ProgramGeneral	0.3873043	0.3411743	1.1352096	0.2562875
ProgramVocational	-1.4878389	0.6257345	-2.3777477	0.0174187
Gendermale	-0.1588403	0.2785265	-0.5702878	0.5684825
Math	-0.0071045	0.0043252	-1.6425893	0.1004679
ProgramGeneral:Gendermale	-0.0347250	0.5144980	-0.0674929	0.9461893
ProgramVocational:Gendermale	0.4617659	0.9469746	0.4876222	0.6258175
ProgramGeneral:Math	0.0019809	0.0071360	0.2775861	0.7813301
ProgramVocational:Math	0.0094842	0.0103312	0.9180121	0.3586125
Gendermale:Math	-0.0026914	0.0062654	-0.4295662	0.6675112
ProgramGeneral:Gendermale:Math	-0.0017088	0.0110050	-0.1552787	0.8766016
ProgramVocational:Gendermale:Math	-0.0029735	0.0159262	-0.1867067	0.8518906

8 Testing for terms in the model with dispersion

drop1(m0, test="F")

## Warning in drop1.glm(m0, test = "F"): F test assumes 'quasipoisson' family

## Single term deletions
## 
## Model:
## Days ~ Program * Gender * Math
##                     Df Deviance    AIC F value Pr(>F)
## <none>                   1730.4 2637.7               
## Program:Gender:Math  2   1730.7 2634.0  0.0274  0.973

9 Selecting a quasipoisson model

sjPlot::tab_model(m1 <- update(m0, . ~ . - Gender:Math:Program, family=quasipoisson), 
                  show.se=T, show.r2=F, show.obs=F)

	Days
Predictors	Incidence Rate Ratios	std. Error	CI	p
(Intercept)	10.27	0.18	7.08 – 14.59	<0.001
Program [General]	1.52	0.28	0.86 – 2.63	0.144
Program [Vocational]	0.24	0.50	0.09 – 0.61	0.005
Gender [male]	0.88	0.24	0.55 – 1.40	0.586
Math	0.99	0.00	0.99 – 1.00	0.084
Program [General] * Gender [male]	0.91	0.30	0.50 – 1.61	0.738
Program [Vocational] * Gender [male]	1.36	0.35	0.68 – 2.71	0.386
Program [General] * Math	1.00	0.01	0.99 – 1.01	0.819
Program [Vocational] * Math	1.01	0.01	0.99 – 1.02	0.296
Gender [male] * Math	1.00	0.00	0.99 – 1.01	0.469

10 Drop all possible single terms to a model

drop1(m1, test='F')

## Single term deletions
## 
## Model:
## Days ~ Program + Gender + Math + Program:Gender + Program:Math + 
##     Gender:Math
##                Df Deviance F value Pr(>F)
## <none>              1730.7               
## Program:Gender  2   1737.5  0.5943 0.5526
## Program:Math    2   1737.9  0.6333 0.5315
## Gender:Math     1   1734.1  0.5952 0.4410

11 Final quasipoisson model

sjPlot::tab_model(m2 <- update(m1, . ~ . - Math:Program, family=quasipoisson), show.se=T, show.r2=F, show.obs=F)

	Days
Predictors	Incidence Rate Ratios	std. Error	CI	p
(Intercept)	9.71	0.16	6.99 – 13.30	<0.001
Program [General]	1.60	0.18	1.10 – 2.27	0.012
Program [Vocational]	0.38	0.24	0.23 – 0.59	<0.001
Gender [male]	0.89	0.24	0.56 – 1.41	0.628
Math	0.99	0.00	0.99 – 1.00	0.097
Program [General] * Gender [male]	0.90	0.30	0.50 – 1.60	0.722
Program [Vocational] * Gender [male]	1.35	0.35	0.68 – 2.67	0.386
Gender [male] * Math	1.00	0.00	0.99 – 1.01	0.440

12 Overdispersion fixed?

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

dta_m2 <- broom::augment(m2)
dta_m2 <- dta_m2 %>% mutate(yhat=predict(m2, type="response"),
                            residSqr=residuals(m2, type="response")^2)
ggplot(data=dta_m2, 
       aes(x=yhat, 
           y=residSqr)) +
  geom_point() +
  geom_abline(intercept=0, slope=1) +
  geom_abline(intercept=0, 
              slope=summary(m2)$dispersion,
              color="green") +
  stat_smooth(method="loess",
              formula=y ~ x,
              se=FALSE) +
  labs(x="Fitted values",
       y="Squared residuals") +
  theme_minimal()

# Outliers

library(faraway)
halfnorm(rstudent(m2))
grid()
abline(0, 1)

1026hw2

1 input data

2 Days absent from school

3 means and variance

4 Effects of Program, gender, and math

5 Parameter estimates

6 Goodness of fit and overdispersion

7 Dispersion parameter

8 Testing for terms in the model with dispersion

9 Selecting a quasipoisson model

10 Drop all possible single terms to a model

11 Final quasipoisson model

12 Overdispersion fixed?