In Class Exercise 1

The data are the responses of a sample of 150 individuals to 9 multiple-choice items from a General Certificate of Education O-level mathematics paper containing 60 items in all. The responses are coded 1 for correct answer and 0 for incorrect. The items appear to test ability in two-dimensional Euclidean geometry. The data have one column with 1,350 records ordered by item within individuals. There are no missing data.

Source: Goldstein, H., & Wood, R. (1989). Five decades of item response modelling, British Journal of Mathematical and Statistical Psychology, 42, 139-67. - Column 1: Subject ID - Column 2: Item ID - Column 3: Answer, 1 = Correct, 0 = Incorrect

pacman::p_load(tidyverse, ggplot2,ltm, eRm, lme4)
dta1<-read.table("starter.txt", h=T)
names(dta1)<-c("ID", "Item", "Resp")
head(dta1)
##   ID Item Resp
## 1  1    1    1
## 2  1    2    1
## 3  1    3    1
## 4  1    4    1
## 5  1    5    1
## 6  1    6    1
str(dta1)
## 'data.frame':    1350 obs. of  3 variables:
##  $ ID  : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ Item: int  1 2 3 4 5 6 7 8 9 1 ...
##  $ Resp: int  1 1 1 1 1 1 1 1 0 1 ...
dta1_w<-spread(dta1, key="Item", value = "Resp")
names(dta1_w)<-c("ID", paste0("Item", 1:9))
head(dta1_w)
##   ID Item1 Item2 Item3 Item4 Item5 Item6 Item7 Item8 Item9
## 1  1     1     1     1     1     1     1     1     1     0
## 2  2     1     1     1     0     0     0     1     0     0
## 3  3     1     1     1     1     0     1     1     0     1
## 4  4     1     1     1     1     1     0     1     0     0
## 5  5     1     1     1     0     0     1     1     0     0
## 6  6     0     1     1     1     0     0     0     0     0

Correlated Binary Responses

dta1_w<-dta1_w[,-1]
knitr::kable(cor(dta1_w))
Item1 Item2 Item3 Item4 Item5 Item6 Item7 Item8 Item9
Item1 1.0000000 0.0982946 0.0542415 0.0444554 0.1769309 0.0611080 0.1176913 0.0236666 0.0521286
Item2 0.0982946 1.0000000 0.2069345 0.2261335 0.3180732 0.1572070 0.1561738 0.2742122 0.0353553
Item3 0.0542415 0.2069345 1.0000000 0.2380182 0.2442222 0.2247635 0.1659775 0.1390151 0.0541944
Item4 0.0444554 0.2261335 0.2380182 1.0000000 0.1874462 0.1357355 0.1883526 0.2601335 0.2025407
Item5 0.1769309 0.3180732 0.2442222 0.1874462 1.0000000 0.1326535 0.1399923 0.3144546 0.0715628
Item6 0.0611080 0.1572070 0.2247635 0.1357355 0.1326535 1.0000000 0.1993889 0.0550560 0.1515848
Item7 0.1176913 0.1561738 0.1659775 0.1883526 0.1399923 0.1993889 1.0000000 0.0898275 0.1840525
Item8 0.0236666 0.2742122 0.1390151 0.2601335 0.3144546 0.0550560 0.0898275 1.0000000 0.1702513
Item9 0.0521286 0.0353553 0.0541944 0.2025407 0.0715628 0.1515848 0.1840525 0.1702513 1.0000000
knitr::kable(descript(dta1_w)$perc[c(1, 7, 2, 3, 4, 5, 6, 8, 9),])
0 1 logit
Item1 0.0800000 0.9200000 2.4423470
Item7 0.1800000 0.8200000 1.5163475
Item2 0.2000000 0.8000000 1.3862944
Item3 0.2533333 0.7466667 1.0809127
Item4 0.2666667 0.7333333 1.0116009
Item5 0.3066667 0.6933333 0.8157495
Item6 0.3200000 0.6800000 0.7537718
Item8 0.4600000 0.5400000 0.1603427
Item9 0.6666667 0.3333333 -0.6931472

Plots

plot(descript(dta1_w), 
     ylim=c(0,1), 
     type='b',
     main="Item response curves")
grid()

The Rasch model

dtaL <- dta1 %>% mutate(Item=factor(Item)) #Factor Items

sjPlot::tab_model(m0 <- glmer(Resp ~ -1 + Item + (1 | ID), data=dtaL,
                              family=binomial),
                  show.obs=F, show.ngroups=F,transform=NULL, show.se=T, show.r2=F,show.icc=F)
  Resp
Predictors Log-Odds std. Error CI p
Item [1] 2.90 0.34 2.24 – 3.56 <0.001
Item [2] 1.70 0.24 1.22 – 2.17 <0.001
Item [3] 1.35 0.23 0.90 – 1.80 <0.001
Item [4] 1.24 0.22 0.80 – 1.69 <0.001
Item [5] 1.02 0.22 0.60 – 1.45 <0.001
Item [6] 0.94 0.22 0.52 – 1.36 <0.001
Item [7] 1.86 0.25 1.37 – 2.36 <0.001
Item [8] 0.21 0.20 -0.19 – 0.61 0.312
Item [9] -0.86 0.21 -1.28 – -0.44 <0.001
Random Effects
σ2 3.29
τ00 ID 1.17
#use wide form data
coef(rm0 <- ltm::rasch(dta1_w, constraint=cbind(ncol(dta1_w)+1, 1)))[,1] 
##      Item1      Item2      Item3      Item4      Item5      Item6      Item7 
## -2.8480443 -1.6661044 -1.3091179 -1.2271704 -0.9939141 -0.9196116 -1.8160398 
##      Item8      Item9 
## -0.1993647  0.8417333
plot(rm0)
grid()

Item characteristic curves

rm0 <- RM(dta1_w)
eRm::plotICC(rm0, item.subset=1:4, ask=F, empICC=list("raw"), empCI=list(lty="solid"))

The item-person map

plotPImap(rm0)

In Class Exercise 2

Teachers from a sample of 16 schools in California and Michigan were asked:

“If you could go back to college and start all over again, would you again choose teaching as a profession?”

The teachers’ perception of task variety was measured by the extent to which teachers followed the same routine each day.

Source: Raudenbush, S.W., Rowan, B., & Cheong, Y. (1993). Teaching as a non-routine task: Implications for the organizational design of schools. Educational Administrative Quarterly, 29(4), 479-500. - Column 1: Response categories: Yes, Not sure, No - Column 2: Task routine - Column 3: School ID

pacman::p_load(tidyverse, HH, ordinal)
dta2<-read.table("teach_again.txt", h=T)
head(dta2)
##   Answer       Task School
## 1    Yes -0.2642783    S01
## 2    Yes  0.5709041    S01
## 3    Yes  0.1329710    S01
## 4 Unsure -0.2642783    S01
## 5     No -1.0994610    S01
## 6    Yes  0.5302202    S01
str(dta2)
## 'data.frame':    650 obs. of  3 variables:
##  $ Answer: chr  "Yes" "Yes" "Yes" "Unsure" ...
##  $ Task  : num  -0.264 0.571 0.133 -0.264 -1.099 ...
##  $ School: chr  "S01" "S01" "S01" "S01" ...
#count table
with(dta2, ftable(School, Answer))
##        Answer No Unsure Yes
## School                     
## S01            5      4  15
## S02            9      9  20
## S03           14     10  17
## S04            0      1   8
## S05            1      5  23
## S06            9     12  18
## S07           12      4  26
## S08            7      5  18
## S09            2      8  18
## S10           21     27  69
## S11           14     12  32
## S12           15     19  18
## S13           13      6  18
## S14            0      1   7
## S15           17     11  25
## S16           13     10  22
#proportion table by School
prop.table(with(dta2, ftable(School, Answer)),1)
##        Answer         No     Unsure        Yes
## School                                        
## S01           0.20833333 0.16666667 0.62500000
## S02           0.23684211 0.23684211 0.52631579
## S03           0.34146341 0.24390244 0.41463415
## S04           0.00000000 0.11111111 0.88888889
## S05           0.03448276 0.17241379 0.79310345
## S06           0.23076923 0.30769231 0.46153846
## S07           0.28571429 0.09523810 0.61904762
## S08           0.23333333 0.16666667 0.60000000
## S09           0.07142857 0.28571429 0.64285714
## S10           0.17948718 0.23076923 0.58974359
## S11           0.24137931 0.20689655 0.55172414
## S12           0.28846154 0.36538462 0.34615385
## S13           0.35135135 0.16216216 0.48648649
## S14           0.00000000 0.12500000 0.87500000
## S15           0.32075472 0.20754717 0.47169811
## S16           0.28888889 0.22222222 0.48888889
#mean by School
aggregate(Task ~ School, mean, data = dta2)
##    School        Task
## 1     S01  0.02089652
## 2     S02 -0.11779710
## 3     S03 -0.17287423
## 4     S04  0.08085690
## 5     S05 -0.14908077
## 6     S06  0.23740790
## 7     S07 -0.13334256
## 8     S08  0.18657692
## 9     S09  0.42902949
## 10    S10  0.01057772
## 11    S11 -0.15770350
## 12    S12 -0.08074803
## 13    S13  0.13885760
## 14    S14  0.89935413
## 15    S15 -0.12168721
## 16    S16 -0.01327148
m <- as.numeric(with(dta2, table(School, Answer)))
m <- as.data.frame(matrix(m, 16, 3))
names(m) <- c("No", "Unsure", "Yes")
rownames(m) <- levels(dta2$School)
m$tot <- apply(m, 1, sum) #tot=sum
m <- m[order((m[,1]+m[,2])/m[,4]), ]
likert(m[, -4], as.percent = T, main="", ylab="")

dtap <- prop.table(with(dta2, ftable(School, Answer)), 1)
dtapl <- as.data.frame(dtap) %>%
         mutate( Answer = factor(Answer))
head(dtapl)
##   School Answer       Freq
## 1    S01     No 0.20833333
## 2    S02     No 0.23684211
## 3    S03     No 0.34146341
## 4    S04     No 0.00000000
## 5    S05     No 0.03448276
## 6    S06     No 0.23076923
#make a cumsum data frame
dtapl$Task <- rep(aggregate(Task ~ School, mean, data = dta2)[,2],3)
dtacp <- data.frame(School = levels(dtapl$School),
                    as.data.frame(t(apply(dtap, 1, cumsum))))
# transform to long form
dtacpl <- gather(dtacp, Answer, Prop, 2:4) %>%
          mutate( Answer = factor(Answer))

ot <- theme_set(theme_bw())
# Plot
ggplot(dtapl, aes(Answer, Freq, group = School)) +
 geom_point(alpha = .5)+
 geom_line(alpha = .5) +
 ylim(c(0, 1)) +
 labs(x = "Answer", y = "Categorical response proprtions")

#Cumulative proportions
ggplot(dtacpl, aes(Answer, Prop, group = School)) +
 geom_point(alpha = .5)+
 geom_line(alpha = .5) +
 labs(x = "Answer", y = "Cumulative proportions")

#Categorical response proprtions
ggplot(dtapl, aes(Task, Freq, color = Answer)) +
 geom_point()+
 stat_smooth(method = "lm", se=F) +
 scale_y_continuous(limits=c(0, 1)) +
 labs(x = "Task", y = "Categorical response proprtions") +
 theme(legend.position = c(.9, .5))

# ordinal package
# cumulative mixed proportional odds model
summary(m0 <- clmm(factor(Answer) ~ Task + (1 | School), data=dta2))
## Cumulative Link Mixed Model fitted with the Laplace approximation
## 
## formula: factor(Answer) ~ Task + (1 | School)
## data:    dta2
## 
##  link  threshold nobs logLik  AIC     niter    max.grad cond.H 
##  logit flexible  650  -642.14 1292.27 177(393) 1.91e-04 6.4e+01
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  School (Intercept) 0.09088  0.3015  
## Number of groups:  School 16 
## 
## Coefficients:
##      Estimate Std. Error z value Pr(>|z|)    
## Task  0.36488    0.08792    4.15 3.32e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##            Estimate Std. Error z value
## No|Unsure   -1.2659     0.1301  -9.731
## Unsure|Yes  -0.2169     0.1176  -1.844
dta2_m0<- data.frame(dta2, phat = fitted(m0))
ggplot(dta2_m0, aes(Task, phat, color = Answer)) +
 geom_point(alpha = .2, pch = 20)+
 geom_point(data = dtapl, aes(Task, Freq, color = Answer)) +
 stat_smooth(method = "lm", se=F, alpha = .2) +
 stat_smooth(data = dtapl, aes(Task, Freq, color = Answer),
             method = "lm", se=F, linetype = "dotted") +
 scale_y_continuous(limits=(c(0, 1))) +
 labs(x = "Task", y = "Categorical response proprtions") +
 theme(legend.position = c(.1, .8))

# This is done because there are 0 responses in the frequency table
pn <- aggregate(phat ~ School, mean, data=subset(dta2_m0, Answer == "No"), fill = T)
pu <- aggregate(phat ~ School, mean, data=subset(dta2_m0, Answer == "Unsure"))
py <- aggregate(phat ~ School, mean, data=subset(dta2_m0, Answer == "Yes"))
# add phat = 0 to S04 and S14 in the No answer category
#fix(pn)
pn <- rbind(pn, c("S04", 0), c("S14", 0))
# put them in the right order by school
pn <- pn[order(pn$School),]

# append predicted prob to the observed p-table
dtapl$phat <- c(pn[,2], pu[,2], py[,2])
dtapl$phat<- as.numeric(dtapl$phat)
# plot observed categ. prop and fitted prob. against Task
ggplot(dtapl, aes(Task, Freq, color = Answer)) +
 geom_point(alpha = .3)+
 stat_smooth(method = "lm", se = F) +
 geom_point(aes(Task, phat, color = Answer), pch = 1)+
 stat_smooth(aes(Task, phat, color = Answer),
                 method = "lm", se = F, lty = 2, lwd = .8) +
 scale_y_continuous(limits=c(0, 1)) +
 labs(x = "Task", y = "Mean observed and fitted catergorical responses (school)") +
 theme(legend.position = c(.9, .5))

In Class Exercise 3

The dataset is taken from an intervention study on problematic drinking in college students. Alcohol-related problems, as measured by the Rutgers Alcohol Problem Index, were recorded across 2 years (5 time points) for a sample of 881 students.

Source: Atkins, D.C., Baldwin, S.A., Zheng, C., Gallop, R.J., & Neighbors, C. (2013). A tutorial on count regression and zero-altered count models for longitudinal substance use data. Psycholology of Addictive Behavior. 27(1), 166-177. - Column 1: Subject ID - Column 2: Observation ID within subject - Column 3: Gender ID - Column 4: Time in months - Column 5: Rutgers Alcohol Problem Index

pacman::p_load(tidyverse, lme4, glmmTMB, bbmle)
dta3<-read.csv("rapi.csv")
head(dta3)
##      ID Obs_ID Gender Month RAPI
## 1 S1001      1    Men     0    0
## 2 S1001      2    Men     6    0
## 3 S1001      3    Men    18    0
## 4 S1002      1  Women     0    3
## 5 S1002      2  Women     6    6
## 6 S1002      3  Women    12    5
# covert to factor type and add variables for later use
dta3$YearC <- (dta3$Month - mean(unique(dta3$Month))) / 12
# numerical summaries
dim(dta3)
## [1] 3616    6
length(unique(dta3$ID))
## [1] 818
table(table(dta3$ID))
## 
##   1   2   3   4   5 
##  25  38  66 128 561
dta3 %>%
 group_by(Gender, Month) %>%
 summarize( mean(RAPI), var(RAPI), sum(RAPI < 1)/n()) %>%
 as.data.frame
##    Gender Month mean(RAPI) var(RAPI) sum(RAPI < 1)/n()
## 1     Men     0   7.700288  73.80587        0.11815562
## 2     Men     6   8.214744 106.43284        0.14423077
## 3     Men    12   8.039146 130.83060        0.18149466
## 4     Men    18   8.107914 165.65618        0.22661871
## 5     Men    24   7.294776 129.84162        0.25373134
## 6   Women     0   6.335456  49.15957        0.09766454
## 7   Women     6   5.300683  45.99614        0.19817768
## 8   Women    12   4.632319  41.38327        0.25058548
## 9   Women    18   4.990099  62.33737        0.27722772
## 10  Women    24   4.652956  79.29420        0.34961440
ggplot(dta3, aes(RAPI, ..density..)) +
 stat_bin(binwidth = 0.9) +
 facet_grid(Gender ~ as.factor(Month)) +
 labs(x = "Rutgers Alcohol Problem Index") 

ggplot(dta3, aes(Month, RAPI, col = Gender)) +
 stat_summary(fun.data = "mean_se") +
 stat_smooth(method = "glm", method.args = list(family = poisson)) +
 labs(y = "Mean Rutgers Alcohol Problem Index", x = "Time (in months)") +
 theme(legend.position = c(.1, .1))+
 theme_bw()

# individual plots
# Randomly select about 1/3 individuals
# set random seed for reproduction
dta3$ID <-factor(dta3$ID)
set.seed(11092016)
n3 <- sample(levels(dta3$ID), round(length(levels(dta3$ID))*1/3))
ggplot(dta3[(dta3$ID %in% n3), ], aes(Month, RAPI, group = ID)) +
  geom_line(alpha = .3) +
  geom_point(alpha = .3) +
  stat_smooth(aes(group = 1), method = "glm",
              method.args = list(family = poisson))+
  facet_wrap(~ Gender) +
  labs(x = "Time (in months)", y = "Rutgers Alcohol Problem Index") +
  theme(legend.position = "NONE")

# Model with  interaction and random cluster by ID
summary(m0 <- glmer(RAPI ~ Gender*YearC + (1 | ID), data = dta3,
                           family = poisson))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: RAPI ~ Gender * YearC + (1 | ID)
##    Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##    24094    24125   -12042    24084     3611 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.7683 -1.0923 -0.3250  0.7538 15.4325 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  ID     (Intercept) 1.04     1.02    
## Number of obs: 3616, groups:  ID, 818
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.55567    0.05691  27.336  < 2e-16 ***
## GenderWomen       -0.34717    0.07503  -4.627 3.71e-06 ***
## YearC             -0.01429    0.01338  -1.068    0.285    
## GenderWomen:YearC -0.12146    0.01910  -6.358 2.05e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) GndrWm YearC 
## GenderWomen -0.755              
## YearC        0.030 -0.023       
## GndrWmn:YrC -0.021  0.035 -0.700
# add individual observation variability
summary(m1 <- lme4::glmer(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Obs_ID),
                    data = dta3, family = poisson))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Obs_ID)
##    Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19448.0  19497.5  -9716.0  19432.0     3608 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.03682 -0.53730 -0.03497  0.24218  1.29824 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev. Corr
##  ID:Obs_ID (Intercept) 0.3760   0.6132       
##  ID        (Intercept) 1.1197   1.0582       
##            YearC       0.3352   0.5790   0.64
## Number of obs: 3616, groups:  ID:Obs_ID, 3616; ID, 818
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.32446    0.06265  21.140  < 2e-16 ***
## GenderWomen       -0.39207    0.08215  -4.772 1.82e-06 ***
## YearC             -0.26337    0.04620  -5.700 1.20e-08 ***
## GenderWomen:YearC -0.19136    0.06025  -3.176  0.00149 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) GndrWm YearC 
## GenderWomen -0.753              
## YearC        0.498 -0.371       
## GndrWmn:YrC -0.373  0.497 -0.739
# add random slopes uncorrelated with intercepts
m2 <- lme4::glmer(RAPI ~ Gender*YearC + (YearC - 1 | ID) + (1 | ID) + 
               (1 | ID:Obs_ID),
               data = dta3, family = poisson)
## compare nested models - max-likelihood
anova(m0, m1, m2)
## Data: dta3
## Models:
## m0: RAPI ~ Gender * YearC + (1 | ID)
## m2: RAPI ~ Gender * YearC + (YearC - 1 | ID) + (1 | ID) + (1 | ID:Obs_ID)
## m1: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Obs_ID)
##    npar   AIC   BIC   logLik deviance   Chisq Df Pr(>Chisq)    
## m0    5 24094 24125 -12042.0    24084                          
## m2    7 19562 19606  -9774.3    19548 4535.51  2  < 2.2e-16 ***
## m1    8 19448 19498  -9716.0    19432  116.54  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# residual plot
plot(m2, pch = 20, cex = .6, xlab = "Fitted values")

## Residuals over time
plot(m2, resid(.) ~ fitted(.) | as.factor(Month) + Gender, abline = c(h = 0),
     type = c("p", "smooth"), pch = 20, cex = .6,
     xlab = "Fitted values", ylab = "Pearson residuals")

# normality check
lattice::qqmath(m2, type = c("p", "g"))

# unit-specific rate ratios
print(exp(fixef(m2)), 3)
##       (Intercept)       GenderWomen             YearC GenderWomen:YearC 
##             3.835             0.684             0.844             0.835
# 
VarCorr(m2)
##  Groups    Name        Std.Dev.
##  ID.Obs_ID (Intercept) 0.61779 
##  ID        (Intercept) 0.99996 
##  ID.1      YearC       0.51389
##
# negative binomial, zero-inflated negative binomial
##

# mean linear related to variance
summary(mnb1 <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Obs_ID),
                          data = dta3, family = nbinom1))
##  Family: nbinom1  ( log )
## Formula:          RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Obs_ID)
## Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19416.0  19471.8  -9699.0  19398.0     3607 
## 
## Random effects:
## 
## Conditional model:
##  Groups    Name        Variance Std.Dev. Corr 
##  ID        (Intercept) 0.9538   0.9766        
##            YearC       0.2497   0.4997   0.64 
##  ID:Obs_ID (Intercept) 0.1922   0.4384        
## Number of obs: 3616, groups:  ID, 818; ID:Obs_ID, 3616
## 
## Overdispersion parameter for nbinom1 family (): 1.06 
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.44252    0.06147  23.468  < 2e-16 ***
## GenderWomen       -0.37184    0.07719  -4.817 1.45e-06 ***
## YearC             -0.24691    0.04335  -5.695 1.23e-08 ***
## GenderWomen:YearC -0.17239    0.05613  -3.071  0.00213 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# variance to mean with a quadratic trend
mnb2 <- update(mnb1, . ~ ., family = nbinom2)

summary(mnb2)
##  Family: nbinom2  ( log )
## Formula:          
## RAPI ~ Gender + YearC + (YearC | ID) + (1 | ID:Obs_ID) + Gender:YearC
## Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19391.5  19447.3  -9686.8  19373.5     3607 
## 
## Random effects:
## 
## Conditional model:
##  Groups    Name        Variance  Std.Dev.  Corr 
##  ID        (Intercept) 1.105e+00 1.0512343      
##            YearC       3.022e-01 0.5497099 0.67 
##  ID:Obs_ID (Intercept) 5.242e-09 0.0000724      
## Number of obs: 3616, groups:  ID, 818; ID:Obs_ID, 3616
## 
## Overdispersion parameter for nbinom2 family (): 2.52 
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.51826    0.06294  24.123  < 2e-16 ***
## GenderWomen       -0.39704    0.08224  -4.828 1.38e-06 ***
## YearC             -0.25902    0.04638  -5.585 2.34e-08 ***
## GenderWomen:YearC -0.19783    0.06015  -3.289  0.00101 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Include zero-inflation term
summary(mznb2 <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Obs_ID),
                          zi = ~ Gender + YearC,
                          data = dta3, family = nbinom2))
##  Family: nbinom2  ( log )
## Formula:          RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Obs_ID)
## Zero inflation:        ~Gender + YearC
## Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19297.2  19371.5  -9636.6  19273.2     3604 
## 
## Random effects:
## 
## Conditional model:
##  Groups    Name        Variance  Std.Dev.  Corr 
##  ID        (Intercept) 1.094e+00 1.046e+00      
##            YearC       2.976e-01 5.456e-01 0.64 
##  ID:Obs_ID (Intercept) 9.483e-09 9.738e-05      
## Number of obs: 3616, groups:  ID, 818; ID:Obs_ID, 3616
## 
## Overdispersion parameter for nbinom2 family (): 3.87 
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.55715    0.06323  24.625  < 2e-16 ***
## GenderWomen       -0.38236    0.08250  -4.635 3.57e-06 ***
## YearC             -0.22259    0.04658  -4.779 1.76e-06 ***
## GenderWomen:YearC -0.19707    0.05869  -3.358 0.000786 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Zero-inflation model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.0617     0.2430 -12.598   <2e-16 ***
## GenderWomen   0.2126     0.2901   0.733   0.4638    
## YearC         0.4406     0.1936   2.275   0.0229 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# add dispersion over time
summary(mznb2a <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID), data = dta3,
                           zi = ~ YearC,
                           dispformula = ~ YearC,
                           family = nbinom2))
##  Family: nbinom2  ( log )
## Formula:          RAPI ~ Gender * YearC + (YearC | ID)
## Zero inflation:        ~YearC
## Dispersion:            ~YearC
## Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19291.0  19359.1  -9634.5  19269.0     3605 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev. Corr 
##  ID     (Intercept) 1.0920   1.0450        
##         YearC       0.2881   0.5367   0.66 
## Number of obs: 3616, groups:  ID, 818
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.56590    0.06304  24.842  < 2e-16 ***
## GenderWomen       -0.39309    0.08133  -4.833 1.34e-06 ***
## YearC             -0.22937    0.04761  -4.817 1.45e-06 ***
## GenderWomen:YearC -0.19880    0.05828  -3.411 0.000647 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Zero-inflation model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.9281     0.1601 -18.289  < 2e-16 ***
## YearC         0.5540     0.2024   2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Dispersion model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.39174    0.07603  18.304   <2e-16 ***
## YearC        0.24835    0.11268   2.204   0.0275 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# model comparisions - across different models
# need package bbmle
AICtab(mnb1, mnb2, mznb2, mznb2a, base = T, logLik = T)
##        logLik  AIC     dLogLik dAIC    df
## mznb2a -9634.5 19291.0    64.5     0.0 11
## mznb2  -9636.6 19297.2    62.4     6.2 12
## mnb2   -9686.8 19391.5    12.2   100.6 9 
## mnb1   -9699.0 19416.0     0.0   125.0 9
## diagnostics
# fortify data with residuals and fitted values
dta3$rs <- resid(mznb2a)
dta3$yhat <- predict(mznb2a)

## place observed and fitted CIs side-by-side
ggplot(dta3, aes(Month, RAPI, col = Gender)) +
 stat_summary(fun.data = "mean_cl_boot") +
 stat_summary(aes(y = yhat), fun.data = "mean_cl_boot", pch = 20, 
              linetype = "dashed", position = position_dodge(width = .9)) +
 labs(y = "Mean Rutgers Alcohol Problem Index", x = "Time (in months)") +
 theme(legend.position = c(.1, .1))

# residual plot
ggplot(dta3, aes(x = yhat, y = scale(rs))) +
  geom_point(col = "steelblue", alpha = I(0.2)) +
  geom_hline(yintercept = 0, linetype = 2) +
  facet_grid(Gender ~ Month) +
  labs(x = "Fitted values", y = "Standardized residuals")