1 Inclass 1

1.1 Info

Problem: Fit the Rasch model to starter data via the generalized linear mixed model framework.
Data: The data are the responses of a sample of 150 individuals to 9 multiple-choice items from a General Certificate of Education O-level mathematics paper containing 60 items in all. The responses are coded 1 for correct answer and 0 for incorrect. The items appear to test ability in two-dimensional Euclidean geometry. The data have one column with 1,350 records ordered by item within individuals. There are no missing data.

Column 1: Subject ID
Column 2: Item ID
Column 3: Answer, 1 = Correct, 0 = Incorrect

1.2 Data management

pacman::p_load(dplyr, tidyr, ltm, eRm, lme4)

dta1 <- read.table("starter.txt", h=T)

names(dta1) <- c("ID","Item","Response")

head(dta1)

##   ID Item Response
## 1  1    1        1
## 2  1    2        1
## 3  1    3        1
## 4  1    4        1
## 5  1    5        1
## 6  1    6        1

1.2.1 long to wide

dta1w <- spread(dta1, key = "Item", value = "Response")

names(dta1w) <- c("ID","Item 1","Item 2","Item 3","Item 4 ", "Item 5", "Item 6", 
                  "Item 7","Item 8", "Item 9")

dta1w <- subset(dta1w, select = -1)

1.3 Correlated binary responses

knitr::kable(cor(dta1w))

	Item 1	Item 2	Item 3	Item 4	Item 5	Item 6	Item 7	Item 8	Item 9
Item 1	1.0000000	0.0982946	0.0542415	0.0444554	0.1769309	0.0611080	0.1176913	0.0236666	0.0521286
Item 2	0.0982946	1.0000000	0.2069345	0.2261335	0.3180732	0.1572070	0.1561738	0.2742122	0.0353553
Item 3	0.0542415	0.2069345	1.0000000	0.2380182	0.2442222	0.2247635	0.1659775	0.1390151	0.0541944
Item 4	0.0444554	0.2261335	0.2380182	1.0000000	0.1874462	0.1357355	0.1883526	0.2601335	0.2025407
Item 5	0.1769309	0.3180732	0.2442222	0.1874462	1.0000000	0.1326535	0.1399923	0.3144546	0.0715628
Item 6	0.0611080	0.1572070	0.2247635	0.1357355	0.1326535	1.0000000	0.1993889	0.0550560	0.1515848
Item 7	0.1176913	0.1561738	0.1659775	0.1883526	0.1399923	0.1993889	1.0000000	0.0898275	0.1840525
Item 8	0.0236666	0.2742122	0.1390151	0.2601335	0.3144546	0.0550560	0.0898275	1.0000000	0.1702513
Item 9	0.0521286	0.0353553	0.0541944	0.2025407	0.0715628	0.1515848	0.1840525	0.1702513	1.0000000

1.4 Item response probabilties

knitr::kable(descript(dta1w)$perc[c(1, 7, 2, 3, 4, 5, 6, 8, 9),])

	0	1	logit
Item 1	0.0800000	0.9200000	2.4423470
Item 7	0.1800000	0.8200000	1.5163475
Item 2	0.2000000	0.8000000	1.3862944
Item 3	0.2533333	0.7466667	1.0809127
Item 4	0.2666667	0.7333333	1.0116009
Item 5	0.3066667	0.6933333	0.8157495
Item 6	0.3200000	0.6800000	0.7537718
Item 8	0.4600000	0.5400000	0.1603427
Item 9	0.6666667	0.3333333	-0.6931472

1.5 Plot

1.5.1 Item response curves

plot(descript(dta1w), 
     ylim=c(0,1), 
     type='b',
     main="Item response curves")
grid()

### Item Charateristic curves

Fit the Rasch model with rasch

coef(rm0 <- ltm::rasch(dta1w, constraint=cbind(ncol(dta1w)+1, 1)))[,1]

##     Item 1     Item 2     Item 3    Item 4      Item 5     Item 6     Item 7 
## -2.8480443 -1.6661044 -1.3091179 -1.2271704 -0.9939141 -0.9196116 -1.8160398 
##     Item 8     Item 9 
## -0.1993647  0.8417333

plot(rm0)

grid()

1.5.2 ICC plot

rm0 <- RM(dta1w)

plotICC(rm0, item.subset=1:9, ask=F, empICC=list("raw"), empCI=list(lty="solid"))

1.5.3 Person-Item Map

plotPImap(rm0)

1.6 GLMM

# dta1$ID <- factor(dta1$ID, paste0("P", 1000 + (1:dim(dta1)[1])))
dta1 <- dta1 %>% 
  mutate(ID = factor(ID),
         Item = factor(Item, levels = c("1", "7", "2", "3", "4", "5", "6", "8", "9")),
         Response = factor(Response))

sjPlot::tab_model(m0 <- glmer(Response ~ -1 + Item + (1 | ID), 
                              data = dta1, family = binomial), 
                  show.obs=F, show.ngroups=F, transform=NULL, 
                  show.se=T, show.r2=F,show.icc=F)

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.289001 (tol = 0.002, component 1)

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?

	Response
Predictors	Log-Odds	std. Error	CI	p
Item [1]	2.90	0.00	2.90 – 2.91	<0.001
Item [7]	1.86	0.00	1.86 – 1.87	<0.001
Item [2]	1.70	0.00	1.70 – 1.71	<0.001
Item [3]	1.34	0.00	1.34 – 1.34	<0.001
Item [4]	1.21	0.21	0.79 – 1.63	<0.001
Item [5]	1.00	0.21	0.60 – 1.41	<0.001
Item [6]	0.92	0.20	0.52 – 1.33	<0.001
Item [8]	0.20	0.00	0.20 – 0.21	<0.001
Item [9]	-0.88	0.21	-1.28 – -0.48	<0.001
Random Effects
σ²	3.29
τ₀₀ _ID	1.16

# **Model is nearly unidentifiable: very large eigenvalue need for Rescale variables**

unlist(summary(m0)$varcor)/(unlist(summary(m0)$varcor) + pi^2/3)

##        ID 
## 0.2614461

1.7 Reference

Source: Goldstein, H., & Wood, R. (1989). Five decades of item response modelling, British Journal of Mathematical and Statistical Psychology, 42, 139-67.

2 Inclass 2

2.1 Info

Problem: Replicate the results of analysis of clustered ordinal data reported in the teach again example.
Data: Teachers from a sample of 16 schools in California and Michigan were asked: “If you could go back to college and start all over again, would you again choose teaching as a profession?” The teachers’ perception of task variety was measured by the extent to which teachers followed the same routine each day.

Column 1: Response categories: Yes, Not sure, No
Column 2: Task routine
Column 3: School ID

2.2 Data management

# mixed-effects cumulative logit model (proportional odds)

# loading package
pacman::p_load(tidyverse, HH, ordinal)

dta2 <- read.table("teach_again.txt", h=T)

head(dta2)

##   Answer       Task School
## 1    Yes -0.2642783    S01
## 2    Yes  0.5709041    S01
## 3    Yes  0.1329710    S01
## 4 Unsure -0.2642783    S01
## 5     No -1.0994610    S01
## 6    Yes  0.5302202    S01

str(dta2)

## 'data.frame':    650 obs. of  3 variables:
##  $ Answer: chr  "Yes" "Yes" "Yes" "Unsure" ...
##  $ Task  : num  -0.264 0.571 0.133 -0.264 -1.099 ...
##  $ School: chr  "S01" "S01" "S01" "S01" ...

2.3 Descriptive info

2.3.1 Numerical summary

with(dta2, ftable(School, Answer))

##        Answer No Unsure Yes
## School                     
## S01            5      4  15
## S02            9      9  20
## S03           14     10  17
## S04            0      1   8
## S05            1      5  23
## S06            9     12  18
## S07           12      4  26
## S08            7      5  18
## S09            2      8  18
## S10           21     27  69
## S11           14     12  32
## S12           15     19  18
## S13           13      6  18
## S14            0      1   7
## S15           17     11  25
## S16           13     10  22

2.3.2 Response proportion in each school

dta2p <- prop.table(with(dta2, ftable(School, Answer)), 1)

The teacher in the School 4 & 14 are more likely to choose teaching as a profession again.

aggregate(Task ~ School, mean, data = dta2)

##    School        Task
## 1     S01  0.02089652
## 2     S02 -0.11779710
## 3     S03 -0.17287423
## 4     S04  0.08085690
## 5     S05 -0.14908077
## 6     S06  0.23740790
## 7     S07 -0.13334256
## 8     S08  0.18657692
## 9     S09  0.42902949
## 10    S10  0.01057772
## 11    S11 -0.15770350
## 12    S12 -0.08074803
## 13    S13  0.13885760
## 14    S14  0.89935413
## 15    S15 -0.12168721
## 16    S16 -0.01327148

The highest mean value of teachers’ perception of task variety is school 14 (0.899).

2.4 Likert scale plots

# HH packge for likert scale plots

# 1:48 (16 school * 3 type of answer)
m <- as.numeric(with(dta2, table(School, Answer)))
m <- as.data.frame(matrix(m, 16, 3))
names(m) <- c("No", "Unsure", "Yes")

# rowname base on the school number? not quite sure
rownames(m) <- levels(dta2$School)

# total of teacher?
m$tot <- apply(m, 1, sum)

# No+Unsure/Total
m <- m[order((m[,1]+m[,2])/m[,4]), ]

# Likert plot
likert(m[, -4], as.percent = T, main="", ylab="")

2.5 set

# set the data frame and set "Answer" as the factor 
dta2pl <- as.data.frame(dta2p) %>% 
  mutate(Answer = factor(Answer))

# add the mean value of Task
dta2pl$Task <- rep(aggregate(Task ~ School, mean, data = dta2)[,2],3)

# ?? not sure ??
dta2cp <- data.frame(School = levels(dta2pl$School),
                    as.data.frame(t(apply(dta2p, 1, cumsum))))

# wide to long form
dta2cpl <- gather(dta2cp, Answer, Prop, 2:4) %>%
          mutate(Answer = factor(Answer))

# theme_set
ot <- theme_set(theme_bw())

# 
ggplot(dta2pl, aes(Answer, Freq, group = School)) +
  geom_point(alpha = .5)+
  geom_line(alpha = .5) +
  ylim(c(0, 1)) +
  labs(x = "Answer", y = "Categorical response proprtions")

#
ggplot(dta2cpl, aes(Answer, Prop, group = School)) +
  geom_point(alpha = .5)+
  geom_line(alpha = .5) +
  labs(x = "Answer", y = "Cumulative proportions")

#
ggplot(dta2pl, aes(Task, Freq, color = Answer)) +
  geom_point()+
  stat_smooth(method = "lm", se=F) +
  scale_y_continuous(limits=c(0, 1)) +
  labs(x = "Task", y = "Categorical response proprtions") +
  theme(legend.position = c(.9, .5))

## `geom_smooth()` using formula 'y ~ x'

# ordinal package
# cumulative mixed proportional odds model

dta2 <- dta2 %>%
  mutate(Answer = factor(Answer))

summary(m0 <- clmm(Answer ~ Task + (1 | School), data=dta2))

## Cumulative Link Mixed Model fitted with the Laplace approximation
## 
## formula: Answer ~ Task + (1 | School)
## data:    dta2
## 
##  link  threshold nobs logLik  AIC     niter    max.grad cond.H 
##  logit flexible  650  -642.14 1292.27 177(393) 1.91e-04 6.4e+01
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  School (Intercept) 0.09088  0.3015  
## Number of groups:  School 16 
## 
## Coefficients:
##      Estimate Std. Error z value Pr(>|z|)    
## Task  0.36488    0.08792    4.15 3.32e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##            Estimate Std. Error z value
## No|Unsure   -1.2659     0.1301  -9.731
## Unsure|Yes  -0.2169     0.1176  -1.844

dta2_m0 <- data.frame(na.omit(dta2), phat = fitted(m0))
#

ggplot(dta2_m0, aes(Task, phat, color = Answer)) +
  geom_point(alpha = .2, pch = 20)+
  geom_point(data = dta2pl, aes(Task, Freq, color = Answer)) +
  stat_smooth(method = "lm", se=F, alpha = .2) +
  stat_smooth(data = dta2pl, aes(Task, Freq, color = Answer),
              method = "lm", se=F, linetype = "dotted") +
  scale_y_continuous(limits=(c(0, 1))) +
  labs(x = "Task", y = "Categorical response proprtions") +
  theme(legend.position = c(.1, .8))

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#
# This is done because there are 0 responses in the frequency table
pn <- aggregate(phat ~ School, mean, data=subset(dta2_m0, Answer == "No"), fill = T)
pu <- aggregate(phat ~ School, mean, data=subset(dta2_m0, Answer == "Unsure"))
py <- aggregate(phat ~ School, mean, data=subset(dta2_m0, Answer == "Yes"))

# add phat = 0 to S04 and S14 in the No answer category
# fix(pn)
pn <- rbind(pn, c("S04", 0), c("S14", 0)) 

# put them in the right order by school
pn <- pn[order(pn$School),]

# append predicted prob to the observed p-table
dta2pl$phat <- c(pn[,2], pu[,2], py[,2])

dta2pl <- dta2pl %>% 
  mutate(phat = as.numeric(phat))

# plot observed categ. prop and fitted prob. against Task
ggplot(dta2pl, aes(Task, Freq, color = Answer)) +
  geom_point(alpha = .3)+
  stat_smooth(method = "lm", se = F) +
  geom_point(aes(Task, phat, color = Answer), pch = 1)+
  stat_smooth(aes(Task, phat, color = Answer),
                  method = "lm", se = F, lty = 2, lwd = .8) +
  scale_y_continuous(limits=c(0, 1)) +
  labs(x = "Task", y = "Mean observed and fitted catergorical responses (school)") +
  theme(legend.position = c(.9, .5))

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# The end

2.6 Reference

Source: Raudenbush, S.W., Rowan, B., & Cheong, Y. (1993). Teaching as a non-routine task: Implications for the organizational design of schools. Educational Administrative Quarterly, 29(4), 479-500.

3 Inclass 3

3.1 Info

Problem: Reproduce the results of analysis in the Rutger alcohol problem index example of longitudinal count data using generalized linear mixed models.
Data: The dataset is taken from an intervention study on problematic drinking in college students. Alcohol-related problems, as measured by the Rutgers Alcohol Problem Index, were recorded across 2 years (5 time points) for a sample of 881 students.

Column 1: Subject ID
Column 2: Observation ID within subject
Column 3: Gender ID
Column 4: Time in months
Column 5: Rutgers Alcohol Problem Index

3.2 Data management

# load or install packages to be used
pacman::p_load(tidyverse, lme4, glmmTMB, bbmle)

dta3 <- read.csv("rapi.csv")

names(dta3) <- c("ID","Time","Gender", "Month", "RAPI")

str(dta3)

## 'data.frame':    3616 obs. of  5 variables:
##  $ ID    : chr  "S1001" "S1001" "S1001" "S1002" ...
##  $ Time  : int  1 2 3 1 2 3 4 5 1 2 ...
##  $ Gender: chr  "Men" "Men" "Men" "Women" ...
##  $ Month : int  0 6 18 0 6 12 18 24 0 12 ...
##  $ RAPI  : int  0 0 0 3 6 5 4 5 9 1 ...

head(dta3)

##      ID Time Gender Month RAPI
## 1 S1001    1    Men     0    0
## 2 S1001    2    Men     6    0
## 3 S1001    3    Men    18    0
## 4 S1002    1  Women     0    3
## 5 S1002    2  Women     6    6
## 6 S1002    3  Women    12    5

# covert to factor type and add variables for later use
dta3$YearC <- (dta3$Month - mean(unique(dta3$Month))) / 12

##

# numerical summaries
dim(dta3)

## [1] 3616    6

length(unique(dta3$ID))

## [1] 818

table(table(dta3$ID))

## 
##   1   2   3   4   5 
##  25  38  66 128 561

#
dta3 %>%
 group_by(Gender, Time) %>%
 summarize( mean(RAPI), var(RAPI), sum(RAPI < 1)/n()) %>%
 as.data.frame

## `summarise()` regrouping output by 'Gender' (override with `.groups` argument)

##    Gender Time mean(RAPI) var(RAPI) sum(RAPI < 1)/n()
## 1     Men    1   7.700288  73.80587        0.11815562
## 2     Men    2   8.563253 123.72710        0.14156627
## 3     Men    3   7.632588 125.04727        0.19488818
## 4     Men    4   7.861210 154.37710        0.22419929
## 5     Men    5   7.464789 127.50465        0.26291080
## 6   Women    1   6.335456  49.15957        0.09766454
## 7   Women    2   5.624729  60.65669        0.20173536
## 8   Women    3   4.490950  39.14844        0.25791855
## 9   Women    4   4.936275  63.54875        0.28186275
## 10  Women    5   4.422414  64.45218        0.34482759

# set plot theme
ot <- theme_set(theme_bw())

#

# Compare distributions with different counts
ggplot(dta3, aes(RAPI, ..density..)) +
 stat_bin(binwidth = 0.9) +
 facet_grid(Gender ~ as.factor(Month)) +
 labs(x = "Rutgers Alcohol Problem Index")

#
ggplot(dta3, aes(Month, RAPI, col = Gender)) +
 stat_summary(fun.data = "mean_se") +
 stat_smooth(method = "glm", method.args = list(family = poisson)) +
 labs(y = "Mean Rutgers Alcohol Problem Index", x = "Time (in months)") +
 theme(legend.position = c(.15, .15))

## `geom_smooth()` using formula 'y ~ x'

# individual plots
# Randomly select about 1/3 individuals
# set random seed for reproduction
set.seed(11092016)

# one-third of sample
dta3$ID <-factor(dta3$ID)
n3 <- sample(dta3$ID, round(length(levels(dta3$ID))*1/3))

# 
ggplot(dta3[(dta3$ID %in% n3), ], aes(Month, RAPI, group = ID)) +
  geom_line(alpha = .3) +
  geom_point(alpha = .3) +
  stat_smooth(aes(group = 1), method = "glm",
              method.args = list(family = poisson))+
  facet_wrap( ~ Gender) +
  labs(x = "Time (in months)", y = "Rutgers Alcohol Problem Index") +
  theme(legend.position = "NONE")

## `geom_smooth()` using formula 'y ~ x'

##
# Model with  interaction and random cluster by ID
summary(m0 <- glmer(RAPI ~ Gender*YearC + (1 | ID), data = dta3,
                           family = poisson))

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: RAPI ~ Gender * YearC + (1 | ID)
##    Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##    24094    24125   -12042    24084     3611 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.7683 -1.0923 -0.3250  0.7538 15.4325 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  ID     (Intercept) 1.04     1.02    
## Number of obs: 3616, groups:  ID, 818
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.55567    0.05691  27.337  < 2e-16 ***
## GenderWomen       -0.34717    0.07502  -4.628 3.70e-06 ***
## YearC             -0.01429    0.01338  -1.068    0.285    
## GenderWomen:YearC -0.12146    0.01910  -6.358 2.05e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) GndrWm YearC 
## GenderWomen -0.755              
## YearC        0.030 -0.023       
## GndrWmn:YrC -0.021  0.035 -0.700

# add individual observation variability
summary(m1 <- glmer(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),
                    data = dta3, family = poisson))

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
##    Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19448.0  19497.5  -9716.0  19432.0     3608 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.03681 -0.53730 -0.03497  0.24218  1.29823 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev. Corr
##  ID:Time (Intercept) 0.3760   0.6132       
##  ID      (Intercept) 1.1197   1.0581       
##          YearC       0.3352   0.5790   0.64
## Number of obs: 3616, groups:  ID:Time, 3616; ID, 818
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.32445    0.06265  21.140  < 2e-16 ***
## GenderWomen       -0.39203    0.08215  -4.772 1.82e-06 ***
## YearC             -0.26335    0.04620  -5.700 1.20e-08 ***
## GenderWomen:YearC -0.19135    0.06025  -3.176  0.00149 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) GndrWm YearC 
## GenderWomen -0.753              
## YearC        0.498 -0.371       
## GndrWmn:YrC -0.373  0.497 -0.739

# add random slopes uncorrelated with intercepts
m2 <- glmer(RAPI ~ Gender*YearC + (YearC - 1 | ID) + (1 | ID) + 
               (1 | ID:Time),
               data = dta3, family = poisson)

## compare nested models - max-likelihood
anova(m0, m1, m2)

## Data: dta3
## Models:
## m0: RAPI ~ Gender * YearC + (1 | ID)
## m2: RAPI ~ Gender * YearC + (YearC - 1 | ID) + (1 | ID) + (1 | ID:Time)
## m1: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
##    npar   AIC   BIC   logLik deviance   Chisq Df Pr(>Chisq)    
## m0    5 24094 24125 -12042.0    24084                          
## m2    7 19563 19606  -9774.3    19549 4535.51  2  < 2.2e-16 ***
## m1    8 19448 19498  -9716.0    19432  116.54  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# residual plot
plot(m2, pch = 20, cex = .6, xlab = "Fitted values")

## Residuals over time
plot(m2, resid(.) ~ fitted(.) | as.factor(Month) + Gender, abline = c(h = 0),
     type = c("p", "smooth"), pch = 20, cex = .6,
     xlab = "Fitted values", ylab = "Pearson residuals")

# normality check
lattice::qqmath(m2, type = c("p", "g"))

# unit-specific rate ratios
print(exp(fixef(m2)), 3)

##       (Intercept)       GenderWomen             YearC GenderWomen:YearC 
##             3.835             0.684             0.844             0.835

#
VarCorr(m2)

##  Groups  Name        Std.Dev.
##  ID.Time (Intercept) 0.61780 
##  ID      (Intercept) 0.99997 
##  ID.1    YearC       0.51389

##
# negative binomial, zero-inflated negative binomial
##

# mean linear related to variance
summary(mnb1 <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),
                          data = dta3, family = nbinom1))

##  Family: nbinom1  ( log )
## Formula:          RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19416.0  19471.8  -9699.0  19398.0     3607 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance Std.Dev. Corr 
##  ID      (Intercept) 0.9538   0.9766        
##          YearC       0.2497   0.4997   0.64 
##  ID:Time (Intercept) 0.1922   0.4384        
## Number of obs: 3616, groups:  ID, 818; ID:Time, 3616
## 
## Overdispersion parameter for nbinom1 family (): 1.06 
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.44252    0.06147  23.468  < 2e-16 ***
## GenderWomen       -0.37184    0.07719  -4.817 1.45e-06 ***
## YearC             -0.24691    0.04335  -5.695 1.23e-08 ***
## GenderWomen:YearC -0.17239    0.05613  -3.071  0.00213 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# variance to mean with a quadratic trend
mnb2 <- update(mnb1, . ~ ., family = nbinom2)

summary(mnb2)

##  Family: nbinom2  ( log )
## Formula:          
## RAPI ~ Gender + YearC + (YearC | ID) + (1 | ID:Time) + Gender:YearC
## Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19391.5  19447.3  -9686.8  19373.5     3607 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance  Std.Dev.  Corr 
##  ID      (Intercept) 1.105e+00 1.0512343      
##          YearC       3.022e-01 0.5497099 0.67 
##  ID:Time (Intercept) 5.242e-09 0.0000724      
## Number of obs: 3616, groups:  ID, 818; ID:Time, 3616
## 
## Overdispersion parameter for nbinom2 family (): 2.52 
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.51826    0.06294  24.123  < 2e-16 ***
## GenderWomen       -0.39704    0.08224  -4.828 1.38e-06 ***
## YearC             -0.25902    0.04638  -5.585 2.34e-08 ***
## GenderWomen:YearC -0.19783    0.06015  -3.289  0.00101 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Include zero-inflation term
summary(mznb2 <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),
                          zi = ~ Gender + YearC,
                          data = dta3, family = nbinom2))

##  Family: nbinom2  ( log )
## Formula:          RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## Zero inflation:        ~Gender + YearC
## Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19297.2  19371.5  -9636.6  19273.2     3604 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance  Std.Dev.  Corr 
##  ID      (Intercept) 1.094e+00 1.046e+00      
##          YearC       2.976e-01 5.456e-01 0.64 
##  ID:Time (Intercept) 9.484e-09 9.739e-05      
## Number of obs: 3616, groups:  ID, 818; ID:Time, 3616
## 
## Overdispersion parameter for nbinom2 family (): 3.87 
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.55715    0.06323  24.625  < 2e-16 ***
## GenderWomen       -0.38236    0.08250  -4.635 3.57e-06 ***
## YearC             -0.22259    0.04658  -4.779 1.76e-06 ***
## GenderWomen:YearC -0.19707    0.05869  -3.358 0.000786 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Zero-inflation model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.0617     0.2430 -12.598   <2e-16 ***
## GenderWomen   0.2126     0.2901   0.733   0.4638    
## YearC         0.4406     0.1936   2.275   0.0229 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# add dispersion over time
summary(mznb2a <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID), data = dta3,
                           zi = ~ YearC,
                           dispformula = ~ YearC,
                           family = nbinom2))

##  Family: nbinom2  ( log )
## Formula:          RAPI ~ Gender * YearC + (YearC | ID)
## Zero inflation:        ~YearC
## Dispersion:            ~YearC
## Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19291.0  19359.1  -9634.5  19269.0     3605 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev. Corr 
##  ID     (Intercept) 1.0920   1.0450        
##         YearC       0.2881   0.5367   0.66 
## Number of obs: 3616, groups:  ID, 818
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.56590    0.06304  24.842  < 2e-16 ***
## GenderWomen       -0.39309    0.08133  -4.833 1.34e-06 ***
## YearC             -0.22937    0.04761  -4.817 1.45e-06 ***
## GenderWomen:YearC -0.19880    0.05828  -3.411 0.000647 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Zero-inflation model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.9281     0.1601 -18.289  < 2e-16 ***
## YearC         0.5540     0.2024   2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Dispersion model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.39174    0.07603  18.304   <2e-16 ***
## YearC        0.24835    0.11268   2.204   0.0275 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# model comparisions - across different models
# need package bbmle
AICtab(mnb1, mnb2, mznb2, mznb2a, base = T, logLik = T)

##        logLik  AIC     dLogLik dAIC    df
## mznb2a -9634.5 19291.0    64.5     0.0 11
## mznb2  -9636.6 19297.2    62.4     6.2 12
## mnb2   -9686.8 19391.5    12.2   100.6 9 
## mnb1   -9699.0 19416.0     0.0   125.0 9

## diagnostics
# fortify data with residuals and fitted values
dta3$rs <- resid(mznb2a)
dta3$yhat <- predict(mznb2a)

## place observed and fitted CIs side-by-side
ggplot(dta3, aes(Month, RAPI, col = Gender)) +
 stat_summary(fun.data = "mean_cl_boot") +
 stat_summary(aes(y = yhat), fun.data = "mean_cl_boot", pch = 20, 
              linetype = "dashed", position = position_dodge(width = .9)) +
 labs(y = "Mean Rutgers Alcohol Problem Index", x = "Time (in months)") +
 theme(legend.position = c(.1, .1))

# residual plot
ggplot(dta3, aes(x = yhat, y = scale(rs))) +
  geom_point(col = "steelblue", alpha = I(0.2)) +
  geom_hline(yintercept = 0, linetype = 2) +
  facet_grid(Gender ~ Month) +
  labs(x = "Fitted values", y = "Standardized residuals")

##
# zero-inflated gamma distribution or mixture of gamma distributions?
###

3.3 Reference

Source: Atkins, D.C., Baldwin, S.A., Zheng, C., Gallop, R.J., & Neighbors, C. (2013). A tutorial on count regression and zero-altered count models for longitudinal substance use data. Psycholology of Addictive Behavior. 27(1), 166-177.

Inclass: Generalized linear mixed models

Chang-Chun Chen

2020-11-26