1 Introduction

The dataset is taken from an intervention study on problematic drinking in college students.

Alcohol-related problems, as measured by the Rutgers Alcohol Problem Index, were recorded across 2 years (5 time points) for a sample of 881 students.

Source: Atkins, D.C., Baldwin, S.A., Zheng, C., Gallop, R.J., & Neighbors, C. (2013). A tutorial on count regression and zero-altered count models for longitudinal substance use data. Psycholology of Addictive Behavior. 27(1), 166-177.

Column 1: Subject ID Column 2: Observation ID within subject Column 3: Gender ID Column 4: Time in months Column 5: Rutgers Alcohol Problem Index

Reference：https://adai.uw.edu/instruments/pdf/Rutgers_Alcohol_Problem_Index_210.pdf

2 Data management

# set path to working directory
#setwd("C:/Users/Ching-Fan Sheu/Desktop/mlm_glmm")

# set options for the current session
options(digits = 4, show.signif.stars = FALSE)

# load package management
#library(pacman)

# load or install packages to be used
pacman::p_load(tidyverse, lme4, glmmTMB, bbmle)

#input data
dta <- read.csv("rapi.csv")
head(dta,10)

##       ID Obs_ID Gender Month RAPI
## 1  S1001      1    Men     0    0
## 2  S1001      2    Men     6    0
## 3  S1001      3    Men    18    0
## 4  S1002      1  Women     0    3
## 5  S1002      2  Women     6    6
## 6  S1002      3  Women    12    5
## 7  S1002      4  Women    18    4
## 8  S1002      5  Women    24    5
## 9  S1003      1    Men     0    9
## 10 S1003      2    Men    12    1

names(dta) <- c("ID","Time","Gender", "Month", "RAPI")

# show data of structure
str(dta)

## 'data.frame':    3616 obs. of  5 variables:
##  $ ID    : chr  "S1001" "S1001" "S1001" "S1002" ...
##  $ Time  : int  1 2 3 1 2 3 4 5 1 2 ...
##  $ Gender: chr  "Men" "Men" "Men" "Women" ...
##  $ Month : int  0 6 18 0 6 12 18 24 0 12 ...
##  $ RAPI  : int  0 0 0 3 6 5 4 5 9 1 ...

#show first 8 lines
head(dta,8)

##      ID Time Gender Month RAPI
## 1 S1001    1    Men     0    0
## 2 S1001    2    Men     6    0
## 3 S1001    3    Men    18    0
## 4 S1002    1  Women     0    3
## 5 S1002    2  Women     6    6
## 6 S1002    3  Women    12    5
## 7 S1002    4  Women    18    4
## 8 S1002    5  Women    24    5

# covert to factor type and add variables for later use
dta$YearC <- (dta$Month - mean(unique(dta$Month))) / 12 #unique()萃取資料中的element

##
# numerical summaries
dim(dta) #查詢dta各維度長度

## [1] 3616    6

#length()查詢向量的長度
length(unique(dta$ID))

## [1] 818

ID從S1001到S1818

table(table(dta$ID))

## 
##   1   2   3   4   5 
##  25  38  66 128 561

觀察次數5次的受試者有561位

#
dta %>%
 group_by(Gender, Time) %>% #以Team和Time做分組依據
 summarize( mean(RAPI), var(RAPI), sum(RAPI < 1)/n()) %>% #計算RAPI的mean、var、sum
 as.data.frame #將列表資料轉換為資料框架

## `summarise()` regrouping output by 'Gender' (override with `.groups` argument)

##    Gender Time mean(RAPI) var(RAPI) sum(RAPI < 1)/n()
## 1     Men    1      7.700     73.81           0.11816
## 2     Men    2      8.563    123.73           0.14157
## 3     Men    3      7.633    125.05           0.19489
## 4     Men    4      7.861    154.38           0.22420
## 5     Men    5      7.465    127.50           0.26291
## 6   Women    1      6.335     49.16           0.09766
## 7   Women    2      5.625     60.66           0.20174
## 8   Women    3      4.491     39.15           0.25792
## 9   Women    4      4.936     63.55           0.28186
## 10  Women    5      4.422     64.45           0.34483

# set plot theme
ot <- theme_set(theme_bw())

# 
# Compare distributions with different counts
ggplot(dta, aes(RAPI, ..density..)) + 
 stat_bin(binwidth = 0.9) + #stat_bin()使用binwidth = 0.9選寬度
 facet_grid(Gender ~ as.factor(Month)) + #facet_grid() forms a matrix of panels defined by row and column faceting variables.
 labs(x = "Rutgers Alcohol Problem Index")

一開始(0個月)，男性女性不酗酒的機率差不多(density約0.1)，追蹤6、12、18、24個月，男性女性不酗酒的機率越高，但女性高於男性。

3 Data visualization

ggplot(dta, aes(Month, RAPI, col = Gender)) +
 stat_summary(fun.data = "mean_se") + 
 stat_smooth(method = "glm", method.args = list(family = poisson)) +
 labs(y = "Mean Rutgers Alcohol Problem Index", x = "Time (in months)") +
 theme(legend.position = c(.1, .1))

## `geom_smooth()` using formula 'y ~ x'

4 individual plots

# Randomly select about 1/3 individuals
# set random seed for reproduction
set.seed(11092016)

# one-third of sample
dta$ID <-factor(dta$ID)
n3 <- sample(dta$ID, round(length(levels(dta$ID))*.33))

#n3 <- sample(levels(dta$ID), round(length(levels(dta$ID))*.33))

# 
ggplot(dta[(dta$ID %in% n3), ], aes(Month, RAPI, group = ID)) +
  geom_line(alpha = .3) +
  geom_point(alpha = .3) +
  stat_smooth(aes(group = 1), method = "glm",
              method.args = list(family = poisson))+
  facet_wrap(~ Gender) +
  labs(x = "Time (in months)", y = "Rutgers Alcohol Problem Index") +
  theme(legend.position = "NONE")

## `geom_smooth()` using formula 'y ~ x'

4.1

# Model with  interaction and random cluster by ID
summary(m0 <- glmer(RAPI ~ Gender*YearC + (1 | ID), data = dta,
                           family = poisson))

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: RAPI ~ Gender * YearC + (1 | ID)
##    Data: dta
## 
##      AIC      BIC   logLik deviance df.resid 
##    24094    24125   -12042    24084     3611 
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -6.768 -1.092 -0.325  0.754 15.433 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  ID     (Intercept) 1.04     1.02    
## Number of obs: 3616, groups:  ID, 818
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)         1.5557     0.0569   27.34  < 2e-16
## GenderWomen        -0.3472     0.0750   -4.63  3.7e-06
## YearC              -0.0143     0.0134   -1.07     0.29
## GenderWomen:YearC  -0.1215     0.0191   -6.36  2.0e-10
## 
## Correlation of Fixed Effects:
##             (Intr) GndrWm YearC 
## GenderWomen -0.755              
## YearC        0.030 -0.023       
## GndrWmn:YrC -0.021  0.035 -0.700

5 add individual observation variability

summary(m1 <- glmer(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),data = dta, family = poisson))

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
##    Data: dta
## 
##      AIC      BIC   logLik deviance df.resid 
##    19448    19498    -9716    19432     3608 
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -2.037 -0.537 -0.035  0.242  1.298 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev. Corr
##  ID:Time (Intercept) 0.376    0.613        
##  ID      (Intercept) 1.120    1.058        
##          YearC       0.335    0.579    0.64
## Number of obs: 3616, groups:  ID:Time, 3616; ID, 818
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)         1.3244     0.0627   21.14  < 2e-16
## GenderWomen        -0.3920     0.0822   -4.77  1.8e-06
## YearC              -0.2634     0.0462   -5.70  1.2e-08
## GenderWomen:YearC  -0.1914     0.0603   -3.18   0.0015
## 
## Correlation of Fixed Effects:
##             (Intr) GndrWm YearC 
## GenderWomen -0.753              
## YearC        0.498 -0.371       
## GndrWmn:YrC -0.373  0.497 -0.739

6 add random slopes uncorrelated with intercepts

m2 <- glmer(RAPI ~ Gender*YearC + (YearC - 1 | ID) + (1 | ID) + 
               (1 | ID:Time),
               data = dta, family = poisson)

6.1 compare nested models - max-likelihood

anova(m0, m1, m2)

## Data: dta
## Models:
## m0: RAPI ~ Gender * YearC + (1 | ID)
## m2: RAPI ~ Gender * YearC + (YearC - 1 | ID) + (1 | ID) + (1 | ID:Time)
## m1: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
##    npar   AIC   BIC logLik deviance Chisq Df Pr(>Chisq)
## m0    5 24094 24125 -12042    24084                    
## m2    7 19563 19606  -9774    19549  4536  2     <2e-16
## m1    8 19448 19498  -9716    19432   117  1     <2e-16

7 residual plot

plot(m2, pch = 20, cex = .6, xlab = "Fitted values")

7.1 Residuals over time

plot(m2, resid(.) ~ fitted(.) | as.factor(Month) + Gender, abline = c(h = 0),
     type = c("p", "smooth"), pch = 20, cex = .6,
     xlab = "Fitted values", ylab = "Pearson residuals")

8 normality check

lattice::qqmath(m2, type = c("p", "g"))

9 unit-specific rate ratios

print(exp(fixef(m2)), 3)

##       (Intercept)       GenderWomen             YearC GenderWomen:YearC 
##             3.835             0.684             0.844             0.835

10

VarCorr(m2)

##  Groups  Name        Std.Dev.
##  ID.Time (Intercept) 0.618   
##  ID      (Intercept) 1.000   
##  ID.1    YearC       0.514

10.1

11 negative binomial, zero-inflated negative binomial

11.1

# mean linear related to variance
summary(mnb1 <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),data = dta, family = nbinom1))

##  Family: nbinom1  ( log )
## Formula:          RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## Data: dta
## 
##      AIC      BIC   logLik deviance df.resid 
##    19416    19472    -9699    19398     3607 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance Std.Dev. Corr 
##  ID      (Intercept) 0.954    0.977         
##          YearC       0.250    0.500    0.64 
##  ID:Time (Intercept) 0.192    0.438         
## Number of obs: 3616, groups:  ID, 818; ID:Time, 3616
## 
## Overdispersion parameter for nbinom1 family (): 1.06 
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)         1.4425     0.0615   23.47  < 2e-16
## GenderWomen        -0.3718     0.0772   -4.82  1.5e-06
## YearC              -0.2469     0.0434   -5.70  1.2e-08
## GenderWomen:YearC  -0.1724     0.0561   -3.07   0.0021

# variance to mean with a quadratic trend
mnb2 <- update(mnb1, . ~ ., family = nbinom2)

summary(mnb2)

##  Family: nbinom2  ( log )
## Formula:          
## RAPI ~ Gender + YearC + (YearC | ID) + (1 | ID:Time) + Gender:YearC
## Data: dta
## 
##      AIC      BIC   logLik deviance df.resid 
##    19392    19447    -9687    19374     3607 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance Std.Dev. Corr 
##  ID      (Intercept) 1.11e+00 1.05e+00      
##          YearC       3.02e-01 5.50e-01 0.67 
##  ID:Time (Intercept) 5.24e-09 7.24e-05      
## Number of obs: 3616, groups:  ID, 818; ID:Time, 3616
## 
## Overdispersion parameter for nbinom2 family (): 2.52 
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)         1.5183     0.0629   24.12  < 2e-16
## GenderWomen        -0.3970     0.0822   -4.83  1.4e-06
## YearC              -0.2590     0.0464   -5.58  2.3e-08
## GenderWomen:YearC  -0.1978     0.0602   -3.29    0.001

# Include zero-inflation term
summary(mznb2 <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),zi = ~ Gender + YearC, data = dta, family = nbinom2))

##  Family: nbinom2  ( log )
## Formula:          RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## Zero inflation:        ~Gender + YearC
## Data: dta
## 
##      AIC      BIC   logLik deviance df.resid 
##    19297    19372    -9637    19273     3604 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance Std.Dev. Corr 
##  ID      (Intercept) 1.09e+00 1.05e+00      
##          YearC       2.98e-01 5.46e-01 0.64 
##  ID:Time (Intercept) 9.48e-09 9.74e-05      
## Number of obs: 3616, groups:  ID, 818; ID:Time, 3616
## 
## Overdispersion parameter for nbinom2 family (): 3.87 
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)         1.5572     0.0632   24.63  < 2e-16
## GenderWomen        -0.3824     0.0825   -4.63  3.6e-06
## YearC              -0.2226     0.0466   -4.78  1.8e-06
## GenderWomen:YearC  -0.1971     0.0587   -3.36  0.00079
## 
## Zero-inflation model:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -3.062      0.243  -12.60   <2e-16
## GenderWomen    0.213      0.290    0.73    0.464
## YearC          0.441      0.194    2.28    0.023

# add dispersion over time
summary(mznb2a <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID), data = dta,zi = ~ YearC, dispformula = ~ YearC, family = nbinom2))

##  Family: nbinom2  ( log )
## Formula:          RAPI ~ Gender * YearC + (YearC | ID)
## Zero inflation:        ~YearC
## Dispersion:            ~YearC
## Data: dta
## 
##      AIC      BIC   logLik deviance df.resid 
##    19291    19359    -9634    19269     3605 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev. Corr 
##  ID     (Intercept) 1.092    1.045         
##         YearC       0.288    0.537    0.66 
## Number of obs: 3616, groups:  ID, 818
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)         1.5659     0.0630   24.84  < 2e-16
## GenderWomen        -0.3931     0.0813   -4.83  1.3e-06
## YearC              -0.2294     0.0476   -4.82  1.5e-06
## GenderWomen:YearC  -0.1988     0.0583   -3.41  0.00065
## 
## Zero-inflation model:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -2.928      0.160  -18.29   <2e-16
## YearC          0.554      0.202    2.74   0.0062
## 
## Dispersion model:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)    1.392      0.076    18.3   <2e-16
## YearC          0.248      0.113     2.2    0.028

12 model comparisions - across different models

# need package bbmle
AICtab(mnb1, mnb2, mznb2, mznb2a, base = T, logLik = T)

##        logLik  AIC     dLogLik dAIC    df
## mznb2a -9634.5 19291.0    64.5     0.0 11
## mznb2  -9636.6 19297.2    62.4     6.2 12
## mnb2   -9686.8 19391.5    12.2   100.6 9 
## mnb1   -9699.0 19416.0     0.0   125.0 9

12.1 diagnostics

# fortify data with residuals and fitted values
dta$rs <- resid(mznb2a)
dta$yhat <- predict(mznb2a)

## place observed and fitted CIs side-by-side
ggplot(dta, aes(Month, RAPI, col = Gender)) +
 stat_summary(fun.data = "mean_cl_boot") +
 stat_summary(aes(y = yhat), fun.data = "mean_cl_boot", pch = 20, 
              linetype = "dashed", position = position_dodge(width = .9)) +
 labs(y = "Mean Rutgers Alcohol Problem Index", x = "Time (in months)") +
 theme(legend.position = c(.1, .1))

13 residual plot

ggplot(dta, aes(x = yhat, y = scale(rs))) +
  geom_point(col = "steelblue", alpha = I(0.2)) +
  geom_hline(yintercept = 0, linetype = 2) +
  facet_grid(Gender ~ Month) +
  labs(x = "Fitted values", y = "Standardized residuals")

##
# zero-inflated gamma distribution or mixture of gamma distributions?
###

Week11 in-class exercise3：Rutger alcohol problem index

Ching-Fang Wu

Sun Nov 29 14:44:03 2020