The dataset is taken from an intervention study on problematic drinking in college students.
Alcohol-related problems, as measured by the Rutgers Alcohol Problem Index, were recorded across 2 years (5 time points) for a sample of 881 students.
Source: Atkins, D.C., Baldwin, S.A., Zheng, C., Gallop, R.J., & Neighbors, C. (2013). A tutorial on count regression and zero-altered count models for longitudinal substance use data. Psycholology of Addictive Behavior. 27(1), 166-177.
Column 1: Subject ID Column 2: Observation ID within subject Column 3: Gender ID Column 4: Time in months Column 5: Rutgers Alcohol Problem Index
Reference:https://adai.uw.edu/instruments/pdf/Rutgers_Alcohol_Problem_Index_210.pdf
# set path to working directory
#setwd("C:/Users/Ching-Fan Sheu/Desktop/mlm_glmm")
# set options for the current session
options(digits = 4, show.signif.stars = FALSE)
# load package management
#library(pacman)
# load or install packages to be used
pacman::p_load(tidyverse, lme4, glmmTMB, bbmle)
#input data
dta <- read.csv("rapi.csv")
head(dta,10)
## ID Obs_ID Gender Month RAPI
## 1 S1001 1 Men 0 0
## 2 S1001 2 Men 6 0
## 3 S1001 3 Men 18 0
## 4 S1002 1 Women 0 3
## 5 S1002 2 Women 6 6
## 6 S1002 3 Women 12 5
## 7 S1002 4 Women 18 4
## 8 S1002 5 Women 24 5
## 9 S1003 1 Men 0 9
## 10 S1003 2 Men 12 1
names(dta) <- c("ID","Time","Gender", "Month", "RAPI")
# show data of structure
str(dta)
## 'data.frame': 3616 obs. of 5 variables:
## $ ID : chr "S1001" "S1001" "S1001" "S1002" ...
## $ Time : int 1 2 3 1 2 3 4 5 1 2 ...
## $ Gender: chr "Men" "Men" "Men" "Women" ...
## $ Month : int 0 6 18 0 6 12 18 24 0 12 ...
## $ RAPI : int 0 0 0 3 6 5 4 5 9 1 ...
#show first 8 lines
head(dta,8)
## ID Time Gender Month RAPI
## 1 S1001 1 Men 0 0
## 2 S1001 2 Men 6 0
## 3 S1001 3 Men 18 0
## 4 S1002 1 Women 0 3
## 5 S1002 2 Women 6 6
## 6 S1002 3 Women 12 5
## 7 S1002 4 Women 18 4
## 8 S1002 5 Women 24 5
# covert to factor type and add variables for later use
dta$YearC <- (dta$Month - mean(unique(dta$Month))) / 12 #unique()萃取資料中的element
##
# numerical summaries
dim(dta) #查詢dta各維度長度
## [1] 3616 6
#length()查詢向量的長度
length(unique(dta$ID))
## [1] 818
ID從S1001到S1818
table(table(dta$ID))
##
## 1 2 3 4 5
## 25 38 66 128 561
觀察次數5次的受試者有561位
#
dta %>%
group_by(Gender, Time) %>% #以Team和Time做分組依據
summarize( mean(RAPI), var(RAPI), sum(RAPI < 1)/n()) %>% #計算RAPI的mean、var、sum
as.data.frame #將列表資料轉換為資料框架
## `summarise()` regrouping output by 'Gender' (override with `.groups` argument)
## Gender Time mean(RAPI) var(RAPI) sum(RAPI < 1)/n()
## 1 Men 1 7.700 73.81 0.11816
## 2 Men 2 8.563 123.73 0.14157
## 3 Men 3 7.633 125.05 0.19489
## 4 Men 4 7.861 154.38 0.22420
## 5 Men 5 7.465 127.50 0.26291
## 6 Women 1 6.335 49.16 0.09766
## 7 Women 2 5.625 60.66 0.20174
## 8 Women 3 4.491 39.15 0.25792
## 9 Women 4 4.936 63.55 0.28186
## 10 Women 5 4.422 64.45 0.34483
# set plot theme
ot <- theme_set(theme_bw())
#
# Compare distributions with different counts
ggplot(dta, aes(RAPI, ..density..)) +
stat_bin(binwidth = 0.9) + #stat_bin()使用binwidth = 0.9選寬度
facet_grid(Gender ~ as.factor(Month)) + #facet_grid() forms a matrix of panels defined by row and column faceting variables.
labs(x = "Rutgers Alcohol Problem Index")
一開始(0個月),男性女性不酗酒的機率差不多(density約0.1),追蹤6、12、18、24個月,男性女性不酗酒的機率越高,但女性高於男性。
ggplot(dta, aes(Month, RAPI, col = Gender)) +
stat_summary(fun.data = "mean_se") +
stat_smooth(method = "glm", method.args = list(family = poisson)) +
labs(y = "Mean Rutgers Alcohol Problem Index", x = "Time (in months)") +
theme(legend.position = c(.1, .1))
## `geom_smooth()` using formula 'y ~ x'
# Randomly select about 1/3 individuals
# set random seed for reproduction
set.seed(11092016)
# one-third of sample
dta$ID <-factor(dta$ID)
n3 <- sample(dta$ID, round(length(levels(dta$ID))*.33))
#n3 <- sample(levels(dta$ID), round(length(levels(dta$ID))*.33))
#
ggplot(dta[(dta$ID %in% n3), ], aes(Month, RAPI, group = ID)) +
geom_line(alpha = .3) +
geom_point(alpha = .3) +
stat_smooth(aes(group = 1), method = "glm",
method.args = list(family = poisson))+
facet_wrap(~ Gender) +
labs(x = "Time (in months)", y = "Rutgers Alcohol Problem Index") +
theme(legend.position = "NONE")
## `geom_smooth()` using formula 'y ~ x'
# Model with interaction and random cluster by ID
summary(m0 <- glmer(RAPI ~ Gender*YearC + (1 | ID), data = dta,
family = poisson))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: RAPI ~ Gender * YearC + (1 | ID)
## Data: dta
##
## AIC BIC logLik deviance df.resid
## 24094 24125 -12042 24084 3611
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.768 -1.092 -0.325 0.754 15.433
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 1.04 1.02
## Number of obs: 3616, groups: ID, 818
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.5557 0.0569 27.34 < 2e-16
## GenderWomen -0.3472 0.0750 -4.63 3.7e-06
## YearC -0.0143 0.0134 -1.07 0.29
## GenderWomen:YearC -0.1215 0.0191 -6.36 2.0e-10
##
## Correlation of Fixed Effects:
## (Intr) GndrWm YearC
## GenderWomen -0.755
## YearC 0.030 -0.023
## GndrWmn:YrC -0.021 0.035 -0.700
summary(m1 <- glmer(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),data = dta, family = poisson))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## Data: dta
##
## AIC BIC logLik deviance df.resid
## 19448 19498 -9716 19432 3608
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.037 -0.537 -0.035 0.242 1.298
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ID:Time (Intercept) 0.376 0.613
## ID (Intercept) 1.120 1.058
## YearC 0.335 0.579 0.64
## Number of obs: 3616, groups: ID:Time, 3616; ID, 818
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.3244 0.0627 21.14 < 2e-16
## GenderWomen -0.3920 0.0822 -4.77 1.8e-06
## YearC -0.2634 0.0462 -5.70 1.2e-08
## GenderWomen:YearC -0.1914 0.0603 -3.18 0.0015
##
## Correlation of Fixed Effects:
## (Intr) GndrWm YearC
## GenderWomen -0.753
## YearC 0.498 -0.371
## GndrWmn:YrC -0.373 0.497 -0.739
plot(m2, pch = 20, cex = .6, xlab = "Fitted values")
plot(m2, resid(.) ~ fitted(.) | as.factor(Month) + Gender, abline = c(h = 0),
type = c("p", "smooth"), pch = 20, cex = .6,
xlab = "Fitted values", ylab = "Pearson residuals")
lattice::qqmath(m2, type = c("p", "g"))
print(exp(fixef(m2)), 3)
## (Intercept) GenderWomen YearC GenderWomen:YearC
## 3.835 0.684 0.844 0.835
VarCorr(m2)
## Groups Name Std.Dev.
## ID.Time (Intercept) 0.618
## ID (Intercept) 1.000
## ID.1 YearC 0.514
# mean linear related to variance
summary(mnb1 <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),data = dta, family = nbinom1))
## Family: nbinom1 ( log )
## Formula: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## Data: dta
##
## AIC BIC logLik deviance df.resid
## 19416 19472 -9699 19398 3607
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## ID (Intercept) 0.954 0.977
## YearC 0.250 0.500 0.64
## ID:Time (Intercept) 0.192 0.438
## Number of obs: 3616, groups: ID, 818; ID:Time, 3616
##
## Overdispersion parameter for nbinom1 family (): 1.06
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.4425 0.0615 23.47 < 2e-16
## GenderWomen -0.3718 0.0772 -4.82 1.5e-06
## YearC -0.2469 0.0434 -5.70 1.2e-08
## GenderWomen:YearC -0.1724 0.0561 -3.07 0.0021
# variance to mean with a quadratic trend
mnb2 <- update(mnb1, . ~ ., family = nbinom2)
summary(mnb2)
## Family: nbinom2 ( log )
## Formula:
## RAPI ~ Gender + YearC + (YearC | ID) + (1 | ID:Time) + Gender:YearC
## Data: dta
##
## AIC BIC logLik deviance df.resid
## 19392 19447 -9687 19374 3607
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## ID (Intercept) 1.11e+00 1.05e+00
## YearC 3.02e-01 5.50e-01 0.67
## ID:Time (Intercept) 5.24e-09 7.24e-05
## Number of obs: 3616, groups: ID, 818; ID:Time, 3616
##
## Overdispersion parameter for nbinom2 family (): 2.52
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.5183 0.0629 24.12 < 2e-16
## GenderWomen -0.3970 0.0822 -4.83 1.4e-06
## YearC -0.2590 0.0464 -5.58 2.3e-08
## GenderWomen:YearC -0.1978 0.0602 -3.29 0.001
# Include zero-inflation term
summary(mznb2 <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),zi = ~ Gender + YearC, data = dta, family = nbinom2))
## Family: nbinom2 ( log )
## Formula: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## Zero inflation: ~Gender + YearC
## Data: dta
##
## AIC BIC logLik deviance df.resid
## 19297 19372 -9637 19273 3604
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## ID (Intercept) 1.09e+00 1.05e+00
## YearC 2.98e-01 5.46e-01 0.64
## ID:Time (Intercept) 9.48e-09 9.74e-05
## Number of obs: 3616, groups: ID, 818; ID:Time, 3616
##
## Overdispersion parameter for nbinom2 family (): 3.87
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.5572 0.0632 24.63 < 2e-16
## GenderWomen -0.3824 0.0825 -4.63 3.6e-06
## YearC -0.2226 0.0466 -4.78 1.8e-06
## GenderWomen:YearC -0.1971 0.0587 -3.36 0.00079
##
## Zero-inflation model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.062 0.243 -12.60 <2e-16
## GenderWomen 0.213 0.290 0.73 0.464
## YearC 0.441 0.194 2.28 0.023
# add dispersion over time
summary(mznb2a <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID), data = dta,zi = ~ YearC, dispformula = ~ YearC, family = nbinom2))
## Family: nbinom2 ( log )
## Formula: RAPI ~ Gender * YearC + (YearC | ID)
## Zero inflation: ~YearC
## Dispersion: ~YearC
## Data: dta
##
## AIC BIC logLik deviance df.resid
## 19291 19359 -9634 19269 3605
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## ID (Intercept) 1.092 1.045
## YearC 0.288 0.537 0.66
## Number of obs: 3616, groups: ID, 818
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.5659 0.0630 24.84 < 2e-16
## GenderWomen -0.3931 0.0813 -4.83 1.3e-06
## YearC -0.2294 0.0476 -4.82 1.5e-06
## GenderWomen:YearC -0.1988 0.0583 -3.41 0.00065
##
## Zero-inflation model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.928 0.160 -18.29 <2e-16
## YearC 0.554 0.202 2.74 0.0062
##
## Dispersion model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.392 0.076 18.3 <2e-16
## YearC 0.248 0.113 2.2 0.028
# need package bbmle
AICtab(mnb1, mnb2, mznb2, mznb2a, base = T, logLik = T)
## logLik AIC dLogLik dAIC df
## mznb2a -9634.5 19291.0 64.5 0.0 11
## mznb2 -9636.6 19297.2 62.4 6.2 12
## mnb2 -9686.8 19391.5 12.2 100.6 9
## mnb1 -9699.0 19416.0 0.0 125.0 9
# fortify data with residuals and fitted values
dta$rs <- resid(mznb2a)
dta$yhat <- predict(mznb2a)
## place observed and fitted CIs side-by-side
ggplot(dta, aes(Month, RAPI, col = Gender)) +
stat_summary(fun.data = "mean_cl_boot") +
stat_summary(aes(y = yhat), fun.data = "mean_cl_boot", pch = 20,
linetype = "dashed", position = position_dodge(width = .9)) +
labs(y = "Mean Rutgers Alcohol Problem Index", x = "Time (in months)") +
theme(legend.position = c(.1, .1))
ggplot(dta, aes(x = yhat, y = scale(rs))) +
geom_point(col = "steelblue", alpha = I(0.2)) +
geom_hline(yintercept = 0, linetype = 2) +
facet_grid(Gender ~ Month) +
labs(x = "Fitted values", y = "Standardized residuals")
##
# zero-inflated gamma distribution or mixture of gamma distributions?
###