# load or install packages to be used
pacman::p_load(tidyverse, lme4, glmmTMB, bbmle)
dta3 <- read.csv("rapi.csv")
names(dta3) <- c("ID","Time","Gender", "Month", "RAPI")
str(dta3)
## 'data.frame': 3616 obs. of 5 variables:
## $ ID : chr "S1001" "S1001" "S1001" "S1002" ...
## $ Time : int 1 2 3 1 2 3 4 5 1 2 ...
## $ Gender: chr "Men" "Men" "Men" "Women" ...
## $ Month : int 0 6 18 0 6 12 18 24 0 12 ...
## $ RAPI : int 0 0 0 3 6 5 4 5 9 1 ...
## ID Time Gender Month RAPI
## 1 S1001 1 Men 0 0
## 2 S1001 2 Men 6 0
## 3 S1001 3 Men 18 0
## 4 S1002 1 Women 0 3
## 5 S1002 2 Women 6 6
## 6 S1002 3 Women 12 5
# covert to factor type and add variables for later use
dta3$YearC <- (dta3$Month - mean(unique(dta3$Month))) / 12
# numerical summaries
dim(dta3)
## [1] 3616 6
## [1] 818
##
## 1 2 3 4 5
## 25 38 66 128 561
#
dta3 %>%
group_by(Gender, Time) %>%
summarize( mean(RAPI), var(RAPI), sum(RAPI < 1)/n()) %>%
as.data.frame
## `summarise()` regrouping output by 'Gender' (override with `.groups` argument)
## Gender Time mean(RAPI) var(RAPI) sum(RAPI < 1)/n()
## 1 Men 1 7.700288 73.80587 0.11815562
## 2 Men 2 8.563253 123.72710 0.14156627
## 3 Men 3 7.632588 125.04727 0.19488818
## 4 Men 4 7.861210 154.37710 0.22419929
## 5 Men 5 7.464789 127.50465 0.26291080
## 6 Women 1 6.335456 49.15957 0.09766454
## 7 Women 2 5.624729 60.65669 0.20173536
## 8 Women 3 4.490950 39.14844 0.25791855
## 9 Women 4 4.936275 63.54875 0.28186275
## 10 Women 5 4.422414 64.45218 0.34482759
#..density..
ggplot(dta3, aes(RAPI, ..density..)) +
stat_bin(binwidth = 0.9) +
facet_grid(Gender ~ as.factor(Month)) +
labs(x = "Rutgers Alcohol Problem Index")
#glm
ggplot(dta3, aes(Month, RAPI, col = Gender)) +
stat_summary(fun.data = "mean_se") +
stat_smooth(method = "glm", method.args = list(family = poisson)) +
labs(y = "Mean Rutgers Alcohol Problem Index", x = "Time (in months)") +
theme(legend.position = c(.15, .15))
## `geom_smooth()` using formula 'y ~ x'
# Randomly select about 1/3 individuals
# set random seed for reproduction
set.seed(11092016)
# one-third of sample
dta3$ID <-factor(dta3$ID)
n3 <- sample(dta3$ID, round(length(levels(dta3$ID))*1/3))
#
ggplot(dta3[(dta3$ID %in% n3), ], aes(Month, RAPI, group = ID)) +
geom_line(alpha = .3) +
geom_point(alpha = .3) +
stat_smooth(aes(group = 1), method = "glm",
method.args = list(family = poisson))+
facet_wrap( ~ Gender) +
labs(x = "Time (in months)", y = "Rutgers Alcohol Problem Index") +
theme(legend.position = "NONE")
## `geom_smooth()` using formula 'y ~ x'
alcohol problemm index 個體間的差異很大
# Model with interaction and random cluster by ID
summary(m0 <- glmer(RAPI ~ Gender*YearC + (1 | ID), data = dta3,
family = poisson))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: RAPI ~ Gender * YearC + (1 | ID)
## Data: dta3
##
## AIC BIC logLik deviance df.resid
## 24094 24125 -12042 24084 3611
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.7683 -1.0923 -0.3250 0.7538 15.4325
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 1.04 1.02
## Number of obs: 3616, groups: ID, 818
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.55567 0.05691 27.337 < 2e-16 ***
## GenderWomen -0.34717 0.07502 -4.628 3.70e-06 ***
## YearC -0.01429 0.01338 -1.068 0.285
## GenderWomen:YearC -0.12146 0.01910 -6.358 2.05e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) GndrWm YearC
## GenderWomen -0.755
## YearC 0.030 -0.023
## GndrWmn:YrC -0.021 0.035 -0.700
# add individual observation variability
summary(m1 <- glmer(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),
data = dta3, family = poisson))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## Data: dta3
##
## AIC BIC logLik deviance df.resid
## 19448.0 19497.5 -9716.0 19432.0 3608
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.03681 -0.53730 -0.03497 0.24218 1.29823
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ID:Time (Intercept) 0.3760 0.6132
## ID (Intercept) 1.1197 1.0581
## YearC 0.3352 0.5790 0.64
## Number of obs: 3616, groups: ID:Time, 3616; ID, 818
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.32445 0.06265 21.140 < 2e-16 ***
## GenderWomen -0.39203 0.08215 -4.772 1.82e-06 ***
## YearC -0.26335 0.04620 -5.700 1.20e-08 ***
## GenderWomen:YearC -0.19135 0.06025 -3.176 0.00149 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) GndrWm YearC
## GenderWomen -0.753
## YearC 0.498 -0.371
## GndrWmn:YrC -0.373 0.497 -0.739
# add random slopes uncorrelated with intercepts
m2 <- glmer(RAPI ~ Gender*YearC + (YearC - 1 | ID) + (1 | ID) +
(1 | ID:Time),
data = dta3, family = poisson)
## compare nested models - max-likelihood
anova(m0, m1, m2)
## Data: dta3
## Models:
## m0: RAPI ~ Gender * YearC + (1 | ID)
## m2: RAPI ~ Gender * YearC + (YearC - 1 | ID) + (1 | ID) + (1 | ID:Time)
## m1: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## m0 5 24094 24125 -12042.0 24084
## m2 7 19563 19606 -9774.3 19549 4535.51 2 < 2.2e-16 ***
## m1 8 19448 19498 -9716.0 19432 116.54 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m1配適度最好(AIC最小)且顯著與m0不同
## Residuals over time
plot(m2, resid(.) ~ fitted(.) | as.factor(Month) + Gender, abline = c(h = 0),
type = c("p", "smooth"), pch = 20, cex = .6,
xlab = "Fitted values", ylab = "Pearson residuals")
## (Intercept) GenderWomen YearC GenderWomen:YearC
## 3.835 0.684 0.844 0.835
## Groups Name Std.Dev.
## ID.Time (Intercept) 0.61780
## ID (Intercept) 0.99997
## ID.1 YearC 0.51389
# mean linear related to variance
summary(mnb1 <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),
data = dta3, family = nbinom1))
## Family: nbinom1 ( log )
## Formula: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## Data: dta3
##
## AIC BIC logLik deviance df.resid
## 19416.0 19471.8 -9699.0 19398.0 3607
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## ID (Intercept) 0.9538 0.9766
## YearC 0.2497 0.4997 0.64
## ID:Time (Intercept) 0.1922 0.4384
## Number of obs: 3616, groups: ID, 818; ID:Time, 3616
##
## Overdispersion parameter for nbinom1 family (): 1.06
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.44252 0.06147 23.468 < 2e-16 ***
## GenderWomen -0.37184 0.07719 -4.817 1.45e-06 ***
## YearC -0.24691 0.04335 -5.695 1.23e-08 ***
## GenderWomen:YearC -0.17239 0.05613 -3.071 0.00213 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## standardGeneric for "summary" defined from package "base"
##
## function (object, ...)
## standardGeneric("summary")
## <environment: 0x00000000130cb9d0>
## Methods may be defined for arguments: object
## Use showMethods("summary") for currently available ones.
# Include zero-inflation term
summary(mznb2 <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),
zi = ~ Gender + YearC,
data = dta3, family = nbinom2))
## Family: nbinom2 ( log )
## Formula: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## Zero inflation: ~Gender + YearC
## Data: dta3
##
## AIC BIC logLik deviance df.resid
## 19297.2 19371.5 -9636.6 19273.2 3604
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## ID (Intercept) 1.094e+00 1.046e+00
## YearC 2.976e-01 5.456e-01 0.64
## ID:Time (Intercept) 9.484e-09 9.739e-05
## Number of obs: 3616, groups: ID, 818; ID:Time, 3616
##
## Overdispersion parameter for nbinom2 family (): 3.87
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.55715 0.06323 24.625 < 2e-16 ***
## GenderWomen -0.38236 0.08250 -4.635 3.57e-06 ***
## YearC -0.22259 0.04658 -4.779 1.76e-06 ***
## GenderWomen:YearC -0.19707 0.05869 -3.358 0.000786 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Zero-inflation model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.0617 0.2430 -12.598 <2e-16 ***
## GenderWomen 0.2126 0.2901 0.733 0.4638
## YearC 0.4406 0.1936 2.275 0.0229 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# add dispersion over time
summary(mznb2a <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID), data = dta3,
zi = ~ YearC,
dispformula = ~ YearC,
family = nbinom2))
## Family: nbinom2 ( log )
## Formula: RAPI ~ Gender * YearC + (YearC | ID)
## Zero inflation: ~YearC
## Dispersion: ~YearC
## Data: dta3
##
## AIC BIC logLik deviance df.resid
## 19291.0 19359.1 -9634.5 19269.0 3605
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## ID (Intercept) 1.0920 1.0450
## YearC 0.2881 0.5367 0.66
## Number of obs: 3616, groups: ID, 818
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.56590 0.06304 24.842 < 2e-16 ***
## GenderWomen -0.39309 0.08133 -4.833 1.34e-06 ***
## YearC -0.22937 0.04761 -4.817 1.45e-06 ***
## GenderWomen:YearC -0.19880 0.05828 -3.411 0.000647 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Zero-inflation model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.9281 0.1601 -18.289 < 2e-16 ***
## YearC 0.5540 0.2024 2.738 0.00619 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Dispersion model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.39174 0.07603 18.304 <2e-16 ***
## YearC 0.24835 0.11268 2.204 0.0275 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# model comparisions - across different models
# need package bbmle
AICtab(mnb1, mnb2, mznb2, mznb2a, base = T, logLik = T)
## logLik AIC dLogLik dAIC df
## mznb2a -9634.5 19291.0 64.5 0.0 11
## mznb2 -9636.6 19297.2 62.4 6.2 12
## mnb2 -9686.8 19391.5 12.2 100.6 9
## mnb1 -9699.0 19416.0 0.0 125.0 9
# fortify data with residuals and fitted values
dta3$rs <- resid(mznb2a)
dta3$yhat <- predict(mznb2a)
## place observed and fitted CIs side-by-side
ggplot(dta3, aes(Month, RAPI, col = Gender)) +
stat_summary(fun.data = "mean_cl_boot") +
stat_summary(aes(y = yhat), fun.data = "mean_cl_boot", pch = 20,
linetype = "dashed", position = position_dodge(width = .9)) +
labs(y = "Mean Rutgers Alcohol Problem Index", x = "Time (in months)") +
theme(legend.position = c(.1, .1))
# residual plot
ggplot(dta3, aes(x = yhat, y = scale(rs))) +
geom_point(col = "steelblue", alpha = I(0.2)) +
geom_hline(yintercept = 0, linetype = 2) +
facet_grid(Gender ~ Month) +
labs(x = "Fitted values", y = "Standardized residuals")