1 data input

# load or install packages to be used
pacman::p_load(tidyverse, lme4, glmmTMB, bbmle)

dta3 <- read.csv("rapi.csv")

names(dta3) <- c("ID","Time","Gender", "Month", "RAPI")

str(dta3)
## 'data.frame':    3616 obs. of  5 variables:
##  $ ID    : chr  "S1001" "S1001" "S1001" "S1002" ...
##  $ Time  : int  1 2 3 1 2 3 4 5 1 2 ...
##  $ Gender: chr  "Men" "Men" "Men" "Women" ...
##  $ Month : int  0 6 18 0 6 12 18 24 0 12 ...
##  $ RAPI  : int  0 0 0 3 6 5 4 5 9 1 ...
head(dta3)
##      ID Time Gender Month RAPI
## 1 S1001    1    Men     0    0
## 2 S1001    2    Men     6    0
## 3 S1001    3    Men    18    0
## 4 S1002    1  Women     0    3
## 5 S1002    2  Women     6    6
## 6 S1002    3  Women    12    5

2 data management

# covert to factor type and add variables for later use
dta3$YearC <- (dta3$Month - mean(unique(dta3$Month))) / 12

# numerical summaries
dim(dta3)
## [1] 3616    6
length(unique(dta3$ID))
## [1] 818
table(table(dta3$ID))
## 
##   1   2   3   4   5 
##  25  38  66 128 561

3 summarize RAPI

#
dta3 %>%
 group_by(Gender, Time) %>%
 summarize( mean(RAPI), var(RAPI), sum(RAPI < 1)/n()) %>%
 as.data.frame
## `summarise()` regrouping output by 'Gender' (override with `.groups` argument)
##    Gender Time mean(RAPI) var(RAPI) sum(RAPI < 1)/n()
## 1     Men    1   7.700288  73.80587        0.11815562
## 2     Men    2   8.563253 123.72710        0.14156627
## 3     Men    3   7.632588 125.04727        0.19488818
## 4     Men    4   7.861210 154.37710        0.22419929
## 5     Men    5   7.464789 127.50465        0.26291080
## 6   Women    1   6.335456  49.15957        0.09766454
## 7   Women    2   5.624729  60.65669        0.20173536
## 8   Women    3   4.490950  39.14844        0.25791855
## 9   Women    4   4.936275  63.54875        0.28186275
## 10  Women    5   4.422414  64.45218        0.34482759
# set plot theme
ot <- theme_set(theme_bw())

4 plot RAPI and month by gender (overall)

#..density..
ggplot(dta3, aes(RAPI, ..density..)) +
 stat_bin(binwidth = 0.9) +
 facet_grid(Gender ~ as.factor(Month)) +
 labs(x = "Rutgers Alcohol Problem Index")

#glm
ggplot(dta3, aes(Month, RAPI, col = Gender)) +
 stat_summary(fun.data = "mean_se") +
 stat_smooth(method = "glm", method.args = list(family = poisson)) +
 labs(y = "Mean Rutgers Alcohol Problem Index", x = "Time (in months)") +
 theme(legend.position = c(.15, .15))
## `geom_smooth()` using formula 'y ~ x'

5 individual plots

# Randomly select about 1/3 individuals
# set random seed for reproduction
set.seed(11092016)

# one-third of sample
dta3$ID <-factor(dta3$ID)
n3 <- sample(dta3$ID, round(length(levels(dta3$ID))*1/3))

# 
ggplot(dta3[(dta3$ID %in% n3), ], aes(Month, RAPI, group = ID)) +
  geom_line(alpha = .3) +
  geom_point(alpha = .3) +
  stat_smooth(aes(group = 1), method = "glm",
              method.args = list(family = poisson))+
  facet_wrap( ~ Gender) +
  labs(x = "Time (in months)", y = "Rutgers Alcohol Problem Index") +
  theme(legend.position = "NONE")
## `geom_smooth()` using formula 'y ~ x'

alcohol problemm index 個體間的差異很大

6 model

# Model with  interaction and random cluster by ID
summary(m0 <- glmer(RAPI ~ Gender*YearC + (1 | ID), data = dta3,
                           family = poisson))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: RAPI ~ Gender * YearC + (1 | ID)
##    Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##    24094    24125   -12042    24084     3611 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.7683 -1.0923 -0.3250  0.7538 15.4325 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  ID     (Intercept) 1.04     1.02    
## Number of obs: 3616, groups:  ID, 818
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.55567    0.05691  27.337  < 2e-16 ***
## GenderWomen       -0.34717    0.07502  -4.628 3.70e-06 ***
## YearC             -0.01429    0.01338  -1.068    0.285    
## GenderWomen:YearC -0.12146    0.01910  -6.358 2.05e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) GndrWm YearC 
## GenderWomen -0.755              
## YearC        0.030 -0.023       
## GndrWmn:YrC -0.021  0.035 -0.700
# add individual observation variability
summary(m1 <- glmer(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),
                    data = dta3, family = poisson))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
##    Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19448.0  19497.5  -9716.0  19432.0     3608 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.03681 -0.53730 -0.03497  0.24218  1.29823 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev. Corr
##  ID:Time (Intercept) 0.3760   0.6132       
##  ID      (Intercept) 1.1197   1.0581       
##          YearC       0.3352   0.5790   0.64
## Number of obs: 3616, groups:  ID:Time, 3616; ID, 818
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.32445    0.06265  21.140  < 2e-16 ***
## GenderWomen       -0.39203    0.08215  -4.772 1.82e-06 ***
## YearC             -0.26335    0.04620  -5.700 1.20e-08 ***
## GenderWomen:YearC -0.19135    0.06025  -3.176  0.00149 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) GndrWm YearC 
## GenderWomen -0.753              
## YearC        0.498 -0.371       
## GndrWmn:YrC -0.373  0.497 -0.739
# add random slopes uncorrelated with intercepts
m2 <- glmer(RAPI ~ Gender*YearC + (YearC - 1 | ID) + (1 | ID) + 
               (1 | ID:Time),
               data = dta3, family = poisson)

## compare nested models - max-likelihood
anova(m0, m1, m2)
## Data: dta3
## Models:
## m0: RAPI ~ Gender * YearC + (1 | ID)
## m2: RAPI ~ Gender * YearC + (YearC - 1 | ID) + (1 | ID) + (1 | ID:Time)
## m1: RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
##    npar   AIC   BIC   logLik deviance   Chisq Df Pr(>Chisq)    
## m0    5 24094 24125 -12042.0    24084                          
## m2    7 19563 19606  -9774.3    19549 4535.51  2  < 2.2e-16 ***
## m1    8 19448 19498  -9716.0    19432  116.54  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

m1配適度最好(AIC最小)且顯著與m0不同

7 residual exam

# residual plot
plot(m2, pch = 20, cex = .6, xlab = "Fitted values")

## Residuals over time
plot(m2, resid(.) ~ fitted(.) | as.factor(Month) + Gender, abline = c(h = 0),
     type = c("p", "smooth"), pch = 20, cex = .6,
     xlab = "Fitted values", ylab = "Pearson residuals")

# normality check
lattice::qqmath(m2, type = c("p", "g"))

# unit-specific rate ratios
print(exp(fixef(m2)), 3)
##       (Intercept)       GenderWomen             YearC GenderWomen:YearC 
##             3.835             0.684             0.844             0.835
#
VarCorr(m2)
##  Groups  Name        Std.Dev.
##  ID.Time (Intercept) 0.61780 
##  ID      (Intercept) 0.99997 
##  ID.1    YearC       0.51389

8 negative binomial

# mean linear related to variance
summary(mnb1 <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),
                          data = dta3, family = nbinom1))
##  Family: nbinom1  ( log )
## Formula:          RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19416.0  19471.8  -9699.0  19398.0     3607 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance Std.Dev. Corr 
##  ID      (Intercept) 0.9538   0.9766        
##          YearC       0.2497   0.4997   0.64 
##  ID:Time (Intercept) 0.1922   0.4384        
## Number of obs: 3616, groups:  ID, 818; ID:Time, 3616
## 
## Overdispersion parameter for nbinom1 family (): 1.06 
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.44252    0.06147  23.468  < 2e-16 ***
## GenderWomen       -0.37184    0.07719  -4.817 1.45e-06 ***
## YearC             -0.24691    0.04335  -5.695 1.23e-08 ***
## GenderWomen:YearC -0.17239    0.05613  -3.071  0.00213 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# variance to mean with a quadratic trend
mnb2 <- update(mnb1, . ~ ., family = nbinom2)

summary
## standardGeneric for "summary" defined from package "base"
## 
## function (object, ...) 
## standardGeneric("summary")
## <environment: 0x00000000130cb9d0>
## Methods may be defined for arguments: object
## Use  showMethods("summary")  for currently available ones.

9 zero-inflated negative binomial

# Include zero-inflation term
summary(mznb2 <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID) + (1 | ID:Time),
                          zi = ~ Gender + YearC,
                          data = dta3, family = nbinom2))
##  Family: nbinom2  ( log )
## Formula:          RAPI ~ Gender * YearC + (YearC | ID) + (1 | ID:Time)
## Zero inflation:        ~Gender + YearC
## Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19297.2  19371.5  -9636.6  19273.2     3604 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance  Std.Dev.  Corr 
##  ID      (Intercept) 1.094e+00 1.046e+00      
##          YearC       2.976e-01 5.456e-01 0.64 
##  ID:Time (Intercept) 9.484e-09 9.739e-05      
## Number of obs: 3616, groups:  ID, 818; ID:Time, 3616
## 
## Overdispersion parameter for nbinom2 family (): 3.87 
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.55715    0.06323  24.625  < 2e-16 ***
## GenderWomen       -0.38236    0.08250  -4.635 3.57e-06 ***
## YearC             -0.22259    0.04658  -4.779 1.76e-06 ***
## GenderWomen:YearC -0.19707    0.05869  -3.358 0.000786 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Zero-inflation model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.0617     0.2430 -12.598   <2e-16 ***
## GenderWomen   0.2126     0.2901   0.733   0.4638    
## YearC         0.4406     0.1936   2.275   0.0229 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# add dispersion over time
summary(mznb2a <- glmmTMB(RAPI ~ Gender*YearC + (YearC | ID), data = dta3,
                           zi = ~ YearC,
                           dispformula = ~ YearC,
                           family = nbinom2))
##  Family: nbinom2  ( log )
## Formula:          RAPI ~ Gender * YearC + (YearC | ID)
## Zero inflation:        ~YearC
## Dispersion:            ~YearC
## Data: dta3
## 
##      AIC      BIC   logLik deviance df.resid 
##  19291.0  19359.1  -9634.5  19269.0     3605 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev. Corr 
##  ID     (Intercept) 1.0920   1.0450        
##         YearC       0.2881   0.5367   0.66 
## Number of obs: 3616, groups:  ID, 818
## 
## Conditional model:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.56590    0.06304  24.842  < 2e-16 ***
## GenderWomen       -0.39309    0.08133  -4.833 1.34e-06 ***
## YearC             -0.22937    0.04761  -4.817 1.45e-06 ***
## GenderWomen:YearC -0.19880    0.05828  -3.411 0.000647 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Zero-inflation model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.9281     0.1601 -18.289  < 2e-16 ***
## YearC         0.5540     0.2024   2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Dispersion model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.39174    0.07603  18.304   <2e-16 ***
## YearC        0.24835    0.11268   2.204   0.0275 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# model comparisions - across different models
# need package bbmle
AICtab(mnb1, mnb2, mznb2, mznb2a, base = T, logLik = T)
##        logLik  AIC     dLogLik dAIC    df
## mznb2a -9634.5 19291.0    64.5     0.0 11
## mznb2  -9636.6 19297.2    62.4     6.2 12
## mnb2   -9686.8 19391.5    12.2   100.6 9 
## mnb1   -9699.0 19416.0     0.0   125.0 9

10 diagnostics

# fortify data with residuals and fitted values
dta3$rs <- resid(mznb2a)
dta3$yhat <- predict(mznb2a)

## place observed and fitted CIs side-by-side
ggplot(dta3, aes(Month, RAPI, col = Gender)) +
 stat_summary(fun.data = "mean_cl_boot") +
 stat_summary(aes(y = yhat), fun.data = "mean_cl_boot", pch = 20, 
              linetype = "dashed", position = position_dodge(width = .9)) +
 labs(y = "Mean Rutgers Alcohol Problem Index", x = "Time (in months)") +
 theme(legend.position = c(.1, .1))

# residual plot
ggplot(dta3, aes(x = yhat, y = scale(rs))) +
  geom_point(col = "steelblue", alpha = I(0.2)) +
  geom_hline(yintercept = 0, linetype = 2) +
  facet_grid(Gender ~ Month) +
  labs(x = "Fitted values", y = "Standardized residuals")