library(tidyr)
# library(devtools)
# devtools::install_github("hlennon/LCTMtools")
library(LCTMtools)
library(lcmm)
library(ggplot2)
library(DT)
# LCTMtools depends on an older version of dplyr and may emit error messages when used with the updated package
df <- read.csv(file="for_trajectory_analysis.csv", header = T)
df$AnonymousID <- as.factor(df$AnonymousID)
df$altid <- as.numeric(df$AnonymousID)
df2 <- subset(df, select=c(altid, id, courseid))
load(file = "LCTM_output.RData")
Standardize scores and flag participants with more than one standardized score beyond +/-3. Currently no outliers are removed.
data_wide <- spread(df2, courseid, id)
out_data <- cbind.data.frame(data_wide$altid, sapply(data_wide[2:9], scale, center = TRUE, scale = TRUE))
out_data$outs <- rowSums(out_data[2:9] < -3, na.rm = T) # counts only scores below -3; a two-sided variant is sketched below
table(out_data$outs)
##
## 0 1 2
## 278 4 1
data_wide$outs <- out_data$outs
# data_wide <- subset(data_wide, outs == 0, select=-c(outs))
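The check above flags only scores below -3. If scores in both tails should be flagged, as the description implies, the count can be based on the absolute value instead; a sketch (outs_two_sided is a hypothetical column name, not used elsewhere):
out_data$outs_two_sided <- rowSums(abs(out_data[2:9]) > 3, na.rm = TRUE) # flags scores beyond +/-3 in either direction
table(out_data$outs_two_sided)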
data_long <- gather(data_wide, courseid, id, '1 40A Pre':'8 41 Post', factor_key = T)
ggplot(data_long, aes(x=courseid, y=id, group=altid)) +
geom_line() + geom_point()
boxplot(data_long$id ~ data_long$courseid, xlab="Course", ylab="Identity")
Make a numeric version of the course ID.
data_long$courseidnum <- as.numeric(data_long$courseid)
The residual profile can be approximated by a flat line, so a random intercept, slope, or quadratic term should be considered in place of a fixed-only intercept, slope, or quadratic term.
Loaded from the RData file; the estimation code below is commented out.
# model1 <- hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~-1,
# subject = "altid",
# ng = 1,
# nwg = FALSE,
# data = data.frame(data_long))
#
rp <- residualplot_step1(model1,
nameofoutcome="id", nameofage = "courseid",
data = data_long,
ylimit=c(-3,3))
## "p1"
Test the models with different numbers of classes.
Initial values are specified using gridsearch: hlme is run for up to 30 iterations from each of 100 random initial-value vectors, and the estimation procedure is finalized from the initial values that produced the best log-likelihood. (A loop-based alternative to the repeated calls below is sketched after them.)
Loaded from the RData file; the estimation code below is commented out.
# set.seed(100)
# model1_v1 <- hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# ng = 1,
# nwg = FALSE,
# data = data.frame(data_long))
#
# model2_grid <- gridsearch(hlme(fixed = id ~ 1 + courseidnum + I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(data_long),
# ng = 2,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model3_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(data_long),
# ng = 3,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model4_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(data_long),
# ng = 4,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model5_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(data_long),
# ng = 5,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model6_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(data_long),
# ng = 6,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model7_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(data_long),
# ng = 7,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
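The six gridsearch calls above differ only in ng, so they could equivalently be generated in a loop. A sketch, commented out like the calls above since the fitted objects are loaded from the RData file (it assumes data_long and model1_v1 as defined above, run at top level so gridsearch can resolve the loop variable g):
# set.seed(100)
# grid_models <- list()  # collects model2_grid ... model7_grid
# for (g in 2:7) {
#   grid_models[[paste0("model", g, "_grid")]] <- gridsearch(
#     hlme(fixed = id ~ 1 + courseidnum + I(courseidnum^2),
#          random = ~ 1 + courseidnum,
#          subject = "altid",
#          data = data.frame(data_long),
#          ng = g,
#          mixture = ~ 1 + courseidnum + I(courseidnum^2)),
#     rep = 100, maxiter = 30, minit = model1_v1)
# }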
Once all models have been tested, remove all non-model objects (e.g., anything not named model# or model#_grid) and save the workspace as an RData file. This avoids having to re-run the models and, since the estimation involves an element of randomness, preserves the exact output.
# rm(df, df2, data_long, data_wide, rp, out_data)
# save.image(file = "LCTM_output.RData")
Choose the best model by comparing the fit indices and other output.
“The IC statistics take both model fit and complexity into consideration. Lower values indicate better trade-off between model fit and complexity. However, differences in the penalty functions (e.g., penalizing model complexity) of different model selection indices result in inconsistent class solutions (i.e., different indices may favor different class solutions). Previous research on mixture model estimation also suggests sample size plays a role in penalty calculations. First, as N approaches infinity, penalty / N should become closer to zero. Second, as N approaches infinity, log(N) / penalty should become closer to zero… Note that the CAIC, BIC, and SABIC fulfill these two conditions. AIC aims at finding the model that minimizes the Kullback–Leibler (K-L) criterion, selecting an approximate model, and providing better predictions of the population parameters. On the contrary, BIC targets the “true” underlying model with the highest posterior probability.”
Chen, Q., Luo, W., Palardy, G. J., Glaman, R., & McEnturff, A. (2017). The efficacy of common fit indices for enumerating classes in growth mixture models when nested data structure is ignored: A Monte Carlo study. Sage Open, 7(1), 2158244017700459.
Two- and three-trajectory models have the best fit.
tab <- summarytable(
model1, model2_grid, model3_grid, model4_grid, model5_grid,
model6_grid, model7_grid,
which = c("G", "loglik", "conv", "npm", "AIC", "BIC", "SABIC",
"entropy", "%class"))
## G loglik conv npm AIC BIC SABIC entropy
## model1 1 -972.3521 1 4 1952.704 1967.286 1954.602 1.0000000
## model2_grid 2 -738.4755 1 11 1498.951 1539.051 1504.170 0.6499120
## model3_grid 3 -734.1364 1 15 1498.273 1552.954 1505.389 0.5651347
## model4_grid 4 -734.1364 1 19 1506.273 1575.536 1515.287 0.3495910
## model5_grid 5 -734.1364 1 23 1514.273 1598.118 1525.185 0.2846609
## model6_grid 6 -723.7845 1 27 1501.569 1599.996 1514.379 0.4546632
## model7_grid 7 -723.7845 1 31 1509.569 1622.578 1524.277 0.4248759
## %class1 %class2 %class3 %class4 %class5 %class6 %class7
## model1 100.000000
## model2_grid 8.127208 91.87279
## model3_grid 64.664311 27.20848 8.127208
## model4_grid 8.833922 0.00000 58.657244 32.50883
## model5_grid 9.540636 0.00000 35.689046 54.77032 0.00000
## model6_grid 4.593640 0.00000 8.833922 0.00000 44.16961 42.40283
## model7_grid 0.000000 44.16961 40.282686 4.59364 10.95406 0.00000 0
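The best model by a given criterion can also be pulled from the table programmatically; a minimal sketch, assuming tab is the matrix returned by summarytable above:
# models with the lowest AIC and BIC, respectively
rownames(tab)[which.min(tab[, "AIC"])]
rownames(tab)[which.min(tab[, "BIC"])]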
brks_a <- quantile(tab[,5], probs = seq(.05, .95, .05), na.rm = TRUE)
clrs_a <- round(seq(240, 40, length.out = length(brks_a) + 1), 0) %>%
{paste0("rgb(255,", ., ",", ., ")")}
brks_b <- quantile(tab[,6], probs = seq(.05, .95, .05), na.rm = TRUE)
clrs_b <- round(seq(240, 40, length.out = length(brks_b) + 1), 0) %>%
{paste0("rgb(255,", ., ",", ., ")")}
brks_s <- quantile(tab[,7], probs = seq(.05, .95, .05), na.rm = TRUE)
clrs_s <- round(seq(240, 40, length.out = length(brks_s) + 1), 0) %>%
{paste0("rgb(255,", ., ",", ., ")")}
# lighter shade = lower value = better model
datatable(tab) %>%
formatRound(columns = c(1:7), digits = 0) %>%
formatRound(columns = c(8:ncol(tab)), digits = 2) %>%
formatStyle("AIC", backgroundColor = styleInterval(brks_a, clrs_a)) %>%
formatStyle("BIC", backgroundColor = styleInterval(brks_b, clrs_b))%>%
formatStyle("SABIC", backgroundColor = styleInterval(brks_s, clrs_s))
“The posterior classification can be used to assess the goodness-of-fit of the model (for the selection of the number of latent classes for instance) and the discrimination of the latent classes. Many indicators can be derived from it (Proust-Lima et al. 2014). The package lcmm provides two indicators in the function postprob:
The proportion of subjects classified in each latent class with a posterior probability above 0.7, 0.8 and 0.9. This indicates the proportion of subjects not ambiguously classified in each latent class.
The posterior classification table as defined in Table 3 which computes the mean of the posterior probabilities of belonging to the latent class among the subjects classified a posteriori in each latent class. A perfect classification would provide ones in the diagonal and zeros elsewhere. In practice, high diagonal terms indicate a good discrimination of the population.”
Proust-Lima, C., Philipps, V., & Liquet, B. (2015). Estimation of extended mixed models using latent classes and latent processes: the R package lcmm. arXiv preprint arXiv:1503.00890.
postprob(model2_grid)
##
## Posterior classification:
## class1 class2
## N 23.00 260.00
## % 8.13 91.87
##
## Posterior classification table:
## --> mean of posterior probabilities in each class
## prob1 prob2
## class1 0.7713 0.2287
## class2 0.0817 0.9183
##
## Posterior probabilities above a threshold (%):
## class1 class2
## prob>0.7 56.52 96.15
## prob>0.8 39.13 88.46
## prob>0.9 34.78 68.08
##
postprob(model3_grid)
##
## Posterior classification:
## class1 class2 class3
## N 183.00 77.00 23.00
## % 64.66 27.21 8.13
##
## Posterior classification table:
## --> mean of posterior probabilities in each class
## prob1 prob2 prob3
## class1 0.8562 0.0874 0.0564
## class2 0.1831 0.7139 0.1029
## class3 0.1197 0.0969 0.7834
##
## Posterior probabilities above a threshold (%):
## class1 class2 class3
## prob>0.7 84.15 54.55 65.22
## prob>0.8 73.22 33.77 43.48
## prob>0.9 53.01 16.88 30.43
##
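The posterior classification table can also be reproduced by hand from the pprob output; a sketch for the 2-class model, assuming the usual hlme column names (subject ID, class, then prob1, prob2):
pp <- as.data.frame(model2_grid$pprob)
# mean posterior probability of each class among subjects assigned to each class
aggregate(cbind(prob1, prob2) ~ class, data = pp, FUN = mean)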
residualplot_step1_v2 <- function(model, nameofoutcome, nameofage, data, ylimit = c(-50, 50)) {
  require(dplyr)
  k <- model$ng
  preds <- model$pred
  names(preds)[6] <- nameofoutcome # column 6 of model$pred holds the observed outcome ("obs")
  nameofid <- names(model$pred)[1]
  # attach posterior class assignments, then the original data
  test <- dplyr::left_join(preds, model$pprob, by = nameofid)
  test <- dplyr::left_join(test, data, by = c(nameofid, nameofoutcome))
  plotvalues <- NULL
  for (i in 1:k) {
    # residuals against the class-specific subject-level predictions (pred_ss1, pred_ss2, ...)
    newplotvalues <- test %>%
      filter(class == i) %>%
      mutate(Residuals = get(nameofoutcome) - eval(parse(text = paste0("pred_ss", i))))
    plotvalues <- rbind(plotvalues, newplotvalues)
    plotvaluessub <- plotvalues %>% filter(class == i)
    pname <- paste0("p", i)
    assign(pname, ggplot2::ggplot(data = plotvaluessub,
                                  aes(x = get(nameofage), y = Residuals, group = class)) +
      theme(axis.text = element_text(size = 16), text = element_text(size = 16)) +
      geom_point() +
      stat_summary(fun = mean, geom = "line", size = 3, col = "CadetBlue", group = 1) +
      ggtitle(paste("Residuals in class", i)) +
      ylim(ylimit))
    print(eval(parse(text = pname)))
    ggplot2::ggsave(filename = paste0("p", i, ".png"))
  }
  return(as.list(mget(paste0("p", 1:k))))
}
rp2 <- residualplot_step1_v2(model2_grid,
nameofoutcome="id", nameofage = "courseid",
data = data_long,
ylimit=c(-3,3))
rp3 <- residualplot_step1_v2(model3_grid,
nameofoutcome="id", nameofage = "courseid",
data = data_long,
ylimit=c(-3,3))
classes_2fac <- as.data.frame(model2_grid$pprob[,1:2])
df_2fac <- merge(df, classes_2fac, by = "altid")
table(classes_2fac$class)
##
## 1 2
## 23 260
df_2fac$classn[df_2fac$class == "1"] <- "Class 1, n = 23"
df_2fac$classn[df_2fac$class == "2"] <- "Class 2, n = 260"
df_2fac$classn <- as.factor(df_2fac$classn)
ggplot(df_2fac, aes(x=courseid, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment")
ggplot(df_2fac, aes(x=courseid, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))
classes_3fac <- as.data.frame(model3_grid$pprob[,1:2])
df_3fac <- merge(df, classes_3fac, by = "altid")
table(classes_3fac$class)
##
## 1 2 3
## 183 77 23
df_3fac$classn[df_3fac$class == "1"] <- "Class 1, n = 183"
df_3fac$classn[df_3fac$class == "2"] <- "Class 2, n = 77"
df_3fac$classn[df_3fac$class == "3"] <- "Class 3, n = 23"
df_3fac$classn <- as.factor(df_3fac$classn)
df_3fac$courseidnum <- as.numeric(as.factor(df_3fac$courseid))
ggplot(df_3fac, aes(x=courseidnum, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment")
ggplot(df_3fac, aes(x=courseidnum, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))