library(tidyr)
# library(devtools)
# devtools::install_github("hlennon/LCTMtools")
library(LCTMtools)
library(lcmm)
library(ggplot2)
library(DT)
# LCTMtools depends on an older version of dplyr and may emit error messages when used with the updated package
df <- read.csv(file="for_trajectory_analysis.csv", header = T)
df$AnonymousID <- as.factor(df$AnonymousID)
df$altid <- as.numeric(df$AnonymousID)
df2 <- subset(df, select=c(altid, id, courseid))
load(file = "LCTM_output.RData")
Standardize scores and flag participants with more than one standardized score beyond +/-3. Currently no outliers are removed.
data_wide <- spread(df2, courseid, id)
out_data <- cbind.data.frame(data_wide$altid, sapply(data_wide[2:9], scale, center = TRUE, scale = TRUE))
out_data$outs <- rowSums(out_data[2:9] < -3, na.rm = T) # counts only scores below -3; a two-sided variant is sketched below
table(out_data$outs)
##
## 0 1 2
## 278 4 1
data_wide$outs <- out_data$outs
# data_wide <- subset(data_wide, outs == 0, select=-c(outs))
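The check above flags only scores below -3. If scores in both tails should be flagged, as the description implies, the count can be based on the absolute value instead; a sketch (outs_two_sided is a hypothetical column name, not used elsewhere):
out_data$outs_two_sided <- rowSums(abs(out_data[2:9]) > 3, na.rm = TRUE) # flags scores beyond +/-3 in either direction
table(out_data$outs_two_sided)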
data_long <- gather(data_wide, courseid, id, '1 40A Pre':'8 41 Post', factor_key = T)
ggplot(data_long, aes(x=courseid, y=id, group=altid)) +
geom_line() + geom_point()
boxplot(data_long$id ~ data_long$courseid, xlab="Course", ylab="Identity")
Make a numeric version of the course ID.
data_long$courseidnum <- as.numeric(data_long$courseid)
The residual profile can be approximated by a flat line, so a random intercept, slope, or quadratic term should be considered in place of a fixed-only intercept, slope, or quadratic term.
Loaded from the RData file; the estimation code below is commented out.
# model1 <- hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~-1,
# subject = "altid",
# ng = 1,
# nwg = FALSE,
# data = data.frame(data_long))
#
rp <- residualplot_step1(model1,
nameofoutcome="id", nameofage = "courseid",
data = data_long,
ylimit=c(-3,3))
## "p1"
Test the models with different numbers of classes.
Initial values are specified using gridsearch: hlme is run for up to 30 iterations from each of 100 random initial-value vectors, and the estimation procedure is finalized from the initial values that produced the best log-likelihood. (A loop-based alternative to the repeated calls below is sketched after them.)
Loaded from the RData file; the estimation code below is commented out.
# set.seed(100)
# model1_v1 <- hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# ng = 1,
# nwg = FALSE,
# data = data.frame(data_long))
#
# model2_grid <- gridsearch(hlme(fixed = id ~ 1 + courseidnum + I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(data_long),
# ng = 2,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model3_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(data_long),
# ng = 3,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model4_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(data_long),
# ng = 4,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model5_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(data_long),
# ng = 5,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model6_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(data_long),
# ng = 6,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model7_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(data_long),
# ng = 7,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
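The six gridsearch calls above differ only in ng, so they could equivalently be generated in a loop. A sketch, commented out like the calls above since the fitted objects are loaded from the RData file (it assumes data_long and model1_v1 as defined above, run at top level so gridsearch can resolve the loop variable g):
# set.seed(100)
# grid_models <- list()  # collects model2_grid ... model7_grid
# for (g in 2:7) {
#   grid_models[[paste0("model", g, "_grid")]] <- gridsearch(
#     hlme(fixed = id ~ 1 + courseidnum + I(courseidnum^2),
#          random = ~ 1 + courseidnum,
#          subject = "altid",
#          data = data.frame(data_long),
#          ng = g,
#          mixture = ~ 1 + courseidnum + I(courseidnum^2)),
#     rep = 100, maxiter = 30, minit = model1_v1)
# }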
Once all models have been tested, remove all non-model objects (e.g., anything not named model# or model#_grid) and save the workspace as an RData file. This avoids having to re-run the models and, since the estimation involves an element of randomness, preserves the exact output.
# rm(df, df2, data_long, data_wide, rp, out_data)
# save.image(file = "LCTM_output.RData")
Choose the best model by comparing the fit indices and other output.
“The IC statistics take both model fit and complexity into consideration. Lower values indicate better trade-off between model fit and complexity. However, differences in the penalty functions (e.g., penalizing model complexity) of different model selection indices result in inconsistent class solutions (i.e., different indices may favor different class solutions). Previous research on mixture model estimation also suggests sample size plays a role in penalty calculations. First, as N approaches infinity, penalty / N should become closer to zero. Second, as N approaches infinity, log(N) / penalty should become closer to zero… Note that the CAIC, BIC, and SABIC fulfill these two conditions. AIC aims at finding the model that minimizes the Kullback–Leibler (K-L) criterion, selecting an approximate model, and providing better predictions of the population parameters. On the contrary, BIC targets the “true” underlying model with the highest posterior probability.”
Chen, Q., Luo, W., Palardy, G. J., Glaman, R., & McEnturff, A. (2017). The efficacy of common fit indices for enumerating classes in growth mixture models when nested data structure is ignored: A Monte Carlo study. Sage Open, 7(1), 2158244017700459.
Two- and three-trajectory models have the best fit.
tab <- summarytable(
model1, model2_grid, model3_grid, model4_grid, model5_grid,
model6_grid, model7_grid,
which = c("G", "loglik", "conv", "npm", "AIC", "BIC", "SABIC",
"entropy", "%class"))
## G loglik conv npm AIC BIC SABIC entropy
## model1 1 -972.3521 1 4 1952.704 1967.286 1954.602 1.0000000
## model2_grid 2 -738.4755 1 11 1498.951 1539.051 1504.170 0.6499120
## model3_grid 3 -734.1364 1 15 1498.273 1552.954 1505.389 0.5651347
## model4_grid 4 -734.1364 1 19 1506.273 1575.536 1515.287 0.3495910
## model5_grid 5 -734.1364 1 23 1514.273 1598.118 1525.185 0.2846609
## model6_grid 6 -723.7845 1 27 1501.569 1599.996 1514.379 0.4546632
## model7_grid 7 -723.7845 1 31 1509.569 1622.578 1524.277 0.4248759
## %class1 %class2 %class3 %class4 %class5 %class6 %class7
## model1 100.000000
## model2_grid 8.127208 91.87279
## model3_grid 64.664311 27.20848 8.127208
## model4_grid 8.833922 0.00000 58.657244 32.50883
## model5_grid 9.540636 0.00000 35.689046 54.77032 0.00000
## model6_grid 4.593640 0.00000 8.833922 0.00000 44.16961 42.40283
## model7_grid 0.000000 44.16961 40.282686 4.59364 10.95406 0.00000 0
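The best model by a given criterion can also be pulled from the table programmatically; a minimal sketch, assuming tab is the matrix returned by summarytable above:
# models with the lowest AIC and BIC, respectively
rownames(tab)[which.min(tab[, "AIC"])]
rownames(tab)[which.min(tab[, "BIC"])]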
brks_a <- quantile(tab[,5], probs = seq(.05, .95, .05), na.rm = TRUE)
clrs_a <- round(seq(240, 40, length.out = length(brks_a) + 1), 0) %>%
{paste0("rgb(255,", ., ",", ., ")")}
brks_b <- quantile(tab[,6], probs = seq(.05, .95, .05), na.rm = TRUE)
clrs_b <- round(seq(240, 40, length.out = length(brks_b) + 1), 0) %>%
{paste0("rgb(255,", ., ",", ., ")")}
brks_s <- quantile(tab[,7], probs = seq(.05, .95, .05), na.rm = TRUE)
clrs_s <- round(seq(240, 40, length.out = length(brks_s) + 1), 0) %>%
{paste0("rgb(255,", ., ",", ., ")")}
# lighter shade = lower value = better model
datatable(tab) %>%
formatRound(columns = c(1:7), digits = 0) %>%
formatRound(columns = c(8:ncol(tab)), digits = 2) %>%
formatStyle("AIC", backgroundColor = styleInterval(brks_a, clrs_a)) %>%
formatStyle("BIC", backgroundColor = styleInterval(brks_b, clrs_b))%>%
formatStyle("SABIC", backgroundColor = styleInterval(brks_s, clrs_s))
“The posterior classification can be used to assess the goodness-of-fit of the model (for the selection of the number of latent classes for instance) and the discrimination of the latent classes. Many indicators can be derived from it (Proust-Lima et al. 2014). The package lcmm provides two indicators in the function postprob:
The proportion of subjects classified in each latent class with a posterior probability above 0.7, 0.8 and 0.9. This indicates the proportion of subjects not ambiguously classified in each latent class.
The posterior classification table as defined in Table 3 which computes the mean of the posterior probabilities of belonging to the latent class among the subjects classified a posteriori in each latent class. A perfect classification would provide ones in the diagonal and zeros elsewhere. In practice, high diagonal terms indicate a good discrimination of the population.”
Proust-Lima, C., Philipps, V., & Liquet, B. (2015). Estimation of extended mixed models using latent classes and latent processes: the R package lcmm. arXiv preprint arXiv:1503.00890.
postprob(model2_grid)
##
## Posterior classification:
## class1 class2
## N 23.00 260.00
## % 8.13 91.87
##
## Posterior classification table:
## --> mean of posterior probabilities in each class
## prob1 prob2
## class1 0.7713 0.2287
## class2 0.0817 0.9183
##
## Posterior probabilities above a threshold (%):
## class1 class2
## prob>0.7 56.52 96.15
## prob>0.8 39.13 88.46
## prob>0.9 34.78 68.08
##
postprob(model3_grid)
##
## Posterior classification:
## class1 class2 class3
## N 183.00 77.00 23.00
## % 64.66 27.21 8.13
##
## Posterior classification table:
## --> mean of posterior probabilities in each class
## prob1 prob2 prob3
## class1 0.8562 0.0874 0.0564
## class2 0.1831 0.7139 0.1029
## class3 0.1197 0.0969 0.7834
##
## Posterior probabilities above a threshold (%):
## class1 class2 class3
## prob>0.7 84.15 54.55 65.22
## prob>0.8 73.22 33.77 43.48
## prob>0.9 53.01 16.88 30.43
##
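The posterior classification table can also be reproduced by hand from the pprob output; a sketch for the 2-class model, assuming the usual hlme column names (subject ID, class, then prob1, prob2):
pp <- as.data.frame(model2_grid$pprob)
# mean posterior probability of each class among subjects assigned to each class
aggregate(cbind(prob1, prob2) ~ class, data = pp, FUN = mean)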
residualplot_step1_v2 <- function(model, nameofoutcome, nameofage, data, ylimit = c(-50, 50)) {
  require(dplyr)
  k <- model$ng
  preds <- model$pred
  names(preds)[6] <- nameofoutcome # column 6 of model$pred holds the observed outcome ("obs")
  nameofid <- names(model$pred)[1]
  # attach posterior class assignments, then the original data
  test <- dplyr::left_join(preds, model$pprob, by = nameofid)
  test <- dplyr::left_join(test, data, by = c(nameofid, nameofoutcome))
  plotvalues <- NULL
  for (i in 1:k) {
    # residuals against the class-specific subject-level predictions (pred_ss1, pred_ss2, ...)
    newplotvalues <- test %>%
      filter(class == i) %>%
      mutate(Residuals = get(nameofoutcome) - eval(parse(text = paste0("pred_ss", i))))
    plotvalues <- rbind(plotvalues, newplotvalues)
    plotvaluessub <- plotvalues %>% filter(class == i)
    pname <- paste0("p", i)
    assign(pname, ggplot2::ggplot(data = plotvaluessub,
                                  aes(x = get(nameofage), y = Residuals, group = class)) +
      theme(axis.text = element_text(size = 16), text = element_text(size = 16)) +
      geom_point() +
      stat_summary(fun = mean, geom = "line", size = 3, col = "CadetBlue", group = 1) +
      ggtitle(paste("Residuals in class", i)) +
      ylim(ylimit))
    print(eval(parse(text = pname)))
    ggplot2::ggsave(filename = paste0("p", i, ".png"))
  }
  return(as.list(mget(paste0("p", 1:k))))
}
rp2 <- residualplot_step1_v2(model2_grid,
nameofoutcome="id", nameofage = "courseid",
data = data_long,
ylimit=c(-3,3))
rp3 <- residualplot_step1_v2(model3_grid,
nameofoutcome="id", nameofage = "courseid",
data = data_long,
ylimit=c(-3,3))
classes_2fac <- as.data.frame(model2_grid$pprob[,1:2])
df_2fac <- merge(df, classes_2fac, by = "altid")
table(classes_2fac$class)
##
## 1 2
## 23 260
df_2fac$classn[df_2fac$class == "1"] <- "Class 1, n = 23"
df_2fac$classn[df_2fac$class == "2"] <- "Class 2, n = 260"
df_2fac$classn <- as.factor(df_2fac$classn)
ggplot(df_2fac, aes(x=courseid, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment")
ggplot(df_2fac, aes(x=courseid, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))
classes_3fac <- as.data.frame(model3_grid$pprob[,1:2])
df_3fac <- merge(df, classes_3fac, by = "altid")
table(classes_3fac$class)
##
## 1 2 3
## 183 77 23
df_3fac$classn[df_3fac$class == "1"] <- "Class 1, n = 183"
df_3fac$classn[df_3fac$class == "2"] <- "Class 2, n = 77"
df_3fac$classn[df_3fac$class == "3"] <- "Class 3, n = 23"
df_3fac$classn <- as.factor(df_3fac$classn)
df_3fac$courseidnum <- as.numeric(as.factor(df_3fac$courseid))
ggplot(df_3fac, aes(x=courseidnum, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment")
ggplot(df_3fac, aes(x=courseidnum, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))