library(tidyr)
library(tidyverse)
# library(devtools)
# devtools::install_github("hlennon/LCTMtools")
library(LCTMtools)
library(lcmm)
library(ggplot2)
library(DT)
# LCTMtools was built against an older version of dplyr and may throw error messages when used alongside the updated package
df <- read.csv(file="for_trajectory_analysis_updated 5-16.csv", header = T)
df$AnonymousID <- as.factor(df$AnonymousID)
df$altid <- as.numeric(df$AnonymousID)
df2 <- subset(df, select=c(altid, id, courseid))
# Define the relevant course IDs
courseid_40A = "2 40A Post"
courseid_40B = "3 40B Pre"
# Create the new courseid with averaged or existing scores
df2_new <- df2 %>%
group_by(altid) %>%
mutate(id = ifelse(courseid %in% c(courseid_40A, courseid_40B),
mean(id[courseid %in% c(courseid_40A, courseid_40B)], na.rm = TRUE), id),
courseid = ifelse(courseid %in% c(courseid_40A, courseid_40B), "2.5 40A40B", courseid)) %>%
ungroup() %>%
distinct() # Remove duplicates
# Define the relevant course IDs
courseid_40B = "4 40B Post"
courseid_40C = "5 40C Pre"
# Create the new courseid with averaged or existing scores
df2_new2 <- df2_new %>%
group_by(altid) %>%
mutate(id = ifelse(courseid %in% c(courseid_40B, courseid_40C),
mean(id[courseid %in% c(courseid_40B, courseid_40C)], na.rm = TRUE), id),
courseid = ifelse(courseid %in% c(courseid_40B, courseid_40C), "4.5 40B40C", courseid)) %>%
ungroup() %>%
distinct() # Remove duplicates
# Define the relevant course IDs
courseid_40B = "2.5 40A40B"
courseid_40C = "4.5 40B40C"
# Create the new courseid with averaged or existing scores
df2_new3 <- df2_new2 %>%
group_by(altid) %>%
mutate(id = ifelse(courseid %in% c(courseid_40B, courseid_40C),
mean(id[courseid %in% c(courseid_40B, courseid_40C)], na.rm = TRUE), id),
courseid = ifelse(courseid %in% c(courseid_40B, courseid_40C), "3 mid", courseid)) %>%
ungroup() %>%
distinct() # Remove duplicates
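The three merge steps above repeat the same pattern, so they could be collapsed into a helper function. A minimal sketch (merge_courses is our helper, not part of the original pipeline):
# Average scores across a set of course IDs within each participant and
# relabel them as a single combined assessment, mirroring the
# mutate/distinct pattern above.
merge_courses <- function(data, ids_to_merge, new_id) {
data %>%
group_by(altid) %>%
mutate(id = ifelse(courseid %in% ids_to_merge,
mean(id[courseid %in% ids_to_merge], na.rm = TRUE), id),
courseid = ifelse(courseid %in% ids_to_merge, new_id, courseid)) %>%
ungroup() %>%
distinct()
}
# Equivalent to df2_new3 above:
# df2 %>%
#   merge_courses(c("2 40A Post", "3 40B Pre"), "2.5 40A40B") %>%
#   merge_courses(c("4 40B Post", "5 40C Pre"), "4.5 40B40C") %>%
#   merge_courses(c("2.5 40A40B", "4.5 40B40C"), "3 mid")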
df3 <- df2_new3 %>%
group_by(altid, courseid) %>%
summarise(id = mean(id), .groups = 'drop')
df3 %>%
summarise(unique_altid_count = n_distinct(altid))
## # A tibble: 1 x 1
## unique_altid_count
## <int>
## 1 1323
load(file = "LCTM_outputv3.RData")
Code to standardize scores and remove participants with one or more standardized scores beyond ±3. Currently no outliers are removed.
df4 <- cbind.data.frame(df3$altid, df3$courseid, df3$id, scale(df3$id, center=T, scale=T))
colnames(df4)[1:3] <- colnames(df3)
colnames(df4)[4] <- "std_id"
data_wide <- subset(df4, select=-c(3)) %>%
pivot_wider(names_from = courseid, values_from = std_id, values_fn = mean)
data_wide2 <- na.omit(data_wide)
data_wide$outs <- rowSums(data_wide[2:4] < -3, na.rm = T) + rowSums(data_wide[2:4] > 3, na.rm = T)
table(data_wide$outs)
##
## 0 1
## 1314 9
# data_wide <- subset(data_wide, outs == 0, select=-c(outs))
df5 <- subset(df4, df4$altid %in% data_wide2$altid)
boxplot(df4$id ~ df4$courseid, xlab="Course", ylab="Identity")
boxplot(df5$id ~ df5$courseid, xlab="Course", ylab="Identity")
Make numeric version of course ID.
df5$courseid <- as.factor(df5$courseid)
df5$courseidnum <- as.numeric(df5$courseid)
The residual profile can be approximated by a flat line, so a random intercept, slope, or quadratic term should be considered instead of a fixed-only intercept, slope, or quadratic term.
Loading from RData file – code is currently commented out.
# model1 <- hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~-1,
# subject = "altid",
# ng = 1,
# nwg = FALSE,
# data = data.frame(df5))
rp <- residualplot_step1(model1,
nameofoutcome="id", nameofage = "courseid",
data = df5,
ylimit=c(-3,3))
## "p1"
Test the models with different numbers of classes.
Initial values are specified using gridsearch: hlme is run for a maximum of 30 iterations from each of 100 random initial-value vectors, and the estimation procedure is then finalized from the initial values that produced the best log-likelihood. (A loop-based version of the calls below is sketched after the commented chunks.)
Loading from RData file – code is currently commented out.
# set.seed(100)
# model1_v1 <- hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# ng = 1,
# nwg = FALSE,
# data = data.frame(df5))
#
# model2_grid <- gridsearch(hlme(fixed = id ~ 1 + courseidnum + I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(df5),
# ng = 2,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model3_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(df5),
# ng = 3,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model4_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(df5),
# ng = 4,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model5_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(df5),
# ng = 5,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model6_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(df5),
# ng = 6,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model7_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(df5),
# ng = 7,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model8_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(df5),
# ng = 8,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
#
# model9_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
# random = ~ 1 + courseidnum,
# subject = "altid",
# data = data.frame(df5),
# ng = 9,
# mixture = ~1+courseidnum+I(courseidnum^2)),
# rep=100,
# maxiter=30,
# minit=model1_v1)
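The eight gridsearch calls above differ only in ng, so they could be generated in a loop instead. A hedged sketch, kept commented like the originals (gridsearch relies on non-standard evaluation of its first argument, so the loop variable must be visible where the call is evaluated; verify against the explicit calls before relying on this):
# set.seed(100)
# for (g in 2:9) {
#   fit <- gridsearch(hlme(fixed = id ~ 1 + courseidnum + I(courseidnum^2),
#                          random = ~ 1 + courseidnum,
#                          subject = "altid",
#                          data = data.frame(df5),
#                          ng = g,
#                          mixture = ~ 1 + courseidnum + I(courseidnum^2)),
#                     rep = 100, maxiter = 30, minit = model1_v1)
#   assign(paste0("model", g, "_grid"), fit)   # stores model2_grid ... model9_grid
# }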
Once all models have been tested, remove all non-model output (e.g., anything not named model# or model#_grid) and save the workspace as an RData file. This avoids re-running the models and, since there is an element of randomness in the estimation, preserves the exact output.
# rm(df2, df3, rp, data_wide, out_data)
# save.image(file = "LCTM_outputv3.RData")
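Alternatively, only the model objects could be saved explicitly, which avoids listing everything to remove. A minimal sketch (assumes all fits are named model1, model1_v1, or model#_grid, as above):
# Save only objects whose names start with "model"
# save(list = grep("^model", ls(), value = TRUE), file = "LCTM_outputv3.RData")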
Choose the best model by comparing the fit indices and other output.
“The IC statistics take both model fit and complexity into consideration. Lower values indicate better trade-off between model fit and complexity. However, differences in the penalty functions (e.g., penalizing model complexity) of different model selection indices result in inconsistent class solutions (i.e., different indices may favor different class solutions). Previous research on mixture model estimation also suggests sample size plays a role in penalty calculations. First, as N approaches infinity, penalty / N should become closer to zero. Second, as N approaches infinity, log(N) / penalty should become closer to zero…Note that the CAIC, BIC, and SABIC fulfill these two conditions. AIC aims at finding the model that minimizes the Kullback–Leibler (K-L) criterion, selecting an approximate model, and providing better predictions of the population parameters. On the contrary, BIC targets the “true” underlying model with the highest posterior probability.”
Chen, Q., Luo, W., Palardy, G. J., Glaman, R., & McEnturff, A. (2017). The efficacy of common fit indices for enumerating classes in growth mixture models when nested data structure is ignored: A Monte Carlo study. Sage Open, 7(1), 2158244017700459.
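With N = 93 subjects retained after na.omit and p estimated parameters (the npm column), the indices in the table below follow the standard definitions AIC = -2ll + 2p, BIC = -2ll + p log(N), and SABIC = -2ll + p log((N + 2)/24). A minimal sketch to recompute them from a fitted hlme object (ic_check is our helper, not part of lcmm):
# Recompute AIC/BIC/SABIC from a fitted hlme object; N is the number of
# subjects and length(m$best) the number of estimated parameters.
ic_check <- function(m, N = 93) {
p  <- length(m$best)
ll <- m$loglik
c(AIC   = -2 * ll + 2 * p,
BIC   = -2 * ll + p * log(N),
SABIC = -2 * ll + p * log((N + 2) / 24))
}
# ic_check(model2_grid)   # should match the summarytable row below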
Two- and three-trajectory models have the lowest (best) BIC values.
tab <- summarytable(
model1, model2_grid, model3_grid, model4_grid,
model5_grid, model6_grid, model7_grid,
model8_grid, model9_grid,
which = c("G", "loglik", "conv", "npm", "AIC", "BIC", "SABIC",
"entropy", "%class"))
## G loglik conv npm AIC BIC SABIC entropy
## model1 1 -263.8827 1 4 535.7654 545.8958 533.2687 1.0000000
## model2_grid 2 -187.2404 1 11 396.4808 424.3394 389.6149 0.8665772
## model3_grid 3 -181.4809 1 15 392.9619 430.9509 383.5992 0.8964830
## model4_grid 4 -180.2929 1 19 398.5858 446.7052 386.7265 0.7932849
## model5_grid 5 -172.1333 1 23 390.2667 448.5165 375.9106 0.7941472
## model6_grid 6 -168.9788 1 27 391.9577 460.3379 375.1049 0.8272695
## model7_grid 7 -166.0604 1 31 394.1209 472.6315 374.7714 0.8465209
## model8_grid 8 -153.4963 1 35 376.9926 465.6336 355.1464 0.9154080
## model9_grid 9 -150.9043 1 39 379.8085 478.5799 355.4656 0.8994757
## %class1 %class2 %class3 %class4 %class5 %class6
## model1 100.000000
## model2_grid 90.322581 9.677419
## model3_grid 87.096774 3.225806 9.677419
## model4_grid 66.666667 20.430108 2.150538 10.752688
## model5_grid 1.075269 10.752688 2.150538 67.741935 18.279570
## model6_grid 27.956989 11.827957 39.784946 16.129032 2.150538 2.150538
## model7_grid 3.225806 33.333333 13.978495 2.150538 13.978495 16.129032
## model8_grid 11.827957 2.150538 13.978495 30.107527 18.279570 17.204301
## model9_grid 3.225806 26.881720 4.301075 17.204301 17.204301 3.225806
## %class7 %class8 %class9
## model1
## model2_grid
## model3_grid
## model4_grid
## model5_grid
## model6_grid
## model7_grid 17.204301
## model8_grid 3.225806 3.225806
## model9_grid 2.150538 13.978495 11.82796
brks_a <- quantile(tab[,5], probs = seq(.05, .95, .05), na.rm = TRUE)
clrs_a <- round(seq(240, 40, length.out = length(brks_a) + 1), 0) %>%
{paste0("rgb(255,", ., ",", ., ")")}
brks_b <- quantile(tab[,6], probs = seq(.05, .95, .05), na.rm = TRUE)
clrs_b <- round(seq(240, 40, length.out = length(brks_b) + 1), 0) %>%
{paste0("rgb(255,", ., ",", ., ")")}
brks_s <- quantile(tab[,7], probs = seq(.05, .95, .05), na.rm = TRUE)
clrs_s <- round(seq(240, 40, length.out = length(brks_s) + 1), 0) %>%
{paste0("rgb(255,", ., ",", ., ")")}
# lighter shade = lower value = better model
datatable(tab) %>%
formatRound(columns = c(1:7), digits = 0) %>%
formatRound(columns = c(8:ncol(tab)), digits = 2) %>%
formatStyle("AIC", backgroundColor = styleInterval(brks_a, clrs_a)) %>%
formatStyle("BIC", backgroundColor = styleInterval(brks_b, clrs_b))%>%
formatStyle("SABIC", backgroundColor = styleInterval(brks_s, clrs_s))
“The posterior classification can be used to assess the goodness-of-fit of the model (for the selection of the number of latent classes for instance) and the discrimination of the latent classes. Many indicators can be derived from it (Proust-Lima et al. 2014). The package lcmm provides two indicators in the function postprob:
The proportion of subjects classified in each latent class with a posterior probability above 0.7, 0.8 and 0.9. This indicates the proportion of subjects not ambiguously classified in each latent class.
The posterior classification table as defined in Table 3 which computes the mean of the posterior probabilities of belonging to the latent class among the subjects classified a posteriori in each latent class. A perfect classification would provide ones in the diagonal and zeros elsewhere. In practice, high diagonal terms indicate a good discrimination of the population.”
Proust-Lima, C., Philipps, V., & Liquet, B. (2015). Estimation of extended mixed models using latent classes and latent processes: the R package lcmm. arXiv preprint arXiv:1503.00890.
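Both indicators can also be recomputed directly from the pprob element of a fitted model. A minimal sketch of the threshold proportions (above_threshold is our helper, not part of lcmm):
# For each assigned class, the percentage of subjects whose posterior
# probability for that class exceeds a threshold (cf. postprob below).
above_threshold <- function(m, thr = 0.7) {
pp    <- m$pprob
probs <- as.matrix(pp[, -(1:2)])                        # prob1..probG
p_assigned <- probs[cbind(seq_len(nrow(probs)), pp$class)]
round(100 * tapply(p_assigned > thr, pp$class, mean), 2)
}
# above_threshold(model2_grid, 0.9)   # compare with postprob(model2_grid)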
postprob(model8_grid)
##
## Posterior classification:
## class1 class2 class3 class4 class5 class6 class7 class8
## N 11.00 2.00 13.00 28.00 17.00 16.0 3.00 3.00
## % 11.83 2.15 13.98 30.11 18.28 17.2 3.23 3.23
##
## Posterior classification table:
## --> mean of posterior probabilities in each class
## prob1 prob2 prob3 prob4 prob5 prob6 prob7 prob8
## class1 0.8901 0 0.0000 0.0930 0.0141 0.0028 0.0000 0.0000
## class2 0.0000 1 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## class3 0.0000 0 0.9769 0.0000 0.0228 0.0000 0.0003 0.0000
## class4 0.0194 0 0.0000 0.9525 0.0152 0.0106 0.0000 0.0023
## class5 0.0015 0 0.0183 0.0819 0.8981 0.0000 0.0002 0.0000
## class6 0.0287 0 0.0000 0.0207 0.0000 0.9507 0.0000 0.0000
## class7 0.0000 0 0.0342 0.0000 0.0177 0.0000 0.9480 0.0000
## class8 0.0000 0 0.0000 0.0016 0.0000 0.0000 0.0000 0.9984
##
## Posterior probabilities above a threshold (%):
## class1 class2 class3 class4 class5 class6 class7 class8
## prob>0.7 81.82 100 100 96.43 88.24 93.75 100.00 100
## prob>0.8 81.82 100 100 96.43 88.24 87.50 100.00 100
## prob>0.9 63.64 100 100 89.29 76.47 87.50 66.67 100
##
postprob(model5_grid)
##
## Posterior classification:
## class1 class2 class3 class4 class5
## N 1.00 10.00 2.00 63.00 17.00
## % 1.08 10.75 2.15 67.74 18.28
##
## Posterior classification table:
## --> mean of posterior probabilities in each class
## prob1 prob2 prob3 prob4 prob5
## class1 1 0.0000 0.0000 0.0000 0.0000
## class2 0 0.8581 0.0000 0.1061 0.0358
## class3 0 0.0000 0.9518 0.0000 0.0482
## class4 0 0.0299 0.0000 0.8649 0.1051
## class5 0 0.0139 0.0193 0.1058 0.8610
##
## Posterior probabilities above a threshold (%):
## class1 class2 class3 class4 class5
## prob>0.7 100 80 100 79.37 94.12
## prob>0.8 100 70 100 73.02 76.47
## prob>0.9 100 70 100 57.14 41.18
##
postprob(model2_grid)
##
## Posterior classification:
## class1 class2
## N 84.00 9.00
## % 90.32 9.68
##
## Posterior classification table:
## --> mean of posterior probabilities in each class
## prob1 prob2
## class1 0.9800 0.0200
## class2 0.1358 0.8642
##
## Posterior probabilities above a threshold (%):
## class1 class2
## prob>0.7 98.81 88.89
## prob>0.8 98.81 88.89
## prob>0.9 91.67 55.56
##
residualplot_step1_v2 <- function(model, nameofoutcome, nameofage, data, ylimit = c(-50, 50)) {
require(dplyr)
k <- model$ng                       # number of latent classes
preds <- model$pred                 # subject-level predictions from hlme
names(preds)[6] <- nameofoutcome    # rename the observed-outcome column (position 6 of model$pred)
nameofid <- names(model$pred)[1]
# Join predictions to posterior class assignments and to the raw data
# (left_join's argument is `by`, not `.by`)
test <- dplyr::left_join(preds, model$pprob, by = nameofid)
test <- dplyr::left_join(test, data, by = c(nameofid, nameofoutcome))
plotvalues <- NULL
for (i in 1:k) {
# Residual = observed outcome minus the subject-specific prediction for class i
newplotvalues <- test %>% filter(class == i) %>%
mutate(Residuals = get(nameofoutcome) - eval(parse(text = paste0("pred_ss", i))))
plotvalues <- rbind(plotvalues, newplotvalues)
plotvaluessub <- plotvalues %>% filter(class == i)
pname <- paste0("p", i)
assign(pname, ggplot2::ggplot(data = plotvaluessub, aes(x = get(nameofage),
y = Residuals, group = class)) + theme(axis.text = element_text(size = 16),
text = element_text(size = 16)) + geom_point() +
stat_summary(fun = mean, geom = "line", size = 3,
col = "CadetBlue", group = 1) + ggtitle(paste("Residuals in class", i)) + ylim(ylimit))
print(eval(parse(text = (paste0("p", i)))))
plotname <- paste0("p", i, ".png")
ggplot2::ggsave(filename = plotname)
}
return(as.list(mget(paste0("p", 1:k))))
}
rp2 <- residualplot_step1_v2(model8_grid,
nameofoutcome="id", nameofage = "courseid",
data = df5,
ylimit=c(-3,3))
rp2 <- residualplot_step1_v2(model7_grid,
nameofoutcome="id", nameofage = "courseid",
data = df5,
ylimit=c(-3,3))
rp2 <- residualplot_step1_v2(model2_grid,
nameofoutcome="id", nameofage = "courseid",
data = df5,
ylimit=c(-3,3))
classes_2fac <- as.data.frame(model2_grid$pprob[,1:2])
df_2fac <- merge(df4, classes_2fac, by = "altid")
table(classes_2fac$class)
##
## 1 2
## 84 9
df_2fac$classn[df_2fac$class == "1"] <- "Class 1, n = 84"
df_2fac$classn[df_2fac$class == "2"] <- "Class 2, n = 9"
df_2fac$classn <- as.factor(df_2fac$classn)
ggplot(df_2fac, aes(x=courseid, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment")
ggplot(df_2fac, aes(x=courseid, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))
classes_5fac <- as.data.frame(model5_grid$pprob[,1:2])
df_5fac <- merge(df4, classes_5fac, by = "altid")
table(classes_5fac$class)
##
## 1 2 3 4 5
## 1 10 2 63 17
df_5fac$classn[df_5fac$class == "1"] <- "Class 1, n = 1"
df_5fac$classn[df_5fac$class == "2"] <- "Class 2, n = 10"
df_5fac$classn[df_5fac$class == "3"] <- "Class 3, n = 2"
df_5fac$classn[df_5fac$class == "4"] <- "Class 4, n = 63"
df_5fac$classn[df_5fac$class == "5"] <- "Class 5, n = 17"
df_5fac$classn <- as.factor(df_5fac$classn)
ggplot(df_5fac, aes(x=courseid, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment")
ggplot(df_5fac, aes(x=courseid, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))
classes_6fac <- as.data.frame(model6_grid$pprob[,1:2])
df_6fac <- merge(df4, classes_6fac, by = "altid")
table(classes_6fac$class)
##
## 1 2 3 4 5 6
## 26 11 37 15 2 2
df_6fac$classn[df_6fac$class == "1"] <- "Class 1, n = 26"
df_6fac$classn[df_6fac$class == "2"] <- "Class 2, n = 11"
df_6fac$classn[df_6fac$class == "3"] <- "Class 3, n = 37"
df_6fac$classn[df_6fac$class == "4"] <- "Class 4, n = 15"
df_6fac$classn[df_6fac$class == "5"] <- "Class 5, n = 2"
df_6fac$classn[df_6fac$class == "6"] <- "Class 6, n = 2"
df_6fac$classn <- as.factor(df_6fac$classn)
ggplot(df_6fac, aes(x=courseid, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment")
ggplot(df_6fac, aes(x=courseid, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))
classes_7fac <- as.data.frame(model7_grid$pprob[,1:2])
df_7fac <- merge(df4, classes_7fac, by = "altid")
table(classes_7fac$class)
##
## 1 2 3 4 5 6 7
## 3 31 13 2 13 15 16
df_7fac$classn[df_7fac$class == "1"] <- "Class 1, n = 3"
df_7fac$classn[df_7fac$class == "2"] <- "Class 2, n = 31"
df_7fac$classn[df_7fac$class == "3"] <- "Class 3, n = 13"
df_7fac$classn[df_7fac$class == "4"] <- "Class 4, n = 2"
df_7fac$classn[df_7fac$class == "5"] <- "Class 5, n = 13"
df_7fac$classn[df_7fac$class == "6"] <- "Class 6, n = 15"
df_7fac$classn[df_7fac$class == "7"] <- "Class 7, n = 16"
df_7fac$classn <- as.factor(df_7fac$classn)
ggplot(df_7fac, aes(x=courseid, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment")
ggplot(df_7fac, aes(x=courseid, y=id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))
classes_8fac <- as.data.frame(model8_grid$pprob[,1:2])
df_8fac <- merge(df4, classes_8fac, by = "altid")
table(classes_8fac$class)
##
## 1 2 3 4 5 6 7 8
## 11 2 13 28 17 16 3 3
df_8fac$classn[df_8fac$class == "1"] <- "Class 1, n = 11"
df_8fac$classn[df_8fac$class == "2"] <- "Class 2, n = 2"
df_8fac$classn[df_8fac$class == "3"] <- "Class 3, n = 13"
df_8fac$classn[df_8fac$class == "4"] <- "Class 4, n = 28"
df_8fac$classn[df_8fac$class == "5"] <- "Class 5, n = 17"
df_8fac$classn[df_8fac$class == "6"] <- "Class 6, n = 16"
df_8fac$classn[df_8fac$class == "7"] <- "Class 7, n = 3"
df_8fac$classn[df_8fac$class == "8"] <- "Class 8, n = 3"
df_8fac$classn <- as.factor(df_8fac$classn)
ggplot(df_8fac, aes(x=courseid, y=std_id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=F, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment")
ggplot(df_8fac, aes(x=courseid, y=std_id, group=altid, color=classn)) +
geom_line(alpha = .25) +
geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))