# data for LCTMload(file ="uniscreen_output.RData")# data for LCTM plotsdfc <-read.csv(file="classes_wide.csv", header = T)postprob <-read.csv(file="classes_pp.csv", header=T)postprob$probfin <- postprob[cbind(seq_len(nrow(postprob)), postprob$class +2)]dfc <-merge(dfc, subset(postprob, select=-c(2:8)), by="altid")# data for regressionload(file="finaldata.RData")
# # descriptives ------------------------------------------------------------# table(df1$time)# # df1$time <- as.character(df1$time)# df1$time[df1$time == "fall17"] <- 1# df1$time[df1$time == "winter18"] <- 2# df1$time[df1$time == "spring18"] <- 3# df1$time[df1$time == "summer18"] <- 4# df1$time[df1$time == "fall18"] <- 5# df1$time[df1$time == "winter19"] <- 6# df1$time[df1$time == "spring19"] <- 7# df1$time[df1$time == "summer19"] <- 8# df1$time[df1$time == "fall19"] <- 9# df1$time[df1$time == "winter20"] <- 10# df1$time[df1$time == "spring20"] <- 11# df1$time[df1$time == "summer20"] <- 12# df1$time <- as.numeric(df1$time)# df1$UID <- as.factor(df1$UID)# df1$altid <- as.numeric(df1$UID)# str(df1)# # describe(df)# # # determine fixed or random effects ---------------------------------------# model1 <- hlme(fixed = gpa~1+time+I(time^2),# random = ~-1,# subject = "altid",# ng = 1,# nwg = FALSE,# data = data.frame(df1))# # rp <- residualplot_step1( model1,# nameofoutcome="gpa", nameofage = "time",# data = df1,# ylimit=c(-3,3))# # rp + xlab("Semester")# # # residual profile can be approximated by a flat line, so a random intercept, slope, or quadratic term should be considered instead of a fixed intercept/slope/term# # # determine the number of classes -----------------------------------------# # test the models with different numbers of classes# # initial values are specified using gridsearch - hlme is run for 30 times from 100 random vectors, and the estimation procedure is finalized for the initial value that produced the best log-likelihood# # set.seed(100)# model1_v1 <- hlme(fixed = gpa~1+time+I(time^2),# random = ~ 1 + time,# subject = "altid",# ng = 1,# nwg = FALSE,# data = data.frame(df1))# # model2_grid <- gridsearch(hlme(fixed = gpa ~ 1 + time + I(time^2),# random = ~ 1 + time,# subject = "altid",# data = data.frame(df1),# ng = 2,# mixture = ~1+time+I(time^2)),# rep=100,# maxiter=30,# minit=model1_v1)# # model3_grid <- gridsearch(hlme(fixed = gpa~1+time+I(time^2),# random = ~ 1 + time,# subject = "altid",# data = data.frame(df1),# ng = 3,# mixture = ~1+time+I(time^2)),# rep=100,# maxiter=30,# minit=model1_v1)# # model4_grid <- gridsearch(hlme(fixed = gpa~1+time+I(time^2),# random = ~ 1 + time,# subject = "altid",# data = data.frame(df1),# ng = 4,# mixture = ~1+time+I(time^2)),# rep=100,# maxiter=30,# minit=model1_v1)# # model5_grid <- gridsearch(hlme(fixed = gpa~1+time+I(time^2),# random = ~ 1 + time,# subject = "altid",# data = data.frame(df1),# ng = 5,# mixture = ~1+time+I(time^2)),# rep=100,# maxiter=30,# minit=model1_v1)# # model6_grid <- gridsearch(hlme(fixed = gpa~1+time+I(time^2),# random = ~ 1 + time,# subject = "altid",# data = data.frame(df1),# ng = 6,# mixture = ~1+time+I(time^2)),# rep=100,# maxiter=30,# minit=model1_v1)# # model7_grid <- gridsearch(hlme(fixed = gpa~1+time+I(time^2),# random = ~ 1 + time,# subject = "altid",# data = data.frame(df1),# ng = 7,# mixture = ~1+time+I(time^2)),# rep=100,# maxiter=30,# minit=model1_v1)# ## # # once all models have been tested, remove all non-model output (e.g., anything not labeled model# or model#_grid) and save as an RData file# # # this will prevent having to re-run the models, and since there is an element of randomness in the process, will preserve the output# ## # rm(df, df1, rp)# # save.image(file = "uniscreen_output.RData")
Fit Indices
Code
# choose the best model ---------------------------------------------------# choose the best model by comparing the fit indices and other output# npm = number of parameters - This corresponds to the additional class-specific# parameters: the proportion of the class, the two Weibull parameters, and the three fixed# effects for the quadratic trajectory (intercept, time and time squared).# AIC, BIC, SABIC# entropy# %class - identification of spurious groups# The IC statistics take both model fit and complexity into consideration. Lower values indicate better trade-off between model fit and complexity.# However, differences in the penalty functions (e.g., penalizing model complexity) of different model selection indices result in inconsistent class solutions (i.e., different indices may favor different class solutions). Previous research on mixture model estimation also suggests sample size plays a role in penalty calculations. First, as N approaches infinity, penalty / N should become closer to zero. Second, as N approaches infinity, log(N) / pen-alty should become closer to zero...Note that the CAIC, BIC, and SABIC fulfill these two conditions.# AIC aims at finding the model that minimizes the Kullback–Leibler (K-L) criterion, selecting an approximate model, and providing better predictions of the population parameters# On the contrary, BIC targets the “true” underlying model with the highest posterior probability# Chen, Q., Luo, W., Palardy, G. J., Glaman, R., & McEnturff, A. (2017). The efficacy of common fit indices for enumerating classes in growth mixture models when nested data structure is ignored: A Monte Carlo study. Sage Open, 7(1), 2158244017700459.# The posterior classification can be used to assess the goodness-of-fit of the model (for the# selection of the number of latent classes for instance) and the discrimination of the latent# classes. Many indicators can be derived from it (Proust-Limaet al.2014). The package# lcmm provides two indicators in the function postprob# The proportion of subjects classified in each latent class with a posterior probability# above 0.7, 0.8 and 0.9. This indicates the proportion of subjects not ambiguously# classified in each latent class.# The posterior classification table as defined in Table 3 which computes the mean of the# posterior probabilities of belonging to the latent class among the subjects classified# a posteriori in each latent class. A perfect classification would provide ones in the # diagonal and zeros elsewhere. In practice, high diagonal terms indicate a good# discrimination of the population.tab <-summarytable( model1, model2_grid, model3_grid, model4_grid, model5_grid, model6_grid, model7_grid,which =c("G", "loglik", "conv", "npm", "AIC", "BIC", "SABIC","entropy", "%class"),display = F)brks_a <-quantile(tab[,5], probs =seq(.05, .95, .05), na.rm =TRUE)clrs_a <-round(seq(240, 40, length.out =length(brks_a) +1), 0) %>% {paste0("rgb(255,", ., ",", ., ")")}brks_b <-quantile(tab[,6], probs =seq(.05, .95, .05), na.rm =TRUE)clrs_b <-round(seq(240, 40, length.out =length(brks_a) +1), 0) %>% {paste0("rgb(255,", ., ",", ., ")")}brks_s <-quantile(tab[,7], probs =seq(.05, .95, .05), na.rm =TRUE)clrs_s <-round(seq(240, 40, length.out =length(brks_a) +1), 0) %>% {paste0("rgb(255,", ., ",", ., ")")}# lighter shade = lower value = better modeldatatable(tab) %>%formatRound(columns =c(1:7), digits =0) %>%formatRound(columns =c(8:ncol(tab)), digits =2) %>%formatStyle("AIC", backgroundColor =styleInterval(brks_a, clrs_a)) %>%formatStyle("BIC", backgroundColor =styleInterval(brks_b, clrs_b))%>%formatStyle("SABIC", backgroundColor =styleInterval(brks_s, clrs_s))
# weights: 40 (28 variable)
initial value 1380.897729
iter 10 value 1071.744419
iter 20 value 1045.337396
iter 30 value 1045.079659
final value 1045.079129
converged
Code
output <-summary(test)# coefficient = log odds of being in classif moving from reference group/baseline to indicated group1 z <- output$coefficients^2/ output$standard.errors^2p <- (1-pnorm(abs(z), 0, 1)) *2# build tidy output per classclass_names <-rownames(output$coefficients)col_names <-rep(c("OR", "p"), length(class_names))tableout <-do.call(cbind, lapply(seq_along(class_names), function(i) { df <-data.frame(OR =round(exp(output$coefficients[i, ]), 3),p =round(p[i, ], 3) )colnames(df) <-paste0(class_names[i], c("_OR", "_p")) df}))kable(tableout, format ="html", digits =3, col.names = col_names) |>kable_styling(bootstrap_options =c("striped", "condensed"),full_width =TRUE, font_size =12) |>add_header_above(c(" "=1,"Consistently High Performers"=2,"Sharply Improving"=2,"Starting to Struggle"=2,"Consistently Struggling & At-Risk"=2)) |>column_spec(1, bold =TRUE)
# weights: 95 (72 variable)
initial value 1380.897729
iter 10 value 1050.152500
iter 20 value 1020.295562
iter 30 value 1005.666542
iter 40 value 999.294082
iter 50 value 998.715482
iter 60 value 998.676113
iter 70 value 998.671555
final value 998.671385
converged
Code
output <-summary(test2)# coefficient = log odds of being in classif moving from reference group/baseline to indicated group1 z <- output$coefficients^2/ output$standard.errors^2p <- (1-pnorm(abs(z), 0, 1)) *2# build tidy output per classclass_names <-rownames(output$coefficients)col_names <-rep(c("OR", "p"), length(class_names))tableout <-do.call(cbind, lapply(seq_along(class_names), function(i) { df <-data.frame(OR =round(exp(output$coefficients[i, ]), 3),p =round(p[i, ], 3) )colnames(df) <-paste0(class_names[i], c("_OR", "_p")) df}))kable(tableout, format ="html", digits =3, col.names = col_names) |>kable_styling(bootstrap_options =c("striped", "condensed"),full_width =TRUE, font_size =12) |>add_header_above(c(" "=1,"Consistently High Performers"=2,"Sharply Improving"=2,"Starting to Struggle"=2,"Consistently Struggling & At-Risk"=2)) |>column_spec(1, bold =TRUE)
# weights: 140 (108 variable)
initial value 1380.897729
iter 10 value 1018.835981
iter 20 value 992.378700
iter 30 value 969.965385
iter 40 value 962.225970
iter 50 value 959.980901
iter 60 value 959.632663
iter 70 value 959.599424
iter 80 value 959.593535
iter 90 value 959.592953
final value 959.592931
converged
Code
output <-summary(test3)# coefficient = log odds of being in classif moving from reference group/baseline to indicated group1 z <- output$coefficients^2/ output$standard.errors^2p <- (1-pnorm(abs(z), 0, 1)) *2# build tidy output per classclass_names <-rownames(output$coefficients)col_names <-rep(c("OR", "p"), length(class_names))tableout <-do.call(cbind, lapply(seq_along(class_names), function(i) { df <-data.frame(OR =round(exp(output$coefficients[i, ]), 3),p =round(p[i, ], 3) )colnames(df) <-paste0(class_names[i], c("_OR", "_p")) df}))kable(tableout, format ="html", digits =3, col.names = col_names) |>kable_styling(bootstrap_options =c("striped", "condensed"),full_width =TRUE, font_size =12) |>add_header_above(c(" "=1,"Consistently High Performers"=2,"Sharply Improving"=2,"Starting to Struggle"=2,"Consistently Struggling & At-Risk"=2)) |>column_spec(1, bold =TRUE)
# weights: 185 (144 variable)
initial value 1380.897729
iter 10 value 1026.161838
iter 20 value 992.101426
iter 30 value 953.358682
iter 40 value 946.018296
iter 50 value 941.495887
iter 60 value 940.305037
iter 70 value 940.080322
iter 80 value 940.049116
iter 90 value 940.042298
iter 100 value 940.041573
final value 940.041573
stopped after 100 iterations
Code
output <-summary(test4)# coefficient = log odds of being in classif moving from reference group/baseline to indicated group1 z <- output$coefficients^2/ output$standard.errors^2p <- (1-pnorm(abs(z), 0, 1)) *2# build tidy output per classclass_names <-rownames(output$coefficients)col_names <-rep(c("OR", "p"), length(class_names))tableout <-do.call(cbind, lapply(seq_along(class_names), function(i) { df <-data.frame(OR =round(exp(output$coefficients[i, ]), 3),p =round(p[i, ], 3) )colnames(df) <-paste0(class_names[i], c("_OR", "_p")) df}))kable(tableout, format ="html", digits =3, col.names = col_names) |>kable_styling(bootstrap_options =c("striped", "condensed"),full_width =TRUE, font_size =12) |>add_header_above(c(" "=1,"Consistently High Performers"=2,"Sharply Improving"=2,"Starting to Struggle"=2,"Consistently Struggling & At-Risk"=2)) |>column_spec(1, bold =TRUE)