Load Libraries

library(tidyverse)   # includes tidyr, dplyr, and ggplot2
# library(devtools)
# devtools::install_github("hlennon/LCTMtools")
library(LCTMtools)
library(lcmm)
library(DT)

# LCTMtools was built against an older version of dplyr and can throw errors when used alongside a newer dplyr release
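If the conflict becomes blocking, one workaround is to pin an older dplyr release with devtools::install_version. A sketch, kept commented like the install lines above; the version number is illustrative, not a tested requirement:

# devtools::install_version("dplyr", version = "1.0.10")  # illustrative version, adjust as needed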

Load Data

df <- read.csv(file="for_trajectory_analysis_updated 5-16.csv", header = T)
df$AnonymousID <- as.factor(df$AnonymousID)
df$altid <- as.numeric(df$AnonymousID)
df2 <- subset(df, select=c(altid, id, courseid))

# Define the relevant course IDs
courseid_40A = "2 40A Post"
courseid_40B = "3 40B Pre"

# Create the new courseid with averaged or existing scores
df2_new <- df2 %>%
  group_by(altid) %>%
  mutate(id = ifelse(courseid %in% c(courseid_40A, courseid_40B), 
                     mean(id[courseid %in% c(courseid_40A, courseid_40B)], na.rm = TRUE), id),
         courseid = ifelse(courseid %in% c(courseid_40A, courseid_40B), "2.5 40A40B", courseid)) %>%
  ungroup() %>%
  distinct()  # Remove duplicates

# Define the relevant course IDs
courseid_40B = "4 40B Post"
courseid_40C = "5 40C Pre"

# Create the new courseid with averaged or existing scores
df2_new2 <- df2_new %>%
  group_by(altid) %>%
  mutate(id = ifelse(courseid %in% c(courseid_40B, courseid_40C), 
                     mean(id[courseid %in% c(courseid_40B, courseid_40C)], na.rm = TRUE), id),
         courseid = ifelse(courseid %in% c(courseid_40B, courseid_40C), "4.5 40B40C", courseid)) %>%
  ungroup() %>%
  distinct()  # Remove duplicates

# Define the relevant course IDs
courseid_40B = "2.5 40A40B"
courseid_40C = "4.5 40B40C"

# Create the new courseid with averaged or existing scores
df2_new3 <- df2_new2 %>%
  group_by(altid) %>%
  mutate(id = ifelse(courseid %in% c(courseid_40B, courseid_40C), 
                     mean(id[courseid %in% c(courseid_40B, courseid_40C)], na.rm = TRUE), id),
         courseid = ifelse(courseid %in% c(courseid_40B, courseid_40C), "3 mid", courseid)) %>%
  ungroup() %>%
  distinct()  # Remove duplicates

df3 <- df2_new3 %>%
  group_by(altid, courseid) %>%
  summarise(id = mean(id), .groups = 'drop')

df3 %>%
  summarise(unique_altid_count = n_distinct(altid))
## # A tibble: 1 x 1
##   unique_altid_count
##                <int>
## 1               1323
load(file = "LCTM_outputv3.RData")

Screen Outliers

Standardize scores and flag participants with one or more standardized scores beyond ±3. The removal step is commented out below, so currently no outliers are removed.

df4 <- cbind.data.frame(df3$altid, df3$courseid, df3$id, scale(df3$id, center=T, scale=T))
colnames(df4)[1:3] <- colnames(df3)
colnames(df4)[4] <- "std_id"

data_wide <- subset(df4, select=-c(3)) %>%
  pivot_wider(names_from = courseid, values_from = std_id, values_fn = mean)

data_wide2 <- na.omit(data_wide)

data_wide$outs <- rowSums(data_wide[2:4] < -3, na.rm = T) + rowSums(data_wide[2:4] > 3, na.rm = T)
table(data_wide$outs)
## 
##    0    1 
## 1314    9
# data_wide <- subset(data_wide, outs == 0, select=-c(outs))

df5 <- subset(df4, df4$altid %in% data_wide2$altid)

Visualize Data

boxplot(df4$id ~ df4$courseid, ylab="Identity", xlab="Course")

boxplot(df5$id ~ df5$courseid, ylab="Identity", xlab="Course")

Recode Variables

Make numeric version of course ID.

df5$courseid <- as.factor(df5$courseid)
df5$courseidnum <- as.numeric(df5$courseid)

Run LCTM

Determine fixed or random effects

The residual profile can be approximated by a flat line, so a random intercept, slope, or quadratic term should be considered in place of the corresponding fixed-only intercept, slope, or quadratic term.

Loading from RData file – code is currently commented out.

# model1 <- hlme(fixed = id~1+courseidnum+I(courseidnum^2),
#                random = ~-1,
#                subject = "altid",
#                ng = 1,
#                nwg = FALSE,
#                data = data.frame(df5))

rp <- residualplot_step1(model1,
                          nameofoutcome="id",  nameofage = "courseid",
                          data = df5,
                          ylimit=c(-3,3))

## "p1"

Determine number of classes

Test the models with different numbers of classes.

Initial values are specified using gridsearch: for each candidate model, hlme is run for up to 30 iterations from each of 100 random initial-value vectors, and the estimation procedure is then finalized from the initial values that produced the best log-likelihood.

Loading from RData file – code is currently commented out.

# set.seed(100)
# model1_v1 <- hlme(fixed = id~1+courseidnum+I(courseidnum^2),
#                   random = ~ 1 + courseidnum,
#                   subject = "altid",
#                   ng = 1,
#                   nwg = FALSE,
#                   data = data.frame(df5))
# 
# model2_grid <- gridsearch(hlme(fixed = id ~ 1 + courseidnum + I(courseidnum^2),
#                                random = ~ 1 + courseidnum,
#                                subject = "altid",
#                                data = data.frame(df5),
#                                ng = 2,
#                                mixture = ~1+courseidnum+I(courseidnum^2)),
#                           rep=100,
#                           maxiter=30,
#                           minit=model1_v1)
# 
# model3_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
#                                random = ~ 1 + courseidnum,
#                                subject = "altid",
#                                data = data.frame(df5),
#                                ng = 3,
#                                mixture = ~1+courseidnum+I(courseidnum^2)),
#                           rep=100,
#                           maxiter=30,
#                           minit=model1_v1)
# 
# model4_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
#                                random = ~ 1 + courseidnum,
#                                subject = "altid",
#                                data = data.frame(df5),
#                                ng = 4,
#                                mixture = ~1+courseidnum+I(courseidnum^2)),
#                           rep=100,
#                           maxiter=30,
#                           minit=model1_v1)
# 
# model5_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
#                                random = ~ 1 + courseidnum,
#                                subject = "altid",
#                                data = data.frame(df5),
#                                ng = 5,
#                                mixture = ~1+courseidnum+I(courseidnum^2)),
#                           rep=100,
#                           maxiter=30,
#                           minit=model1_v1)
# 
# model6_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
#                                random = ~ 1 + courseidnum,
#                                subject = "altid",
#                                data = data.frame(df5),
#                                ng = 6,
#                                mixture = ~1+courseidnum+I(courseidnum^2)),
#                           rep=100,
#                           maxiter=30,
#                           minit=model1_v1)
# 
# model7_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
#                                random = ~ 1 + courseidnum,
#                                subject = "altid",
#                                data = data.frame(df5),
#                                ng = 7,
#                                mixture = ~1+courseidnum+I(courseidnum^2)),
#                           rep=100,
#                           maxiter=30,
#                           minit=model1_v1)
# 
# model8_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
#                                random = ~ 1 + courseidnum,
#                                subject = "altid",
#                                data = data.frame(df5),
#                                ng = 8,
#                                mixture = ~1+courseidnum+I(courseidnum^2)),
#                           rep=100,
#                           maxiter=30,
#                           minit=model1_v1)
# 
# model9_grid <- gridsearch(hlme(fixed = id~1+courseidnum+I(courseidnum^2),
#                                random = ~ 1 + courseidnum,
#                                subject = "altid",
#                                data = data.frame(df5),
#                                ng = 9,
#                                mixture = ~1+courseidnum+I(courseidnum^2)),
#                           rep=100,
#                           maxiter=30,
#                           minit=model1_v1)
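The eight gridsearch calls above differ only in ng, so they could equivalently be produced in a loop. A sketch under the same specification, kept commented like the originals since the fitted objects are loaded from the RData file:

# set.seed(100)
# grid_models <- lapply(2:9, function(g)
#   gridsearch(hlme(fixed   = id ~ 1 + courseidnum + I(courseidnum^2),
#                   mixture = ~ 1 + courseidnum + I(courseidnum^2),
#                   random  = ~ 1 + courseidnum,
#                   subject = "altid",
#                   ng      = g,
#                   data    = data.frame(df5)),
#              rep = 100, maxiter = 30, minit = model1_v1))
# names(grid_models) <- paste0("model", 2:9, "_grid")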

Once all models have been tested, remove all non-model objects (i.e., anything not named model# or model#_grid) and save the workspace as an RData file. This avoids having to re-run the models and, since the estimation involves an element of randomness, preserves the output.

# rm(df2, df3, rp, data_wide, out_data)
# save.image(file = "LCTM_outputv3.RData")

Choose the best model

Choose the best model by comparing the fit indices and other output:

  • npm = number of parameters. Each additional class adds class-specific parameters: the class-membership probability and the three fixed effects of the quadratic trajectory (intercept, time, and time squared).
  • AIC, BIC, SABIC (formulas are sketched after the citation below)
  • entropy
  • %class - the class-size percentages, used to identify spurious (very small) groups

“The IC statistics take both model fit and complexity into consideration. Lower values indicate better trade-off between model fit and complexity. However, differences in the penalty functions (e.g., penalizing model complexity) of different model selection indices result in inconsistent class solutions (i.e., different indices may favor different class solutions). Previous research on mixture model estimation also suggests sample size plays a role in penalty calculations. First, as N approaches infinity, penalty / N should become closer to zero. Second, as N approaches infinity, log(N) / penalty should become closer to zero…Note that the CAIC, BIC, and SABIC fulfill these two conditions. AIC aims at finding the model that minimizes the Kullback–Leibler (K-L) criterion, selecting an approximate model, and providing better predictions of the population parameters. On the contrary, BIC targets the “true” underlying model with the highest posterior probability.”

Chen, Q., Luo, W., Palardy, G. J., Glaman, R., & McEnturff, A. (2017). The efficacy of common fit indices for enumerating classes in growth mixture models when nested data structure is ignored: A Monte Carlo study. Sage Open, 7(1), 2158244017700459.
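For reference, these indices can be reproduced from a fitted model's log-likelihood and parameter count. A minimal sketch, assuming the standard definitions and that the hlme fit stores the log-likelihood in $loglik, the estimated parameter vector in $best, and the number of subjects in $ns:

fit_indices <- function(model) {
  LL <- model$loglik
  k  <- length(model$best)                    # number of estimated parameters (npm)
  N  <- model$ns                              # number of subjects
  c(AIC   = -2 * LL + 2 * k,
    BIC   = -2 * LL + k * log(N),
    SABIC = -2 * LL + k * log((N + 2) / 24))  # sample-size-adjusted BIC
}
# e.g., fit_indices(model2_grid)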

Create Table for Fit Statistics

Two- and three-trajectory models have the best fit.

tab <- summarytable(
  model1, model2_grid, model3_grid, model4_grid,
  model5_grid, model6_grid, model7_grid,
  model8_grid, model9_grid,
  which = c("G", "loglik", "conv", "npm", "AIC", "BIC", "SABIC",
            "entropy", "%class"))
##             G    loglik conv npm      AIC      BIC    SABIC   entropy
## model1      1 -263.8827    1   4 535.7654 545.8958 533.2687 1.0000000
## model2_grid 2 -187.2404    1  11 396.4808 424.3394 389.6149 0.8665772
## model3_grid 3 -181.4809    1  15 392.9619 430.9509 383.5992 0.8964830
## model4_grid 4 -180.2929    1  19 398.5858 446.7052 386.7265 0.7932849
## model5_grid 5 -172.1333    1  23 390.2667 448.5165 375.9106 0.7941472
## model6_grid 6 -168.9788    1  27 391.9577 460.3379 375.1049 0.8272695
## model7_grid 7 -166.0604    1  31 394.1209 472.6315 374.7714 0.8465209
## model8_grid 8 -153.4963    1  35 376.9926 465.6336 355.1464 0.9154080
## model9_grid 9 -150.9043    1  39 379.8085 478.5799 355.4656 0.8994757
##                %class1   %class2   %class3   %class4   %class5   %class6
## model1      100.000000                                                  
## model2_grid  90.322581  9.677419                                        
## model3_grid  87.096774  3.225806  9.677419                              
## model4_grid  66.666667 20.430108  2.150538 10.752688                    
## model5_grid   1.075269 10.752688  2.150538 67.741935 18.279570          
## model6_grid  27.956989 11.827957 39.784946 16.129032  2.150538  2.150538
## model7_grid   3.225806 33.333333 13.978495  2.150538 13.978495 16.129032
## model8_grid  11.827957  2.150538 13.978495 30.107527 18.279570 17.204301
## model9_grid   3.225806 26.881720  4.301075 17.204301 17.204301  3.225806
##               %class7   %class8  %class9
## model1                                  
## model2_grid                             
## model3_grid                             
## model4_grid                             
## model5_grid                             
## model6_grid                             
## model7_grid 17.204301                   
## model8_grid  3.225806  3.225806         
## model9_grid  2.150538 13.978495 11.82796
brks_a <- quantile(tab[, "AIC"], probs = seq(.05, .95, .05), na.rm = TRUE)
clrs_a <- round(seq(240, 40, length.out = length(brks_a) + 1), 0) %>%
  {paste0("rgb(255,", ., ",", ., ")")}

brks_b <- quantile(tab[, "BIC"], probs = seq(.05, .95, .05), na.rm = TRUE)
clrs_b <- round(seq(240, 40, length.out = length(brks_b) + 1), 0) %>%
  {paste0("rgb(255,", ., ",", ., ")")}

brks_s <- quantile(tab[, "SABIC"], probs = seq(.05, .95, .05), na.rm = TRUE)
clrs_s <- round(seq(240, 40, length.out = length(brks_s) + 1), 0) %>%
  {paste0("rgb(255,", ., ",", ., ")")}

# lighter shade = lower value = better model
datatable(tab) %>%
  formatRound(columns = c(1:7), digits = 0) %>%
  formatRound(columns = c(8:ncol(tab)), digits = 2) %>%
  formatStyle("AIC", backgroundColor = styleInterval(brks_a, clrs_a)) %>%
  formatStyle("BIC", backgroundColor = styleInterval(brks_b, clrs_b))%>%
  formatStyle("SABIC", backgroundColor = styleInterval(brks_s, clrs_s))

Check Posterior Probability

“The posterior classification can be used to assess the goodness-of-fit of the model (for the selection of the number of latent classes for instance) and the discrimination of the latent classes. Many indicators can be derived from it (Proust-Lima et al. 2014). The package lcmm provides two indicators in the function postprob:

The proportion of subjects classified in each latent class with a posterior probability above 0.7, 0.8 and 0.9. This indicates the proportion of subjects not ambiguously classified in each latent class.

The posterior classification table as defined in Table 3 which computes the mean of the posterior probabilities of belonging to the latent class among the subjects classified a posteriori in each latent class. A perfect classification would provide ones in the diagonal and zeros elsewhere. In practice, high diagonal terms indicate a good discrimination of the population.”

Proust-Lima, C., Philipps, V., & Liquet, B. (2015). Estimation of extended mixed models using latent classes and latent processes: the R package lcmm. arXiv preprint arXiv:1503.00890.
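These indicators can also be computed by hand from the posterior probability matrix. A sketch, assuming model$pprob is laid out as id, modal class, then one probability column per class (as hlme returns), and treating the reported entropy as relative entropy:

classification_summary <- function(model, threshold = 0.7) {
  K  <- model$ng
  pp <- as.matrix(model$pprob[, 2 + seq_len(K)])  # posterior probabilities
  N  <- nrow(pp)
  # Relative entropy: 1 = perfect separation, 0 = chance-level assignment
  rel_entropy <- 1 - (-sum(pp * log(pmax(pp, 1e-12)))) / (N * log(K))
  # Overall share of subjects whose highest posterior probability clears the threshold
  list(relative_entropy     = rel_entropy,
       prop_above_threshold = mean(apply(pp, 1, max) > threshold))
}
# e.g., classification_summary(model8_grid, threshold = 0.9)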

postprob(model8_grid)
##  
## Posterior classification: 
##   class1 class2 class3 class4 class5 class6 class7 class8
## N  11.00   2.00  13.00  28.00  17.00   16.0   3.00   3.00
## %  11.83   2.15  13.98  30.11  18.28   17.2   3.23   3.23
##  
## Posterior classification table: 
##      --> mean of posterior probabilities in each class 
##         prob1 prob2  prob3  prob4  prob5  prob6  prob7  prob8
## class1 0.8901     0 0.0000 0.0930 0.0141 0.0028 0.0000 0.0000
## class2 0.0000     1 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## class3 0.0000     0 0.9769 0.0000 0.0228 0.0000 0.0003 0.0000
## class4 0.0194     0 0.0000 0.9525 0.0152 0.0106 0.0000 0.0023
## class5 0.0015     0 0.0183 0.0819 0.8981 0.0000 0.0002 0.0000
## class6 0.0287     0 0.0000 0.0207 0.0000 0.9507 0.0000 0.0000
## class7 0.0000     0 0.0342 0.0000 0.0177 0.0000 0.9480 0.0000
## class8 0.0000     0 0.0000 0.0016 0.0000 0.0000 0.0000 0.9984
##  
## Posterior probabilities above a threshold (%): 
##          class1 class2 class3 class4 class5 class6 class7 class8
## prob>0.7  81.82    100    100  96.43  88.24  93.75 100.00    100
## prob>0.8  81.82    100    100  96.43  88.24  87.50 100.00    100
## prob>0.9  63.64    100    100  89.29  76.47  87.50  66.67    100
## 
postprob(model5_grid)
##  
## Posterior classification: 
##   class1 class2 class3 class4 class5
## N   1.00  10.00   2.00  63.00  17.00
## %   1.08  10.75   2.15  67.74  18.28
##  
## Posterior classification table: 
##      --> mean of posterior probabilities in each class 
##        prob1  prob2  prob3  prob4  prob5
## class1     1 0.0000 0.0000 0.0000 0.0000
## class2     0 0.8581 0.0000 0.1061 0.0358
## class3     0 0.0000 0.9518 0.0000 0.0482
## class4     0 0.0299 0.0000 0.8649 0.1051
## class5     0 0.0139 0.0193 0.1058 0.8610
##  
## Posterior probabilities above a threshold (%): 
##          class1 class2 class3 class4 class5
## prob>0.7    100     80    100  79.37  94.12
## prob>0.8    100     70    100  73.02  76.47
## prob>0.9    100     70    100  57.14  41.18
## 
postprob(model2_grid)
##  
## Posterior classification: 
##   class1 class2
## N  84.00   9.00
## %  90.32   9.68
##  
## Posterior classification table: 
##      --> mean of posterior probabilities in each class 
##         prob1  prob2
## class1 0.9800 0.0200
## class2 0.1358 0.8642
##  
## Posterior probabilities above a threshold (%): 
##          class1 class2
## prob>0.7  98.81  88.89
## prob>0.8  98.81  88.89
## prob>0.9  91.67  55.56
## 

Re-Test Effects Structure & Terms

residualplot_step1_v2 <- function(model, nameofoutcome, nameofage, data, ylimit = c(-50, 50)) {
  require(dplyr)
  k <- model$ng
  # Column 6 of model$pred is treated as the observed outcome,
  # following LCTMtools::residualplot_step1
  preds <- model$pred
  names(preds)[6] <- nameofoutcome
  nameofid <- names(model$pred)[1]
  # Attach posterior class assignments, then the observed data
  # (by = , not .by = , for compatibility with current dplyr)
  test <- dplyr::left_join(preds, model$pprob, by = nameofid)
  test <- dplyr::left_join(test, data, by = c(nameofid, nameofoutcome))
  plotvalues <- NULL
  for (i in 1:k) {
    # Residual = observed outcome minus the class-specific
    # subject-specific prediction (pred_ss<i>)
    newplotvalues <- test %>%
      filter(class == i) %>%
      mutate(Residuals = get(nameofoutcome) - eval(parse(text = paste0("pred_ss", i))))
    plotvalues <- rbind(plotvalues, newplotvalues)
    plotvaluessub <- plotvalues %>% filter(class == i)
    pname <- paste0("p", i)
    assign(pname, ggplot2::ggplot(data = plotvaluessub,
                                  aes(x = get(nameofage), y = Residuals, group = class)) +
      theme(axis.text = element_text(size = 16), text = element_text(size = 16)) +
      geom_point() +
      stat_summary(fun = mean, geom = "line", size = 3, col = "CadetBlue", group = 1) +
      ggtitle(paste("Residuals in class", i)) +
      ylim(ylimit))
    print(eval(parse(text = pname)))
    ggplot2::ggsave(filename = paste0("p", i, ".png"))
  }
  return(as.list(mget(paste0("p", 1:k))))
}

rp2 <- residualplot_step1_v2(model8_grid,
                          nameofoutcome="id",  nameofage = "courseid",
                          data = df5,
                          ylimit=c(-3,3))

rp2 <- residualplot_step1_v2(model7_grid,
                          nameofoutcome="id",  nameofage = "courseid",
                          data = df5,
                          ylimit=c(-3,3))

rp2 <- residualplot_step1_v2(model2_grid,
                          nameofoutcome="id",  nameofage = "courseid",
                          data = df5,
                          ylimit=c(-3,3))

Visualize Solutions

Two-Trajectory Model

classes_2fac <- as.data.frame(model2_grid$pprob[,1:2])

df_2fac <- merge(df4, classes_2fac, by = "altid")
table(classes_2fac$class)
## 
##  1  2 
## 84  9
df_2fac$classn[df_2fac$class == "1"] <- "Class 1, n = 84"
df_2fac$classn[df_2fac$class == "2"] <- "Class 2, n = 9"

df_2fac$classn <- as.factor(df_2fac$classn)

ggplot(df_2fac, aes(x=courseid, y=id, group=altid, color=classn)) +
  geom_line(alpha = .25) +
  geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
  xlab("Course") + ylab("ID") + labs(color = "Class Assignment")

ggplot(df_2fac, aes(x=courseid, y=id, group=altid, color=classn)) +
  geom_line(alpha = .25) +
  geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
  xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))

Five-Trajectory Model

classes_5fac <- as.data.frame(model5_grid$pprob[,1:2])

df_5fac <- merge(df4, classes_5fac, by = "altid")
table(classes_5fac$class)
## 
##  1  2  3  4  5 
##  1 10  2 63 17
df_5fac$classn[df_5fac$class == "1"] <- "Class 1, n = 1"
df_5fac$classn[df_5fac$class == "2"] <- "Class 2, n = 10"
df_5fac$classn[df_5fac$class == "3"] <- "Class 3, n = 2"
df_5fac$classn[df_5fac$class == "4"] <- "Class 4, n = 63"
df_5fac$classn[df_5fac$class == "5"] <- "Class 5, n = 17"

df_5fac$classn <- as.factor(df_5fac$classn)

ggplot(df_5fac, aes(x=courseid, y=id, group=altid, color=classn)) +
  geom_line(alpha = .25) +
  geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
  xlab("Course") + ylab("ID") + labs(color = "Class Assignment")

ggplot(df_5fac, aes(x=courseid, y=id, group=altid, color=classn)) +
  geom_line(alpha = .25) +
  geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
  xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))

Six-Trajectory Model

classes_6fac <- as.data.frame(model6_grid$pprob[,1:2])

df_6fac <- merge(df4, classes_6fac, by = "altid")
table(classes_6fac$class)
## 
##  1  2  3  4  5  6 
## 26 11 37 15  2  2
df_6fac$classn[df_6fac$class == "1"] <- "Class 1, n = 26"
df_6fac$classn[df_6fac$class == "2"] <- "Class 2, n = 11"
df_6fac$classn[df_6fac$class == "3"] <- "Class 3, n = 37"
df_6fac$classn[df_6fac$class == "4"] <- "Class 4, n = 15"
df_6fac$classn[df_6fac$class == "5"] <- "Class 5, n = 2"
df_6fac$classn[df_6fac$class == "6"] <- "Class 6, n = 2"

df_6fac$classn <- as.factor(df_6fac$classn)

ggplot(df_6fac, aes(x=courseid, y=id, group=altid, color=classn)) +
  geom_line(alpha = .25) +
  geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
  xlab("Course") + ylab("ID") + labs(color = "Class Assignment")

ggplot(df_6fac, aes(x=courseid, y=id, group=altid, color=classn)) +
  geom_line(alpha = .25) +
  geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
  xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))

Seven-Trajectory Model

classes_7fac <- as.data.frame(model7_grid$pprob[,1:2])

df_7fac <- merge(df4, classes_7fac, by = "altid")
table(classes_7fac$class)
## 
##  1  2  3  4  5  6  7 
##  3 31 13  2 13 15 16
df_7fac$classn[df_7fac$class == "1"] <- "Class 1, n = 3"
df_7fac$classn[df_7fac$class == "2"] <- "Class 2, n = 31"
df_7fac$classn[df_7fac$class == "3"] <- "Class 3, n = 13"
df_7fac$classn[df_7fac$class == "4"] <- "Class 4, n = 2"
df_7fac$classn[df_7fac$class == "5"] <- "Class 5, n = 13"
df_7fac$classn[df_7fac$class == "6"] <- "Class 6, n = 15"
df_7fac$classn[df_7fac$class == "7"] <- "Class 7, n = 16"

df_7fac$classn <- as.factor(df_7fac$classn)

ggplot(df_7fac, aes(x=courseid, y=id, group=altid, color=classn)) +
  geom_line(alpha = .25) +
  geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
  xlab("Course") + ylab("ID") + labs(color = "Class Assignment")

ggplot(df_7fac, aes(x=courseid, y=id, group=altid, color=classn)) +
  geom_line(alpha = .25) +
  geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
  xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))

Eight-Trajectory Model

classes_8fac <- as.data.frame(model8_grid$pprob[,1:2])

df_8fac <- merge(df4, classes_8fac, by = "altid")
table(classes_8fac$class)
## 
##  1  2  3  4  5  6  7  8 
## 11  2 13 28 17 16  3  3
df_8fac$classn[df_8fac$class == "1"] <- "Class 1, n = 11"
df_8fac$classn[df_8fac$class == "2"] <- "Class 2, n = 2"
df_8fac$classn[df_8fac$class == "3"] <- "Class 3, n = 13"
df_8fac$classn[df_8fac$class == "4"] <- "Class 4, n = 28"
df_8fac$classn[df_8fac$class == "5"] <- "Class 5, n = 17"
df_8fac$classn[df_8fac$class == "6"] <- "Class 6, n = 16"
df_8fac$classn[df_8fac$class == "7"] <- "Class 7, n = 3"
df_8fac$classn[df_8fac$class == "8"] <- "Class 8, n = 3"

df_8fac$classn <- as.factor(df_8fac$classn)

ggplot(df_8fac, aes(x=courseid, y=std_id, group=altid, color=classn)) +
  geom_line(alpha = .25) +
  geom_smooth(aes(group=class), method="loess", size=2, se=F, span = 1) +
  xlab("Course") + ylab("ID") + labs(color = "Class Assignment")

ggplot(df_8fac, aes(x=courseid, y=std_id, group=altid, color=classn)) +
  geom_line(alpha = .25) +
  geom_smooth(aes(group=class), method="loess", size=2, se=T, span = 1) +
  xlab("Course") + ylab("ID") + labs(color = "Class Assignment") + facet_grid(cols = vars(classn))