library(psych) #for describe()
library(corrplot) #for corrplot()
library(nFactors) #for parallel analysis and scree test (number of factors)
library(naniar) #for gg_miss_upset()
library(mclust) #for LPA
library(kableExtra) #for tables
library(tidyr) #for gather() (long-format reshaping)
library(ggplot2) #for graphs
library(Rmisc) #for summarySE()
library(reshape2) #for data transformation
library(fmsb) #for spider plot
library(dplyr) #for data transformation
import <- read.csv(file="finaldata.csv")
# limited to finished and doctoral students
import2 <- subset(import, Q71 == 1 & Finished == 1)
d <- subset(import2, select=c(X, grep("Q3.1", colnames(import2)),
                              grep("Q4.1", colnames(import2)),
                              grep("Q5.1", colnames(import2))))
rm(import, import2)
# use the gg_miss_upset() command for a visualization of your missing data
gg_miss_upset(d[-1], nsets = 45)
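# Little's MCAR test via naniar::mcar_test()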
mcar_test(d[-1])
## # A tibble: 1 × 4
## statistic df p.value missing.patterns
## <dbl> <dbl> <dbl> <int>
## 1 544. 514 0.178 14
# the test is not significant, so the data can be treated as missing completely at random (MCAR)
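# composite scores: item means for the nine subscales (three item blocks, Q3.1/Q4.1/Q5.1, by three subscales: rec, pc, int)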
d$s_rec <- (d$Q3.1_2 + d$Q3.1_3 + d$Q3.1_4 + d$Q3.1_5 + d$Q3.1_6 + d$Q3.1_7)/6
d$s_pc <- (d$Q3.1_11 + d$Q3.1_12 + d$Q3.1_13 + d$Q3.1_14 + d$Q3.1_15)/5
d$s_int <- (d$Q3.1_8 + d$Q3.1_9 + d$Q3.1_10)/3
d$e_rec <- (d$Q4.1_2 + d$Q4.1_3 + d$Q4.1_4 + d$Q4.1_6 + d$Q4.1_7)/5
d$e_pc <- (d$Q4.1_11 + d$Q4.1_12 + d$Q4.1_13 + d$Q4.1_14 + d$Q4.1_10)/5
d$e_int <- (d$Q4.1_8 + d$Q4.1_9 + d$Q4.1_5)/3
d$r_rec <- (d$Q5.1_2 + d$Q5.1_3 + d$Q5.1_4 + d$Q5.1_5 + d$Q5.1_6 + d$Q5.1_7)/6
d$r_pc <- (d$Q5.1_16 + d$Q5.1_12 + d$Q5.1_13 + d$Q5.1_14 + d$Q5.1_15)/5
d$r_int <- (d$Q5.1_8 + d$Q5.1_9 + d$Q5.1_10 + d$Q5.1_11)/4
corrout <- corr.test(subset(d, select=c(s_rec, s_pc, s_int, e_rec, e_pc, e_int, r_rec, r_pc, r_int)))
corrplot.mixed(corrout$r)
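# parallel analysis and scree test (nFactors) to decide how many factors to extract; the EFA with promax rotation follows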
df <- na.omit(subset(d, select=c(s_rec, s_pc, s_int, e_rec, e_pc, e_int, r_rec, r_pc, r_int)))
ev <- eigen(cor(df)) # get eigenvalues
ap <- parallel(subject=nrow(df),var=ncol(df),rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)
EFA <- factanal(df, factors = 2, rotation = "promax", cutoff = 0.3)
print(EFA, digits=3, cutoff=.3, sort=TRUE)
##
## Call:
## factanal(x = df, factors = 2, rotation = "promax", cutoff = 0.3)
##
## Uniquenesses:
## s_rec s_pc s_int e_rec e_pc e_int r_rec r_pc r_int
## 0.658 0.534 0.668 0.458 0.311 0.294 0.460 0.390 0.509
##
## Loadings:
## Factor1 Factor2
## s_rec 0.626
## s_pc 0.596
## s_int 0.569
## r_rec 0.754
## r_pc 0.753
## r_int 0.711
## e_rec 0.736
## e_pc 0.787
## e_int 0.863
##
## Factor1 Factor2
## SS loadings 2.726 1.964
## Proportion Var 0.303 0.218
## Cumulative Var 0.303 0.521
##
## Factor Correlations:
## Factor1 Factor2
## Factor1 1.000 0.391
## Factor2 0.391 1.000
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 742.57 on 19 degrees of freedom.
## The p-value is 3.38e-145
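# in the two-factor solution the s_* and r_* composites load on Factor 1 and the e_* composites on Factor 2,
# so the Q3.1 and Q5.1 items are combined into joint sr_* composites below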
d$sr_rec <- (d$Q3.1_2 + d$Q3.1_3 + d$Q3.1_4 + d$Q3.1_5 + d$Q3.1_6 + d$Q3.1_7 +
d$Q5.1_2 + d$Q5.1_3 + d$Q5.1_4 + d$Q5.1_5 + d$Q5.1_6 + d$Q5.1_7)/12
d$sr_pc <- (d$Q3.1_11 + d$Q3.1_12 + d$Q3.1_13 + d$Q3.1_14 + d$Q3.1_15 +
d$Q5.1_16 + d$Q5.1_12 + d$Q5.1_13 + d$Q5.1_14 + d$Q5.1_15)/10
d$sr_int <- (d$Q3.1_8 + d$Q3.1_9 + d$Q3.1_10 +
d$Q5.1_8 + d$Q5.1_9 + d$Q5.1_10 + d$Q5.1_11)/7
d$e_rec <- (d$Q4.1_2 + d$Q4.1_3 + d$Q4.1_4 + d$Q4.1_6 + d$Q4.1_7)/5
d$e_pc <- (d$Q4.1_11 + d$Q4.1_12 + d$Q4.1_13 + d$Q4.1_14 + d$Q4.1_10)/5
d$e_int <- (d$Q4.1_8 + d$Q4.1_9 + d$Q4.1_5)/3
d$s <- (d$Q3.1_2 + d$Q3.1_3 + d$Q3.1_4 + d$Q3.1_5 + d$Q3.1_6 + d$Q3.1_7 +
d$Q3.1_11 + d$Q3.1_12 + d$Q3.1_13 + d$Q3.1_14 + d$Q3.1_15 +
d$Q3.1_8 + d$Q3.1_9 + d$Q3.1_10)/14
d$r <- (d$Q5.1_2 + d$Q5.1_3 + d$Q5.1_4 + d$Q5.1_5 + d$Q5.1_6 + d$Q5.1_7 +
d$Q5.1_16 + d$Q5.1_12 + d$Q5.1_13 + d$Q5.1_14 + d$Q5.1_15 +
d$Q5.1_8 + d$Q5.1_9 + d$Q5.1_10 + d$Q5.1_11)/15
d$e <- (d$Q4.1_2 + d$Q4.1_3 + d$Q4.1_4 + d$Q4.1_6 + d$Q4.1_7 +
d$Q4.1_11 + d$Q4.1_12 + d$Q4.1_13 + d$Q4.1_14 + d$Q4.1_10 +
        d$Q4.1_8 + d$Q4.1_9 + d$Q4.1_5)/13
d1 <- na.omit(subset(d, select=c(X, sr_rec, sr_pc, sr_int, e_rec, e_pc, e_int)))
m_dist <- mahalanobis(d1[-c(1)], colMeans(d1[-c(1)]), cov(d1[-c(1)]))
d1$MD <- round(m_dist, 1)
plot(d1$MD)
describe(m_dist)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1160 5.99 5.73 3.95 4.84 2.7 0.23 38.64 38.41 2.43 7.05 0.17
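# flag multivariate outliers: cases whose squared Mahalanobis distance exceeds the .999 chi-square quantile are dropped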
cut <- qchisq(.999, df=(ncol(d1)-1))
abline(a=cut, b=0, col="red")
d1$outlier <- F
d1$outlier[d1$MD > cut] <- T
table(d1$outlier)
##
## FALSE TRUE
## 1133 27
d1 <- subset(d1, outlier == F)
d2 <- subset(d, X %in% d1$X)
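# z-standardize the composites; the standardized s, r, and e scores serve as the LPA indicators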
d2$sr_rec_std <- as.vector(scale(d2$sr_rec, center=T, scale=T))
d2$sr_pc_std <- as.vector(scale(d2$sr_pc, center=T, scale=T))
d2$sr_int_std <- as.vector(scale(d2$sr_int, center=T, scale=T))
d2$e_rec_std <- as.vector(scale(d2$e_rec, center=T, scale=T))
d2$e_pc_std <- as.vector(scale(d2$e_pc, center=T, scale=T))
d2$e_int_std <- as.vector(scale(d2$e_int, center=T, scale=T))
d2$s_std <- as.vector(scale(d2$s, center=T, scale=T))
d2$r_std <- as.vector(scale(d2$r, center=T, scale=T))
d2$e_std <- as.vector(scale(d2$e, center=T, scale=T))
desc <- describe(subset(d2, select=c(s, r, e, s_std, r_std, e_std)))
desc
## vars n mean sd median trimmed mad min max range skew kurtosis
## s 1 1133 4.06 0.60 4.07 4.10 0.64 1.00 5.00 4.00 -0.60 0.49
## r 2 1133 4.26 0.61 4.33 4.32 0.69 1.13 5.00 3.87 -0.73 0.28
## e 3 1133 3.20 0.52 3.24 3.25 0.61 1.00 3.82 2.82 -0.77 0.40
## s_std 4 1133 0.00 1.00 0.01 0.06 1.06 -5.11 1.56 6.67 -0.60 0.49
## r_std 5 1133 0.00 1.00 0.12 0.10 1.13 -5.13 1.21 6.34 -0.73 0.28
## e_std 6 1133 0.00 1.00 0.06 0.10 1.18 -4.27 1.20 5.47 -0.77 0.40
## se
## s 0.02
## r 0.02
## e 0.02
## s_std 0.03
## r_std 0.03
## e_std 0.03
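# load previously estimated LPA objects (clustCombi output, BLRT, BIC, ICL, and models m1-m9);
# the time-consuming estimation calls are kept below as comments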
load(file="LPAv3.RData")
d_std <- subset(d2, select=c(X, s_std, r_std, e_std))
d_raw <- subset(d2, select=c(X, s, r, e))
#entropy plot - elbow indicates number of classes that should be considered
d <- subset(d_std, select=-c(1))
# output <- clustCombi(data = d)
output2 <- clustCombiOptim(output, plot=TRUE)
summary(output)
## ----------------------------------------------------
## Combining Gaussian mixture components for clustering
## ----------------------------------------------------
##
## Mclust model name: VII
## Number of components: 8
##
## Combining steps:
##
## Step | Classes combined at this step | Class labels after this step
## -------|-------------------------------|-----------------------------
## 0 | --- | 1 2 3 4 5 6 7 8
## 1 | 1 & 6 | 1 2 3 4 5 7 8
## 2 | 1 & 5 | 1 2 3 4 7 8
## 3 | 1 & 3 | 1 2 4 7 8
## 4 | 1 & 2 | 1 4 7 8
## 5 | 1 & 7 | 1 4 8
## 6 | 1 & 4 | 1 8
## 7 | 1 & 8 | 1
entPlot(output$MclustOutput$z, output$combiM, abc = c("normalized"))
entPlot(output$MclustOutput$z, output$combiM, abc = c("standard"))
#calculate and plot LRT scores
#BLRT (bootstrap likelihood ratio test): compares the same model with different numbers of classes; a significant result indicates that the model with more classes fits better than the one with fewer, so it should be significant
# lrt <- mclustBootstrapLRT(d, mclust.options("emModelNames"))
print(lrt)
## -------------------------------------------------------------
## Bootstrap sequential LRT for the number of mixture components
## -------------------------------------------------------------
## Model = EII
## Replications = 999
## LRTS bootstrap p-value
## 1 vs 2 4.504357e+02 0.001
## 2 vs 3 7.954769e+01 0.001
## 3 vs 4 1.784418e+02 0.001
## 4 vs 5 7.649047e+01 0.001
## 5 vs 6 5.237173e+01 0.001
## 6 vs 7 5.490722e+01 0.001
## 7 vs 8 4.971794e+01 0.001
## 8 vs 9 1.254765e-03 0.803
#calculate and plot BIC scores
#BIC (and the sample-adjusted BIC computed later): in the LPA literature (and in Mplus), the lowest value indicates the best-fitting model; mclust defines BIC so that higher values indicate better fit
# BIC <- mclustBIC(d, G = 1:9, mclust.options("emModelNames"))
plot.mclustBIC(BIC)
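# A quick check of the BIC sign convention described above (not part of the original
# pipeline): mclust defines BIC = 2*logLik - df*log(n), i.e. -1 times the textbook
# BIC = -2*logLik + df*log(n). The illustrative names ll, df_m, and n_obs hold values
# copied from the 2-profile summary printed further below.
ll <- -4285.58; df_m <- 17; n_obs <- 1133
-2 * ll + df_m * log(n_obs)  # ~8690.7, i.e. -1 * the BIC mclust reports (-8690.714)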
#calculate ICL scores
#ICL is BIC penalized by the estimated mean entropy; less suited to data with overlapping clusters; in mclust, higher ICL indicates better fit
# icl <- mclustICL(d, G = 1:9, mclust.options("emModelNames"))
plot.mclustICL(icl)
# # calculate fit indices for the model-comparison table
# n <- 1
# p <- 1
# q <- 1
# for (i in 1:9) {
# #run model
# varname <- paste("m",n,sep="")
# m <- Mclust(d, G = n, mclust.options("emModelNames"))
# assign(varname,m)
# #extract ll
# varname3 <- paste("logl",n,sep="")
# ll <- eval(parse(text=paste("m",n,"$loglik",sep = "")))
# assign(varname3,ll)
# #calc free parameters
# #dimension of the data = number of original variables in model
# #EII 1
# #VII G
# #EEI p
# #free covariance parameters, VEI p + G - 1
# #EVI pG - G + 1
# #VVI pG
# #EEE p(p + 1)/2
# #free covariance parameters, EEV Gp(p + 1)/2 - (G - 1)p
# #free covariance parameters, VEV Gp(p + 1)/2 - (G - 1)(p - 1)
# #VVV Gp(p + 1)/2
# #EVE p(p + 1)/2 + (G - 1)(p - 1)
# #VVE p(p + 1)/2 + (G - 1)p
# #VEE p(p + 1)/2 + (G - 1)
# #EVV Gp(p + 1)/2 - (G - 1)
# if (m$modelName == "EII") {
# freepar <- 1
# } else if (m$modelName == "VII") {
# freepar <- m$G
# } else if (m$modelName == "EEI") {
# freepar <- m$d
# } else if (m$modelName == "VEI") {
# freepar <- m$d + m$G - 1
# } else if (m$modelName == "EVI") {
# freepar <- m$d*m$G - m$G + 1
# } else if (m$modelName == "VVI") {
# freepar <- m$d*m$G
# } else if (m$modelName == "EEE") {
# freepar <- m$d*(m$d + 1)/2
# } else if (m$modelName == "EEV") {
# freepar <- m$G*m$d*(m$d + 1)/2 - (m$G - 1)*m$d
# } else if (m$modelName == "VEV") {
# freepar <- m$G*m$d*(m$d + 1)/2 - (m$G - 1)*(m$d - 1)
# } else if (m$modelName == "VVV") {
# freepar <- m$G*m$d*(m$d + 1)/2
# } else if (m$modelName == "XXX") {
# freepar <- "-"
# } else if (m$modelName == "EVE") {
# freepar <- m$d*(m$d + 1)/2 + (m$G - 1)*(m$d - 1)
# } else if (m$modelName == "VVE") {
# freepar <- m$d*(m$d + 1)/2 + (m$G - 1)*m$d
# } else if (m$modelName == "VEE") {
# freepar <- m$d*(m$d + 1)/2 + (m$G - 1)
# } else if (m$modelName == "EVV") {
# freepar <- m$G*m$d*(m$d + 1)/2 - (m$G - 1)
# } else {
# freepar <- 999
# }
# varname4 <- paste("freep",n,sep="")
# assign(varname4,freepar)
# #calculate posterior classification probabilities
# m <- eval(parse(text=paste("m",n,sep="")))
# uncer <- m$uncertainty
# posterior_prob <- 1 - round(uncer, 5)
# #calculate entropy (operationalized here as the mean posterior classification probability)
# ent <- round(mean(posterior_prob), 3)
# varname2 <- paste("ent",n,sep="")
# assign(varname2, ent)
# #AIC <- (2 * mcm$df - 2 * mcm$loglik)
# varname5 <- paste("aic",n,sep="")
# aic <- (2 * m$df - 2 * m$loglik)
# assign(varname5,aic)
# #CAIC <- (((log(mcm$n) + 1) * mcm$df) - 2 * mcm$loglik)
# varname6 <- paste("caic",n,sep="")
# caic <- (((log(m$n) + 1) * m$df) - 2 * m$loglik)
# assign(varname6,caic)
# #SABIC <- (mcm$df * log((mcm$n + 2)/24) - 2 * mcm$loglik)
# varname7 <- paste("sabic",n,sep="")
# sabic <- (m$df * log((m$n + 2)/24) - 2 * m$loglik)
# assign(varname7, sabic)
# #iterate
# n <- n+1
# p <- p+1
# #grab group sizes
# grps <- table(eval(parse(text=paste("m",q,"$classification",sep=""))))
# tg <- as.list(sort(unique(eval(parse(text=paste("m",q,"$classification",sep=""))))))
# t <- 1
# #calculate group percents
# for (j in tg) {
# perc <- grps[[t]] / (eval(parse(text=paste("m",q,"$n[1]",sep=""))))
# varname <- paste("m",q,"perc",t,sep="")
# assign(varname,perc)
# t <- t + 1
# }
# #combine all group percents into one vector
# mp <- mapply(get, grep(paste("m",q,"perc",sep=""), ls(), value=T))
# varname <- paste("allperc",q,sep="")
# assign(varname,mp)
# q <- q + 1
# }
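# A small worked check of the free-parameter formulas listed in the commented loop
# above (illustrative names p_ind and G_prof are mine): for a VEV model with
# p = 3 indicators and G = 2 profiles,
# Gp(p + 1)/2 - (G - 1)(p - 1) = 2*3*4/2 - 1*2 = 10,
# which matches the 10 free parameters reported for the 2-profile model in the table below.
p_ind <- 3; G_prof <- 2
G_prof * p_ind * (p_ind + 1) / 2 - (G_prof - 1) * (p_ind - 1)  # 10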
#assemble the fit indices into the comparison table
proflist <- sprintf("Profiles %d", 1:9)
modlist <- rbind(m1$modelName, m2$modelName, m3$modelName, m4$modelName, m5$modelName, m6$modelName, m7$modelName, m8$modelName, m9$modelName)
#fix BLRT columns
blank <- "-"
lrtobs <- "-"
lrtobs <- append(lrtobs, round(lrt$obs[1:7], digits = 2))
lrtobs <- append(lrtobs, blank)
lrtp <- "-"
lrtp <- append(lrtp, lrt$p.value[1:7])
lrtp <- append(lrtp, blank)
bicval <- round(BIC[1:9], digits = 2)
iclval <- round(icl[1:9], digits = 2)
listent <- round(rbind(ent1,ent2,ent3,ent4,ent5,ent6,ent7,ent8,ent9), digits = 2)
listlog <- round(rbind(logl1,logl2,logl3,logl4,logl5,logl6,logl7,logl8,logl9), digits = 2)
listfreep <- rbind(freep1,freep2,freep3,freep4,freep5,freep6,freep7,freep8,freep9)
listaic <- round(rbind(aic1,aic2,aic3,aic4,aic5,aic6,aic7,aic8,aic9), digits = 2)
listcaic <- round(rbind(caic1,caic2,caic3,caic4,caic5,caic6,caic7,caic8,caic9), digits = 2)
listsabic <- round(rbind(sabic1,sabic2,sabic3,sabic4,sabic5,sabic6,sabic7,sabic8,sabic9), digits = 2)
listperc <- rbind(
sum(allperc1 < .05),
sum(allperc2 < .05),
sum(allperc3 < .05),
sum(allperc4 < .05),
sum(allperc5 < .05),
sum(allperc6 < .05),
sum(allperc7 < .05),
sum(allperc8 < .05),
sum(allperc9 < .05))
indicestab <- cbind(modlist,listlog,listfreep,listaic,listcaic,bicval,listsabic,iclval,lrtobs,lrtp,listent,listperc)
colnames(indicestab) <- c("Model","LL","#FP","AIC","CAIC","BIC","SABIC","ICL","BLRTS","BLRTS p","Entropy","<5%")
rownames(indicestab) <- proflist
indicestab %>%
kbl() %>%
kable_styling(font_size = 11)
|  | Model | LL | #FP | AIC | CAIC | BIC | SABIC | ICL | BLRTS | BLRTS p | Entropy | <5% |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Profiles 1 | XXX | -4519.04 | - | 9056.07 | 9110.37 | -9671.07 | 9072.78 | -9671.07 | - | - | 1 | 0 |
| Profiles 2 | VEV | -4285.58 | 10 | 8605.16 | 8707.71 | -9248.77 | 8636.72 | -9541 | 450.44 | 0.001 | 0.98 | 0 |
| Profiles 3 | VEV | -4218.65 | 14 | 8487.3 | 8638.12 | -9197.35 | 8533.71 | -9535.1 | 79.55 | 0.001 | 0.84 | 0 |
| Profiles 4 | VEV | -4128.84 | 18 | 8323.68 | 8522.75 | -9047.04 | 8384.94 | -9529.21 | 178.44 | 0.001 | 0.77 | 0 |
| Profiles 5 | VEV | -4123.64 | 5 | 8295.29 | 8440.07 | -8998.68 | 8339.84 | -9456.41 | 76.49 | 0.001 | 0.85 | 1 |
| Profiles 6 | VEV | -4099.49 | 6 | 8256.98 | 8431.93 | -8974.44 | 8310.82 | -9407.92 | 52.37 | 0.001 | 0.84 | 2 |
| Profiles 7 | VEV | -4080.04 | 7 | 8228.07 | 8433.18 | -8947.66 | 8291.19 | -9448.65 | 54.91 | 0.001 | 0.79 | 2 |
| Profiles 8 | VEV | -4045.21 | 8 | 8168.42 | 8403.69 | -8926.07 | 8240.82 | -9508.31 | 49.72 | 0.001 | 0.8 | 3 |
| Profiles 9 | VEV | -4009.91 | 46 | 8181.82 | 8670.46 | -8954.2 | 8332.18 | -9648.95 | - | - | 0.76 | 2 |
# save.image(file="LPAv3.RData")
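# viz2(): prints the mclust summary for a fitted model, then plots class means of the standardized
# indicators (with SE bars), density plots of each indicator by class, per-class jittered
# scatterplots, and the density of posterior classification probabilities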
viz2 <- function(orig){
mcm <- orig
print(summary.Mclust(mcm))
d_std$class <- mcm$classification
df_long <- gather(d_std, key = "variable", value = "value", s_std, r_std, e_std)
summ <- summarySE(df_long, measurevar="value", groupvars = c("variable","class"), na.rm = T) #the summarySE() command creates a table with the information we need for our plot
# Create a barplot of means for each column by class
means <- ggplot(summ, aes(x = class, y = value, fill = variable)) +
geom_bar(stat = "summary", fun = "mean", position = "dodge") +
geom_errorbar(aes(ymin=value-se, ymax=value+se),
width=.2,
position=position_dodge(.9))
print(means)
df_long$class <- as.factor(df_long$class)
dist <- df_long %>%
ggplot(aes(x=value, fill=class)) +
geom_density(alpha=0.4, bw=.20) +
geom_vline(xintercept=0, size=.5, color="black") +
facet_wrap(~variable) +
xlim(-4.5,2)
print(dist)
vars <- unique(df_long$variable)
n <- 1
for (i in 1:length(vars)) {
print(vars[n])
print(subset(df_long, variable == vars[n]) %>%
ggplot(aes(x=value, fill=class)) +
geom_density(alpha=0.4, bw=.20) +
geom_vline(xintercept=0, size=.5, color="black") +
facet_wrap(~class) +
xlim(-4.5,2))
n <- n + 1
}
n <- 1
for (i in 1:length(unique(df_long$class))) {
df_long$class_spec <- 0
df_long$class_spec[d_std$class == n] <- 1
df_long$class_spec <- as.factor(df_long$class_spec)
scatter <- ggplot(df_long, aes(x = variable, y = value, color = class_spec)) +
geom_jitter()
print(paste("Class",n))
print(scatter)
n <- n + 1
}
class <- mcm$classification
uncer <- mcm$uncertainty
posterior_prob <- 1 - round(uncer, 5)
density <- plot(density(posterior_prob))
print(density)
}
viz2(m2)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 2 components:
##
## log-likelihood n df BIC ICL
## -4285.58 1133 17 -8690.714 -8738.077
##
## Clustering table:
## 1 2
## 1001 132
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## NULL
viz2(m3)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 3 components:
##
## log-likelihood n df BIC ICL
## -4218.65 1133 25 -8613.116 -9068.174
##
## Clustering table:
## 1 2 3
## 634 374 125
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## NULL
viz2(m4)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 4 components:
##
## log-likelihood n df BIC ICL
## -4128.839 1133 33 -8489.754 -9145.533
##
## Clustering table:
## 1 2 3 4
## 247 251 516 119
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## [1] "Class 4"
## NULL
viz2(m5)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VII (spherical, varying volume) model with 5 components:
##
## log-likelihood n df BIC ICL
## -4123.645 1133 24 -8416.073 -8812.255
##
## Clustering table:
## 1 2 3 4 5
## 226 179 630 65 33
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## [1] "Class 4"
## [1] "Class 5"
## NULL
viz2(m6)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VII (spherical, varying volume) model with 6 components:
##
## log-likelihood n df BIC ICL
## -4099.492 1133 29 -8402.93 -8845.879
##
## Clustering table:
## 1 2 3 4 5 6
## 231 198 554 78 39 33
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## [1] "Class 4"
## [1] "Class 5"
## [1] "Class 6"
## NULL
viz2(m7)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VII (spherical, varying volume) model with 7 components:
##
## log-likelihood n df BIC ICL
## -4080.037 1133 34 -8399.183 -9003.025
##
## Clustering table:
## 1 2 3 4 5 6 7
## 245 184 175 180 266 50 33
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## [1] "Class 4"
## [1] "Class 5"
## [1] "Class 6"
## [1] "Class 7"
## NULL
viz2(m8)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VII (spherical, varying volume) model with 8 components:
##
## log-likelihood n df BIC ICL
## -4045.21 1133 39 -8364.693 -8957.662
##
## Clustering table:
## 1 2 3 4 5 6 7 8
## 251 212 131 24 168 264 50 33
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## [1] "Class 4"
## [1] "Class 5"
## [1] "Class 6"
## [1] "Class 7"
## [1] "Class 8"
## NULL
The six-class model seems strongest so far: it balances the number of classes against entropy. It does have two groups with small numbers of participants, but they appear distinct, and the other fit indices are good.
d_std$class <- as.factor(m6$classification)
d_std$pp <- 1 - round(m6$uncertainty, 3)
d_fin <- d_std
d_fin$class_fin <- d_fin$class
# d_fin$class_fin <- "un"
# d_fin$class_fin[d_fin$pp > .69] <- d_fin$class[d_fin$pp > .69]
table(d_fin$class_fin, useNA = "always")
##
## 1 2 3 4 5 6 <NA>
## 231 198 554 78 39 33 0
table(d_fin$class_fin, d_fin$class, useNA = "always")
##
## 1 2 3 4 5 6 <NA>
## 1 231 0 0 0 0 0 0
## 2 0 198 0 0 0 0 0
## 3 0 0 554 0 0 0 0
## 4 0 0 0 78 0 0 0
## 5 0 0 0 0 39 0 0
## 6 0 0 0 0 0 33 0
## <NA> 0 0 0 0 0 0 0
table(d_fin$class_fin, round(d_fin$pp, 1), useNA = "always")
##
## 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1 <NA>
## 1 0 2 9 26 21 26 36 111 0
## 2 0 0 11 15 1 17 44 110 0
## 3 1 3 32 53 76 113 234 42 0
## 4 0 2 4 6 11 10 14 31 0
## 5 0 1 4 4 6 13 10 1 0
## 6 0 0 0 0 0 0 7 26 0
## <NA> 0 0 0 0 0 0 0 0 0
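# re-plot profile means and indicator distributions using the final class assignment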
df_long <- gather(d_fin, key = "variable", value = "value", s_std, r_std, e_std)
summ <- summarySE(df_long, measurevar="value", groupvars = c("variable","class_fin"), na.rm = T) #the summarySE() command creates a table with the information we need for our plot
# Create a barplot of means for each column by class
ggplot(summ, aes(x = class_fin, y = value, fill = variable)) +
geom_bar(stat = "summary", fun = "mean", position = "dodge") +
geom_errorbar(aes(ymin=value-se, ymax=value+se),
width=.2,
position=position_dodge(.9))
df_long$class_fin <- as.factor(df_long$class_fin)
df_long %>%
ggplot(aes(x=value, fill=class)) +
geom_density(alpha=0.4, bw=.20) +
geom_vline(xintercept=0, size=.5, color="black") +
facet_wrap(~variable) +
xlim(-4.5,2)
vars <- unique(df_long$variable)
n <- 1
for (i in 1:length(vars)) {
print(vars[n])
print(subset(df_long, variable == vars[n]) %>%
ggplot(aes(x=value, fill=class_fin)) +
geom_density(alpha=0.4, bw=.20) +
geom_vline(xintercept=0, size=.5, color="black") +
facet_wrap(~class_fin) +
xlim(-4.5,2))
n <- n + 1
}
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
n <- 1
for (i in 1:length(unique(df_long$class_fin))) {
df_long$class_spec <- 0
df_long$class_spec[df_long$class_fin == n] <- 1
df_long$class_spec <- as.factor(df_long$class_spec)
scatter <- ggplot(df_long, aes(x = variable, y = value, color = class_spec)) +
geom_jitter()
print(paste("Class",n))
print(scatter)
n <- n + 1
}
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## [1] "Class 4"
## [1] "Class 5"
## [1] "Class 6"
n <- 1
for (i in 1:length(unique(na.omit(df_long$class_fin)))) {
pp <- subset(d_fin, select=c(pp), class_fin == n)
density <- plot(density(pp$pp))
print(paste("Class",n))
print(density)
n <- n + 1
}
## [1] "Class 1"
## NULL
## [1] "Class 2"
## NULL
## [1] "Class 3"
## NULL
## [1] "Class 4"
## NULL
## [1] "Class 5"
## NULL
## [1] "Class 6"
## NULL
# classes <- cbind.data.frame(
# subset(d_fin, select=c(X, s_std,r_std,e_std)),
# subset(d_raw, select=c(s,r,e)),
# subset(d_fin, select=c(class,class_fin,pp)))
#
# write.csv(classes, file="classes.csv", row.names = F)