1 Load Libraries

library(psych) #for describe()
library(corrplot) #for corrplot()
library(nFactors) #for factor analysis
library(naniar) #for gg_miss_upset()
library(mclust) #for LPA
library(kableExtra) #for tables
library(tidyr) #for graphs
library(ggplot2) #for graphs
library(Rmisc) #for summarySE()
library(reshape2) #for data transformation
library(fmsb) #for spider plot
library(dplyr) #for data transformation

2 Load Data

import <- read.csv(file="finaldata.csv")
# limit to finished responses from doctoral students
import2 <- subset(import, Q71 == 1 & Finished == 1)

d <- subset(import2, select=c(X, grep("Q3.1", colnames(import2)),
                              grep("Q4.1", colnames(import2)),
                              grep("Q5.1", colnames(import2))))

rm(import, import2)

3 Check Missing Data

# use the gg_miss_upset() command for a visualization of your missing data
gg_miss_upset(d[-1], nsets = 45)

mcar_test(d[-1])
## # A tibble: 1 × 4
##   statistic    df p.value missing.patterns
##       <dbl> <dbl>   <dbl>            <int>
## 1      544.   514   0.178               14
# the test is not significant, so the data are consistent with missing completely at random (MCAR)
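
If the test had come back significant, a useful next step is a per-variable summary of missingness; naniar's miss_var_summary() provides one. A minimal sketch on the same data:

# counts and percentages of missing values per item
miss_var_summary(d[-1])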

4 Create Composites

d$s_rec <- (d$Q3.1_2 + d$Q3.1_3 + d$Q3.1_4 + d$Q3.1_5 + d$Q3.1_6 + d$Q3.1_7)/6
d$s_pc <- (d$Q3.1_11 + d$Q3.1_12 + d$Q3.1_13 + d$Q3.1_14 + d$Q3.1_15)/5
d$s_int <- (d$Q3.1_8 + d$Q3.1_9 + d$Q3.1_10)/3
d$e_rec <- (d$Q4.1_2 + d$Q4.1_3 + d$Q4.1_4 + d$Q4.1_6 + d$Q4.1_7)/5
d$e_pc <- (d$Q4.1_11 + d$Q4.1_12 + d$Q4.1_13 + d$Q4.1_14 + d$Q4.1_10)/5
d$e_int <- (d$Q4.1_8 + d$Q4.1_9 + d$Q4.1_5)/3
d$r_rec <- (d$Q5.1_2 + d$Q5.1_3 + d$Q5.1_4 + d$Q5.1_5 + d$Q5.1_6 + d$Q5.1_7)/6
d$r_pc <- (d$Q5.1_16 + d$Q5.1_12 + d$Q5.1_13 + d$Q5.1_14 + d$Q5.1_15)/5
d$r_int <- (d$Q5.1_8 + d$Q5.1_9 + d$Q5.1_10 + d$Q5.1_11)/4
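
The sums above return NA whenever any item is missing. An equivalent pattern using rowMeans() makes that explicit and, if desired, allows averaging over the observed items instead. A sketch for one composite (the item list matches s_rec above):

# same composite via rowMeans(); set na.rm = TRUE to average over observed items only
s_rec_items <- c("Q3.1_2","Q3.1_3","Q3.1_4","Q3.1_5","Q3.1_6","Q3.1_7")
d$s_rec <- rowMeans(d[, s_rec_items], na.rm = FALSE)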

5 Correlation Matrix

# columns 47:55 hold the nine composites created above
corrout <- corr.test(subset(d, select=c(47:55)))
corrplot.mixed(corrout$r)
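
corr.test() also returns p-values, so significance can be overlaid on the plot. A sketch using corrplot()'s p.mat argument:

# mark correlations that do not reach p < .05
corrplot(corrout$r, p.mat = corrout$p, sig.level = .05)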

6 Factor Analysis

df <- na.omit(subset(d, select=c(47:55))) # complete cases on the nine composites

ev <- eigen(cor(df)) # get eigenvalues
ap <- parallel(subject=nrow(df),var=ncol(df),rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)
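
psych (already loaded) offers a parallel-analysis cross-check on the number of factors. A minimal sketch:

# scree plot with parallel analysis, maximum-likelihood factoring
fa.parallel(df, fm = "ml")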

EFA <- factanal(df, factors = 2, rotation = "promax")
print(EFA, digits=3, cutoff=.3, sort=TRUE)
## 
## Call:
## factanal(x = df, factors = 2, rotation = "promax")
## 
## Uniquenesses:
## s_rec  s_pc s_int e_rec  e_pc e_int r_rec  r_pc r_int 
## 0.658 0.534 0.668 0.458 0.311 0.294 0.460 0.390 0.509 
## 
## Loadings:
##       Factor1 Factor2
## s_rec  0.626         
## s_pc   0.596         
## s_int  0.569         
## r_rec  0.754         
## r_pc   0.753         
## r_int  0.711         
## e_rec          0.736 
## e_pc           0.787 
## e_int          0.863 
## 
##                Factor1 Factor2
## SS loadings      2.726   1.964
## Proportion Var   0.303   0.218
## Cumulative Var   0.303   0.521
## 
## Factor Correlations:
##         Factor1 Factor2
## Factor1   1.000   0.391
## Factor2   0.391   1.000
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 742.57 on 19 degrees of freedom.
## The p-value is 3.38e-145
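
As a robustness check on the factanal() solution, psych::fa() (already loaded) fits the same two-factor promax model and reports additional fit statistics. A minimal sketch; EFA2 is a name introduced here:

EFA2 <- fa(df, nfactors = 2, rotate = "promax", fm = "ml")
print(EFA2$loadings, cutoff = .3, sort = TRUE)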

7 Recalculate Composites

d$sr_rec <- (d$Q3.1_2 + d$Q3.1_3 + d$Q3.1_4 + d$Q3.1_5 + d$Q3.1_6 + d$Q3.1_7 +
               d$Q5.1_2 + d$Q5.1_3 + d$Q5.1_4 + d$Q5.1_5 + d$Q5.1_6 + d$Q5.1_7)/12
d$sr_pc <- (d$Q3.1_11 + d$Q3.1_12 + d$Q3.1_13 + d$Q3.1_14 + d$Q3.1_15 +
              d$Q5.1_16 + d$Q5.1_12 + d$Q5.1_13 + d$Q5.1_14 + d$Q5.1_15)/10
d$sr_int <- (d$Q3.1_8 + d$Q3.1_9 + d$Q3.1_10 +
               d$Q5.1_8 + d$Q5.1_9 + d$Q5.1_10 + d$Q5.1_11)/7
d$e_rec <- (d$Q4.1_2 + d$Q4.1_3 + d$Q4.1_4 + d$Q4.1_6 + d$Q4.1_7)/5
d$e_pc <- (d$Q4.1_11 + d$Q4.1_12 + d$Q4.1_13 + d$Q4.1_14 + d$Q4.1_10)/5
d$e_int <- (d$Q4.1_8 + d$Q4.1_9 + d$Q4.1_5)/3

d$s <- (d$Q3.1_2 + d$Q3.1_3 + d$Q3.1_4 + d$Q3.1_5 + d$Q3.1_6 + d$Q3.1_7 +
        d$Q3.1_11 + d$Q3.1_12 + d$Q3.1_13 + d$Q3.1_14 + d$Q3.1_15 +
        d$Q3.1_8 + d$Q3.1_9 + d$Q3.1_10)/14

d$r <- (d$Q5.1_2 + d$Q5.1_3 + d$Q5.1_4 + d$Q5.1_5 + d$Q5.1_6 + d$Q5.1_7 +
        d$Q5.1_16 + d$Q5.1_12 + d$Q5.1_13 + d$Q5.1_14 + d$Q5.1_15 +
        d$Q5.1_8 + d$Q5.1_9 + d$Q5.1_10 + d$Q5.1_11)/15

d$e <- (d$Q4.1_2 + d$Q4.1_3 + d$Q4.1_4 + d$Q4.1_6 + d$Q4.1_7 +
        d$Q4.1_11 + d$Q4.1_12 + d$Q4.1_13 + d$Q4.1_14 + d$Q4.1_10 +
        d$Q4.1_8 + d$Q4.1_9 + d$Q4.1_5)/13 # 13 items

8 Check for Outliers

d1 <- na.omit(subset(d, select=c(X, sr_rec, sr_pc, sr_int, e_rec, e_pc, e_int)))

m_dist <- mahalanobis(d1[-c(1)], colMeans(d1[-c(1)]), cov(d1[-c(1)]))
d1$MD <- round(m_dist, 1)
plot(d1$MD)
describe(m_dist)
##    vars    n mean   sd median trimmed mad  min   max range skew kurtosis   se
## X1    1 1160 5.99 5.73   3.95    4.84 2.7 0.23 38.64 38.41 2.43     7.05 0.17
cut <- qchisq(.999, df = ncol(d1) - 2) # df = number of composites in the distance (X and MD columns excluded)
abline(a=cut, b=0, col="red")
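
Classical Mahalanobis distances use means and covariances that outliers themselves can distort. A robust variant via MASS::cov.rob() is one possible cross-check; a sketch (MASS is not loaded above, and it masks dplyr::select()):

library(MASS) # for cov.rob(); note: masks dplyr::select()
vars <- c("sr_rec","sr_pc","sr_int","e_rec","e_pc","e_int")
rob <- cov.rob(d1[vars], method = "mcd") # minimum covariance determinant estimate
m_dist_rob <- mahalanobis(d1[vars], rob$center, rob$cov)
table(m_dist_rob > cut) # compare flags against the classical distances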

d1$outlier <- FALSE
d1$outlier[d1$MD > cut] <- TRUE
table(d1$outlier)
## 
## FALSE  TRUE 
##  1133    27
d1 <- subset(d1, outlier == FALSE)

d2 <- subset(d, X %in% d1$X)

9 Create Standardized Variables

d2$sr_rec_std <- as.vector(scale(d2$sr_rec, center=T, scale=T))
d2$sr_pc_std <- as.vector(scale(d2$sr_pc, center=T, scale=T))
d2$sr_int_std <- as.vector(scale(d2$sr_int, center=T, scale=T))
d2$e_rec_std <- as.vector(scale(d2$e_rec, center=T, scale=T))
d2$e_pc_std <- as.vector(scale(d2$e_pc, center=T, scale=T))
d2$e_int_std <- as.vector(scale(d2$e_int, center=T, scale=T))

d2$s_std <- as.vector(scale(d2$s, center=T, scale=T))
d2$r_std <- as.vector(scale(d2$r, center=T, scale=T))
d2$e_std <- as.vector(scale(d2$e, center=T, scale=T))
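
The nine scale() calls can be collapsed with dplyr's across(). An equivalent sketch (d2b is a scratch copy introduced here so the block is non-destructive):

d2b <- d2 %>%
  mutate(across(c(sr_rec, sr_pc, sr_int, e_rec, e_pc, e_int, s, r, e),
                ~ as.vector(scale(.x)), .names = "{.col}_std"))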

desc <- describe(subset(d2, select=c(s, r, e, s_std, r_std, e_std)))
desc
##       vars    n mean   sd median trimmed  mad   min  max range  skew kurtosis
## s        1 1133 4.06 0.60   4.07    4.10 0.64  1.00 5.00  4.00 -0.60     0.49
## r        2 1133 4.26 0.61   4.33    4.32 0.69  1.13 5.00  3.87 -0.73     0.28
## e        3 1133 3.20 0.52   3.24    3.25 0.61  1.00 3.82  2.82 -0.77     0.40
## s_std    4 1133 0.00 1.00   0.01    0.06 1.06 -5.11 1.56  6.67 -0.60     0.49
## r_std    5 1133 0.00 1.00   0.12    0.10 1.13 -5.13 1.21  6.34 -0.73     0.28
## e_std    6 1133 0.00 1.00   0.06    0.10 1.18 -4.27 1.20  5.47 -0.77     0.40
##         se
## s     0.02
## r     0.02
## e     0.02
## s_std 0.03
## r_std 0.03
## e_std 0.03

10 Create Final Dataframe for Second LPA

# restore the fitted model objects from the earlier run (output, lrt, BIC, icl, m1-m9, and the table pieces)
load(file="LPAv3.RData")
d_std <- subset(d2, select=c(X, s_std, r_std, e_std))
d_raw <- subset(d2, select=c(X, s, r, e))

11 Sci, Res, & Eng LPA

11.1 Entropy Plot

#entropy plot - an elbow indicates the number of classes worth considering
d <- subset(d_std, select=-c(1)) # drop the X id column (note: this reuses the name d)
# output <- clustCombi(data = d) # slow; loaded from LPAv3.RData above
output2 <- clustCombiOptim(output, plot=TRUE)

summary(output)
## ----------------------------------------------------
## Combining Gaussian mixture components for clustering 
## ----------------------------------------------------
## 
## Mclust model name: VII 
## Number of components: 8 
## 
## Combining steps:
## 
##   Step | Classes combined at this step | Class labels after this step
## -------|-------------------------------|-----------------------------
##    0   |              ---              | 1 2 3 4 5 6 7 8 
##    1   |             1 & 6             | 1 2 3 4 5 7 8 
##    2   |             1 & 5             | 1 2 3 4 7 8 
##    3   |             1 & 3             | 1 2 4 7 8 
##    4   |             1 & 2             | 1 4 7 8 
##    5   |             1 & 7             | 1 4 8 
##    6   |             1 & 4             | 1 8 
##    7   |             1 & 8             | 1
entPlot(output$MclustOutput$z, output$combiM, abc = c("normalized"))

entPlot(output$MclustOutput$z, output$combiM, abc = c("standard"))

11.2 Calculate LRT Scores

#calculate and plot LRT scores
#BLRT (bootstrap likelihood ratio test) compares the same model with different numbers of classes;
#a significant result indicates that the model with more classes fits better than the one with fewer
# lrt <- mclustBootstrapLRT(d, mclust.options("emModelNames")) # slow; loaded from LPAv3.RData above
print(lrt)
## ------------------------------------------------------------- 
## Bootstrap sequential LRT for the number of mixture components 
## ------------------------------------------------------------- 
## Model        = EII 
## Replications = 999 
##                  LRTS bootstrap p-value
## 1 vs 2   4.504357e+02             0.001
## 2 vs 3   7.954769e+01             0.001
## 3 vs 4   1.784418e+02             0.001
## 4 vs 5   7.649047e+01             0.001
## 5 vs 6   5.237173e+01             0.001
## 6 vs 7   5.490722e+01             0.001
## 7 vs 8   4.971794e+01             0.001
## 8 vs 9   1.254765e-03             0.803
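
The header above shows the bootstrap ran under the EII structure. To target a specific covariance structure explicitly, mclustBootstrapLRT() accepts a single model name; a sketch (lrt_vii is a name introduced here, and nboot is reduced for speed):

# BLRT restricted to the VII structure
lrt_vii <- mclustBootstrapLRT(d, modelName = "VII", nboot = 100)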

11.3 Calculate and Plot BIC Scores

#calculate and plot BIC scores
#in much of the mixture literature (and in Mplus) the lowest BIC marks the best-fitting model;
#mclust flips the sign, so in R higher BIC is better
# BIC <- mclustBIC(d, G = 1:9, mclust.options("emModelNames")) # slow; loaded from LPAv3.RData above
plot(BIC)
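
Alongside the plot, summary() on the mclustBIC object lists the top model/class-count combinations. A minimal sketch:

# show the top models ranked by BIC
summary(BIC, d)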

11.4 Calculate and Plot ICL Scores

#calculate ICL scores
#ICL = BIC penalized by the estimated mean entropy; less suitable when clusters overlap
#as with BIC, higher values are better in mclust
# icl <- mclustICL(d, G = 1:9, mclust.options("emModelNames")) # slow; loaded from LPAv3.RData above
plot(icl)

11.5 Create Fit Indices Table

# # calculate fit indices for each candidate model (block commented out: slow; objects loaded from LPAv3.RData)
# n <- 1
# p <- 1
# q <- 1
# for (i in 1:9) {
#   #run model
#   varname <- paste("m",n,sep="")
#   m <- Mclust(d, G = n, mclust.options("emModelNames"))
#   assign(varname,m)
#   #extract ll
#   varname3 <- paste("logl",n,sep="")
#   ll <- eval(parse(text=paste("m",n,"$loglik",sep = "")))
#   assign(varname3,ll)
#   #calc free parameters
#   #dimension of the data = number of original variables in model
#   #EII 1
#   #VII G
#   #EEI p
#   #free covariance parameters, VEI p + G - 1
#   #EVI pG - G + 1
#   #VVI pG
#   #EEE p(p + 1)/2
#   #free covariance parameters, EEV Gp(p + 1)/2 - (G - 1)p
#   #free covariance parameters, VEV Gp(p + 1)/2 - (G - 1)(p - 1)
#   #VVV Gp(p + 1)/2
#   #EVE p(p + 1)/2 + (G - 1)(p - 1)
#   #VVE p(p + 1)/2 + (G - 1)p
#   #VEE p(p + 1)/2 + (G - 1)
#   #EVV Gp(p + 1)/2 - (G - 1)
#   if (m$modelName == "EII") {
#     freepar <- 1
#   } else if (m$modelName == "VII") {
#     freepar <- m$G
#   } else if (m$modelName == "EEI") {
#     freepar <- m$d
#   } else if (m$modelName == "VEI") {
#     freepar <- m$d + m$G - 1
#   } else if (m$modelName == "EVI") {
#     freepar <- m$d*m$G - m$G + 1
#   } else if (m$modelName == "VVI") {
#     freepar <- m$d*m$G
#   } else if (m$modelName == "EEE") {
#     freepar <- m$d*(m$d + 1)/2
#   } else if (m$modelName == "EEV") {
#     freepar <- m$G*m$d*(m$d + 1)/2 - (m$G - 1)*m$d
#   } else if (m$modelName == "VEV") {
#     freepar <- m$G*m$d*(m$d + 1)/2 - (m$G - 1)*(m$d - 1)
#   } else if (m$modelName == "VVV") {
#     freepar <- m$G*m$d*(m$d + 1)/2
#   } else if (m$modelName == "XXX") {
#     freepar <- "-"
#   } else if (m$modelName == "EVE") {
#     freepar <- m$d*(m$d + 1)/2 + (m$G - 1)*(m$d - 1)
#   } else if (m$modelName == "VVE") {
#     freepar <- m$d*(m$d + 1)/2 + (m$G - 1)*m$d
#   } else if (m$modelName == "VEE") {
#     freepar <- m$d*(m$d + 1)/2 + (m$G - 1)
#   } else if (m$modelName == "EVV") {
#     freepar <- m$G*m$d*(m$d + 1)/2 - (m$G - 1)
#   } else {
#     freepar <- 999
#   }
#   varname4 <- paste("freep",n,sep="")
#   assign(varname4,freepar)
#   #calculate posterior probabilities
#   m <- eval(parse(text=paste("m",n,sep="")))
#   uncer <- m$uncertainty
#   posterior_prob <- 1 - round(uncer, 5)
#   #calculate entropy
#   ent <- round(mean(posterior_prob), 3)
#   varname2 <- paste("ent",n,sep="")
#   assign(varname2, ent)
#   #AIC <- (2 * mcm$df - 2 * mcm$loglik)
#   varname5 <- paste("aic",n,sep="")
#   aic <- (2 * m$df - 2 * m$loglik)
#   assign(varname5,aic)
#   #CAIC <- (((log(mcm$n) + 1) * mcm$df) - 2 * mcm$loglik)
#   varname6 <- paste("caic",n,sep="")
#   caic <- (((log(m$n) + 1) * m$df) - 2 * m$loglik)
#   assign(varname6,caic)
#   #SABIC <- (mcm$df * log((mcm$n + 2)/24) - 2 * mcm$loglik)
#   varname7 <- paste("sabic",n,sep="")
#   sabic <- (m$df * log((m$n + 2)/24) - 2 * m$loglik)
#   assign(varname7, sabic)
#   #iterate
#   n <- n+1
#   p <- p+1
#   #grab group sizes
#   grps <- table(eval(parse(text=paste("m",q,"$classification",sep=""))))
#   tg <- as.list(sort(unique(eval(parse(text=paste("m",q,"$classification",sep=""))))))
#   t <- 1
#   #calculate group percents
#   for (i in tg) {
#     perc <- grps[[t]] / (eval(parse(text=paste("m",q,"$n[1]",sep=""))))
#     varname <- paste("m",q,"perc",t,sep="")
#     assign(varname,perc)
#     t <- t + 1
#   }
#   #combine all group percents into one vector
#   mp <- mapply(get, grep(paste("m",q,"perc",sep=""), ls(), value=T))
#   varname <- paste("allperc",q,sep="")
#   assign(varname,mp)
#   q <- q + 1
# }
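
The commented block works through assign()/eval() gymnastics; for a single fitted model, the same indices follow directly from the formulas in the comments above. A sketch (fit_indices is a helper introduced here):

# LL, AIC, CAIC, SABIC, and entropy (mean posterior probability) for one Mclust fit
fit_indices <- function(m) {
  c(LL      = m$loglik,
    AIC     = 2 * m$df - 2 * m$loglik,
    CAIC    = (log(m$n) + 1) * m$df - 2 * m$loglik,
    SABIC   = m$df * log((m$n + 2)/24) - 2 * m$loglik,
    Entropy = round(mean(1 - round(m$uncertainty, 5)), 3))
}
# fit_indices(m6)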

#throw everything together for the table
proflist <- sprintf("Profiles %d", 1:9)
modlist <- rbind(m1$modelName, m2$modelName, m3$modelName, m4$modelName, m5$modelName,
                 m6$modelName, m7$modelName, m8$modelName, m9$modelName)
#fix BLRT columns (no test exists for the 1-class row)
lrtobs <- c("-", round(lrt$obs[1:8], digits = 2))
lrtp <- c("-", lrt$p.value[1:8])
bicval <- round(BIC[1:9], digits = 2) # note: BIC[1:9] indexes the matrix column-major, i.e., the EII column for G = 1:9
iclval <- round(icl[1:9], digits = 2) # same column-major caveat as BIC above
listent <- round(rbind(ent1,ent2,ent3,ent4,ent5,ent6,ent7,ent8,ent9), digits = 2)
listlog <- round(rbind(logl1,logl2,logl3,logl4,logl5,logl6,logl7,logl8,logl9), digits = 2)
listfreep <- rbind(freep1,freep2,freep3,freep4,freep5,freep6,freep7,freep8,freep9)
listaic <- round(rbind(aic1,aic2,aic3,aic4,aic5,aic6,aic7,aic8,aic9), digits = 2)
listcaic <- round(rbind(caic1,caic2,caic3,caic4,caic5,caic6,caic7,caic8,caic9), digits = 2)
listsabic <- round(rbind(sabic1,sabic2,sabic3,sabic4,sabic5,sabic6,sabic7,sabic8,sabic9), digits = 2)
listperc <- rbind(
  sum(allperc1 < .05),
  sum(allperc2 < .05),
  sum(allperc3 < .05),
  sum(allperc4 < .05),
  sum(allperc5 < .05),
  sum(allperc6 < .05),
  sum(allperc7 < .05),
  sum(allperc8 < .05),
  sum(allperc9 < .05))

indicestab <- cbind(modlist,listlog,listfreep,listaic,listcaic,bicval,listsabic,iclval,lrtobs,lrtp,listent,listperc)
colnames(indicestab) <- c("Model","LL","#FP","AIC","CAIC","BIC","SABIC","ICL","BLRTS","BLRTS p","Entropy","<5%")
rownames(indicestab) <- proflist

indicestab %>%
    kbl() %>%
    kable_styling(font_size = 11)
|            | Model | LL       | #FP | AIC     | CAIC    | BIC      | SABIC   | ICL      | BLRTS  | BLRTS p | Entropy | <5% |
|------------|-------|----------|-----|---------|---------|----------|---------|----------|--------|---------|---------|-----|
| Profiles 1 | XXX   | -4519.04 | -   | 9056.07 | 9110.37 | -9671.07 | 9072.78 | -9671.07 | -      | -       | 1       | 0   |
| Profiles 2 | VEV   | -4285.58 | 10  | 8605.16 | 8707.71 | -9248.77 | 8636.72 | -9541    | 450.44 | 0.001   | 0.98    | 0   |
| Profiles 3 | VEV   | -4218.65 | 14  | 8487.3  | 8638.12 | -9197.35 | 8533.71 | -9535.1  | 79.55  | 0.001   | 0.84    | 0   |
| Profiles 4 | VEV   | -4128.84 | 18  | 8323.68 | 8522.75 | -9047.04 | 8384.94 | -9529.21 | 178.44 | 0.001   | 0.77    | 0   |
| Profiles 5 | VII   | -4123.64 | 5   | 8295.29 | 8440.07 | -8998.68 | 8339.84 | -9456.41 | 76.49  | 0.001   | 0.85    | 1   |
| Profiles 6 | VII   | -4099.49 | 6   | 8256.98 | 8431.93 | -8974.44 | 8310.82 | -9407.92 | 52.37  | 0.001   | 0.84    | 2   |
| Profiles 7 | VII   | -4080.04 | 7   | 8228.07 | 8433.18 | -8947.66 | 8291.19 | -9448.65 | 54.91  | 0.001   | 0.79    | 2   |
| Profiles 8 | VII   | -4045.21 | 8   | 8168.42 | 8403.69 | -8926.07 | 8240.82 | -9508.31 | 49.72  | 0.001   | 0.8     | 3   |
| Profiles 9 | EVV   | -4009.91 | 46  | 8181.82 | 8670.46 | -8954.2  | 8332.18 | -9648.95 | 0      | 0.803   | 0.76    | 2   |

12 Save Current Environment

# save.image(file="LPAv3.RData")

13 Visualize Models (v3)

viz2 <- function(orig){
  mcm <- orig
  print(summary.Mclust(mcm))
  d_std$class <- mcm$classification
  
  df_long <- gather(d_std, key = "variable", value = "value", s_std, r_std, e_std)
  summ <- summarySE(df_long, measurevar="value", groupvars = c("variable","class"), na.rm = T) #the summarySE() command creates a table with the information we need for our plot
  
  # Create a barplot of means for each column by class
  means <- ggplot(summ, aes(x = class, y = value, fill = variable)) +
    geom_bar(stat = "summary", fun = "mean", position = "dodge") +
    geom_errorbar(aes(ymin=value-se, ymax=value+se),
                  width=.2,
                  position=position_dodge(.9))
  print(means)
  
  df_long$class <- as.factor(df_long$class)
  dist <- df_long %>%
    ggplot(aes(x=value, fill=class)) +
    geom_density(alpha=0.4, bw=.20) +
    geom_vline(xintercept=0, size=.5, color="black") +
    facet_wrap(~variable) +
    xlim(-4.5,2)
  print(dist)
  
  vars <- unique(df_long$variable)
  n <- 1
  for (i in 1:length(vars)) {
    print(vars[n])
    print(subset(df_long, variable == vars[n]) %>%
    ggplot(aes(x=value, fill=class)) +
    geom_density(alpha=0.4, bw=.20) +
    geom_vline(xintercept=0, size=.5, color="black") +
    facet_wrap(~class) +
    xlim(-4.5,2))
    n <- n + 1
  }
  
  n <- 1
  for (i in 1:length(unique(df_long$class))) {
    df_long$class_spec <- 0
    df_long$class_spec[df_long$class == n] <- 1 # flag observations assigned to class n
    df_long$class_spec <- as.factor(df_long$class_spec)
    
    scatter <- ggplot(df_long, aes(x = variable, y = value, color = class_spec)) +
      geom_jitter()
    print(paste("Class",n))
    print(scatter)
    n <- n + 1
  }
  
  class <- mcm$classification
  uncer <- mcm$uncertainty
  posterior_prob <- 1 - round(uncer, 5)
  # plot() draws the density as a side effect; no need to capture and print its NULL return
  plot(density(posterior_prob))
}

13.1 Two-Class Model

viz2(m2)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEV (ellipsoidal, equal shape) model with 2 components: 
## 
##  log-likelihood    n df       BIC       ICL
##        -4285.58 1133 17 -8690.714 -8738.077
## 
## Clustering table:
##    1    2 
## 1001  132

## [1] "s_std"

## [1] "r_std"

## [1] "e_std"

## [1] "Class 1"

## [1] "Class 2"


13.2 Three-Class Model

viz2(m3)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEV (ellipsoidal, equal shape) model with 3 components: 
## 
##  log-likelihood    n df       BIC       ICL
##        -4218.65 1133 25 -8613.116 -9068.174
## 
## Clustering table:
##   1   2   3 
## 634 374 125

## [1] "s_std"

## [1] "r_std"

## [1] "e_std"

## [1] "Class 1"

## [1] "Class 2"

## [1] "Class 3"


13.3 Four-Class Model

viz2(m4)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEV (ellipsoidal, equal shape) model with 4 components: 
## 
##  log-likelihood    n df       BIC       ICL
##       -4128.839 1133 33 -8489.754 -9145.533
## 
## Clustering table:
##   1   2   3   4 
## 247 251 516 119

## [1] "s_std"

## [1] "r_std"

## [1] "e_std"

## [1] "Class 1"

## [1] "Class 2"

## [1] "Class 3"

## [1] "Class 4"


13.4 Five-Class Model

viz2(m5)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VII (spherical, varying volume) model with 5 components: 
## 
##  log-likelihood    n df       BIC       ICL
##       -4123.645 1133 24 -8416.073 -8812.255
## 
## Clustering table:
##   1   2   3   4   5 
## 226 179 630  65  33

## [1] "s_std"

## [1] "r_std"

## [1] "e_std"

## [1] "Class 1"

## [1] "Class 2"

## [1] "Class 3"

## [1] "Class 4"

## [1] "Class 5"


13.5 Six-Class Model

viz2(m6)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VII (spherical, varying volume) model with 6 components: 
## 
##  log-likelihood    n df      BIC       ICL
##       -4099.492 1133 29 -8402.93 -8845.879
## 
## Clustering table:
##   1   2   3   4   5   6 
## 231 198 554  78  39  33

## [1] "s_std"

## [1] "r_std"

## [1] "e_std"

## [1] "Class 1"

## [1] "Class 2"

## [1] "Class 3"

## [1] "Class 4"

## [1] "Class 5"

## [1] "Class 6"


13.6 Seven-Class Model

viz2(m7)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VII (spherical, varying volume) model with 7 components: 
## 
##  log-likelihood    n df       BIC       ICL
##       -4080.037 1133 34 -8399.183 -9003.025
## 
## Clustering table:
##   1   2   3   4   5   6   7 
## 245 184 175 180 266  50  33

## [1] "s_std"

## [1] "r_std"

## [1] "e_std"

## [1] "Class 1"

## [1] "Class 2"

## [1] "Class 3"

## [1] "Class 4"

## [1] "Class 5"

## [1] "Class 6"

## [1] "Class 7"


13.7 Eight-Class Model

viz2(m8)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VII (spherical, varying volume) model with 8 components: 
## 
##  log-likelihood    n df       BIC       ICL
##        -4045.21 1133 39 -8364.693 -8957.662
## 
## Clustering table:
##   1   2   3   4   5   6   7   8 
## 251 212 131  24 168 264  50  33

## [1] "s_std"

## [1] "r_std"

## [1] "e_std"

## [1] "Class 1"

## [1] "Class 2"

## [1] "Class 3"

## [1] "Class 4"

## [1] "Class 5"

## [1] "Class 6"

## [1] "Class 7"

## [1] "Class 8"


14 Final Model

The six-class model seems strongest so far:

- Balances the number of classes with entropy
- Does have two groups with small numbers of participants, but they seem distinct
- Other fit scores are good

d_std$class <- as.factor(m6$classification)
d_std$pp <- 1 - round(m6$uncertainty, 3)

d_fin <- d_std
d_fin$class_fin <- d_fin$class
# alternative: keep only confident assignments and mark the rest as unassigned
# d_fin$class_fin <- "un"
# d_fin$class_fin[d_fin$pp > .69] <- d_fin$class[d_fin$pp > .69]
table(d_fin$class_fin, useNA = "always")
## 
##    1    2    3    4    5    6 <NA> 
##  231  198  554   78   39   33    0
table(d_fin$class_fin, d_fin$class, useNA = "always")
##       
##          1   2   3   4   5   6 <NA>
##   1    231   0   0   0   0   0    0
##   2      0 198   0   0   0   0    0
##   3      0   0 554   0   0   0    0
##   4      0   0   0  78   0   0    0
##   5      0   0   0   0  39   0    0
##   6      0   0   0   0   0  33    0
##   <NA>   0   0   0   0   0   0    0
table(d_fin$class_fin, round(d_fin$pp, 1), useNA = "always")
##       
##        0.3 0.4 0.5 0.6 0.7 0.8 0.9   1 <NA>
##   1      0   2   9  26  21  26  36 111    0
##   2      0   0  11  15   1  17  44 110    0
##   3      1   3  32  53  76 113 234  42    0
##   4      0   2   4   6  11  10  14  31    0
##   5      0   1   4   4   6  13  10   1    0
##   6      0   0   0   0   0   0   7  26    0
##   <NA>   0   0   0   0   0   0   0   0    0
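
A standard companion diagnostic is the average posterior probability within each assigned class, with values near 1 indicating clean assignment. A minimal sketch:

# mean posterior probability by final class
aggregate(pp ~ class_fin, data = d_fin, FUN = mean)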
df_long <- gather(d_fin, key = "variable", value = "value", s_std, r_std, e_std)
summ <- summarySE(df_long, measurevar="value", groupvars = c("variable","class_fin"), na.rm = T) #the summarySE() command creates a table with the information we need for our plot

# Create a barplot of means for each column by class
ggplot(summ, aes(x = class_fin, y = value, fill = variable)) +
  geom_bar(stat = "summary", fun = "mean", position = "dodge") +
  geom_errorbar(aes(ymin=value-se, ymax=value+se),
                width=.2,
                position=position_dodge(.9))
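
fmsb was loaded for a spider plot but is not used above. One way to profile the classes on the three standardized composites; a sketch with axis limits assumed at ±1.5 SD:

# radar chart of class mean profiles; fmsb expects the first two rows to be the axis max and min
class_means <- aggregate(cbind(s_std, r_std, e_std) ~ class, data = d_std, FUN = mean)
radar_dat <- rbind(rep(1.5, 3), rep(-1.5, 3), class_means[, -1])
radarchart(radar_dat)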

df_long$class_fin <- as.factor(df_long$class_fin)
df_long %>%
  ggplot(aes(x=value, fill=class_fin)) +
  geom_density(alpha=0.4, bw=.20) +
  geom_vline(xintercept=0, size=.5, color="black") +
  facet_wrap(~variable) +
  xlim(-4.5,2)

vars <- unique(df_long$variable)
n <- 1
for (i in 1:length(vars)) {
  print(vars[n])
  print(subset(df_long, variable == vars[n]) %>%
  ggplot(aes(x=value, fill=class_fin)) +
  geom_density(alpha=0.4, bw=.20) +
  geom_vline(xintercept=0, size=.5, color="black") +
  facet_wrap(~class_fin) +
  xlim(-4.5,2))
  n <- n + 1
}
## [1] "s_std"

## [1] "r_std"

## [1] "e_std"

n <- 1
for (i in 1:length(unique(df_long$class_fin))) {
  df_long$class_spec <- 0
  df_long$class_spec[df_long$class_fin == n] <- 1
  df_long$class_spec <- as.factor(df_long$class_spec)

  scatter <- ggplot(df_long, aes(x = variable, y = value, color = class_spec)) +
    geom_jitter()
  print(paste("Class",n))
  print(scatter)
  n <- n + 1
}
## [1] "Class 1"

## [1] "Class 2"

## [1] "Class 3"

## [1] "Class 4"

## [1] "Class 5"

## [1] "Class 6"

n <- 1
for (i in 1:length(unique(na.omit(df_long$class_fin)))) {
  pp <- subset(d_fin, select=c(pp), class_fin == n)
  print(paste("Class",n))
  # plot() draws the density as a side effect; no need to capture and print its NULL return
  plot(density(pp$pp))
  n <- n + 1
}

## [1] "Class 1"
## NULL

## [1] "Class 2"
## NULL

## [1] "Class 3"
## NULL

## [1] "Class 4"
## NULL

## [1] "Class 5"
## NULL

## [1] "Class 6"
## NULL
# classes <- cbind.data.frame(
#   subset(d_fin, select=c(X, s_std,r_std,e_std)),
#   subset(d_raw, select=c(s,r,e)),
#   subset(d_fin, select=c(class,class_fin,pp)))
# 
# write.csv(classes, file="classes.csv", row.names = F)