library(psych) #for describe()
library(corrplot) #for corrplot()
library(nFactors) #for parallel analysis and scree test (number of factors)
library(naniar) #for gg_miss_upset()
library(mclust) #for LPA
library(kableExtra) #for tables
library(tidyr) #for gather() (long-format reshaping)
library(ggplot2) #for graphs
library(Rmisc) #for summarySE()
library(reshape2) #for data transformation
library(fmsb) #for spider plot
library(dplyr) #for data transformation
import <- read.csv(file="finaldata.csv")
# limited to finished and doctoral students
import2 <- subset(import, Q71 == 1 & Finished == 1)
d <- subset(import2, select=c(X, grep("Q3.1", colnames(import2)),
                              grep("Q4.1", colnames(import2)),
                              grep("Q5.1", colnames(import2))))
rm(import, import2)
# use the gg_miss_upset() command for a visualization of your missing data
gg_miss_upset(d[-1], nsets = 45)
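# Little's MCAR test via naniar::mcar_test()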
mcar_test(d[-1])
## # A tibble: 1 × 4
## statistic df p.value missing.patterns
## <dbl> <dbl> <dbl> <int>
## 1 544. 514 0.178 14
# the test is not significant, so the data can be treated as missing completely at random (MCAR)
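# composite scores: item means for the nine subscales (three item blocks, Q3.1/Q4.1/Q5.1, by three subscales: rec, pc, int)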
d$s_rec <- (d$Q3.1_2 + d$Q3.1_3 + d$Q3.1_4 + d$Q3.1_5 + d$Q3.1_6 + d$Q3.1_7)/6
d$s_pc <- (d$Q3.1_11 + d$Q3.1_12 + d$Q3.1_13 + d$Q3.1_14 + d$Q3.1_15)/5
d$s_int <- (d$Q3.1_8 + d$Q3.1_9 + d$Q3.1_10)/3
d$e_rec <- (d$Q4.1_2 + d$Q4.1_3 + d$Q4.1_4 + d$Q4.1_6 + d$Q4.1_7)/5
d$e_pc <- (d$Q4.1_11 + d$Q4.1_12 + d$Q4.1_13 + d$Q4.1_14 + d$Q4.1_10)/5
d$e_int <- (d$Q4.1_8 + d$Q4.1_9 + d$Q4.1_5)/3
d$r_rec <- (d$Q5.1_2 + d$Q5.1_3 + d$Q5.1_4 + d$Q5.1_5 + d$Q5.1_6 + d$Q5.1_7)/6
d$r_pc <- (d$Q5.1_16 + d$Q5.1_12 + d$Q5.1_13 + d$Q5.1_14 + d$Q5.1_15)/5
d$r_int <- (d$Q5.1_8 + d$Q5.1_9 + d$Q5.1_10 + d$Q5.1_11)/4
corrout <- corr.test(subset(d, select=c(s_rec, s_pc, s_int, e_rec, e_pc, e_int, r_rec, r_pc, r_int)))
corrplot.mixed(corrout$r)
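# parallel analysis and scree test (nFactors) to decide how many factors to extract; the EFA with promax rotation follows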
df <- na.omit(subset(d, select=c(s_rec, s_pc, s_int, e_rec, e_pc, e_int, r_rec, r_pc, r_int)))
ev <- eigen(cor(df)) # get eigenvalues
ap <- parallel(subject=nrow(df),var=ncol(df),rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)
EFA <- factanal(df, factors = 2, rotation = "promax", cutoff = 0.3)
print(EFA, digits=3, cutoff=.3, sort=TRUE)
##
## Call:
## factanal(x = df, factors = 2, rotation = "promax", cutoff = 0.3)
##
## Uniquenesses:
## s_rec s_pc s_int e_rec e_pc e_int r_rec r_pc r_int
## 0.658 0.534 0.668 0.458 0.311 0.294 0.460 0.390 0.509
##
## Loadings:
## Factor1 Factor2
## s_rec 0.626
## s_pc 0.596
## s_int 0.569
## r_rec 0.754
## r_pc 0.753
## r_int 0.711
## e_rec 0.736
## e_pc 0.787
## e_int 0.863
##
## Factor1 Factor2
## SS loadings 2.726 1.964
## Proportion Var 0.303 0.218
## Cumulative Var 0.303 0.521
##
## Factor Correlations:
## Factor1 Factor2
## Factor1 1.000 0.391
## Factor2 0.391 1.000
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 742.57 on 19 degrees of freedom.
## The p-value is 3.38e-145
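# in the two-factor solution the s_* and r_* composites load on Factor 1 and the e_* composites on Factor 2,
# so the Q3.1 and Q5.1 items are combined into joint sr_* composites below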
d$sr_rec <- (d$Q3.1_2 + d$Q3.1_3 + d$Q3.1_4 + d$Q3.1_5 + d$Q3.1_6 + d$Q3.1_7 +
d$Q5.1_2 + d$Q5.1_3 + d$Q5.1_4 + d$Q5.1_5 + d$Q5.1_6 + d$Q5.1_7)/12
d$sr_pc <- (d$Q3.1_11 + d$Q3.1_12 + d$Q3.1_13 + d$Q3.1_14 + d$Q3.1_15 +
d$Q5.1_16 + d$Q5.1_12 + d$Q5.1_13 + d$Q5.1_14 + d$Q5.1_15)/10
d$sr_int <- (d$Q3.1_8 + d$Q3.1_9 + d$Q3.1_10 +
d$Q5.1_8 + d$Q5.1_9 + d$Q5.1_10 + d$Q5.1_11)/7
d$e_rec <- (d$Q4.1_2 + d$Q4.1_3 + d$Q4.1_4 + d$Q4.1_6 + d$Q4.1_7)/5
d$e_pc <- (d$Q4.1_11 + d$Q4.1_12 + d$Q4.1_13 + d$Q4.1_14 + d$Q4.1_10)/5
d$e_int <- (d$Q4.1_8 + d$Q4.1_9 + d$Q4.1_5)/3
d$s <- (d$Q3.1_2 + d$Q3.1_3 + d$Q3.1_4 + d$Q3.1_5 + d$Q3.1_6 + d$Q3.1_7 +
d$Q3.1_11 + d$Q3.1_12 + d$Q3.1_13 + d$Q3.1_14 + d$Q3.1_15 +
d$Q3.1_8 + d$Q3.1_9 + d$Q3.1_10)/14
d$r <- (d$Q5.1_2 + d$Q5.1_3 + d$Q5.1_4 + d$Q5.1_5 + d$Q5.1_6 + d$Q5.1_7 +
d$Q5.1_16 + d$Q5.1_12 + d$Q5.1_13 + d$Q5.1_14 + d$Q5.1_15 +
d$Q5.1_8 + d$Q5.1_9 + d$Q5.1_10 + d$Q5.1_11)/15
d$e <- (d$Q4.1_2 + d$Q4.1_3 + d$Q4.1_4 + d$Q4.1_6 + d$Q4.1_7 +
d$Q4.1_11 + d$Q4.1_12 + d$Q4.1_13 + d$Q4.1_14 + d$Q4.1_10 +
        d$Q4.1_8 + d$Q4.1_9 + d$Q4.1_5)/13
d1 <- na.omit(subset(d, select=c(X, sr_rec, sr_pc, sr_int, e_rec, e_pc, e_int)))
m_dist <- mahalanobis(d1[-c(1)], colMeans(d1[-c(1)]), cov(d1[-c(1)]))
d1$MD <- round(m_dist, 1)
plot(d1$MD)
describe(m_dist)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1160 5.99 5.73 3.95 4.84 2.7 0.23 38.64 38.41 2.43 7.05 0.17
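# flag multivariate outliers: cases whose squared Mahalanobis distance exceeds the .999 chi-square quantile are dropped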
cut <- qchisq(.999, df=(ncol(d1)-1))
abline(a=cut, b=0, col="red")
d1$outlier <- F
d1$outlier[d1$MD > cut] <- T
table(d1$outlier)
##
## FALSE TRUE
## 1133 27
d1 <- subset(d1, outlier == F)
d2 <- subset(d, X %in% d1$X)
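# z-standardize the composites; the standardized s, r, and e scores serve as the LPA indicators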
d2$sr_rec_std <- as.vector(scale(d2$sr_rec, center=T, scale=T))
d2$sr_pc_std <- as.vector(scale(d2$sr_pc, center=T, scale=T))
d2$sr_int_std <- as.vector(scale(d2$sr_int, center=T, scale=T))
d2$e_rec_std <- as.vector(scale(d2$e_rec, center=T, scale=T))
d2$e_pc_std <- as.vector(scale(d2$e_pc, center=T, scale=T))
d2$e_int_std <- as.vector(scale(d2$e_int, center=T, scale=T))
d2$s_std <- as.vector(scale(d2$s, center=T, scale=T))
d2$r_std <- as.vector(scale(d2$r, center=T, scale=T))
d2$e_std <- as.vector(scale(d2$e, center=T, scale=T))
desc <- describe(subset(d2, select=c(s, r, e, s_std, r_std, e_std)))
desc
## vars n mean sd median trimmed mad min max range skew kurtosis
## s 1 1133 4.06 0.60 4.07 4.10 0.64 1.00 5.00 4.00 -0.60 0.49
## r 2 1133 4.26 0.61 4.33 4.32 0.69 1.13 5.00 3.87 -0.73 0.28
## e 3 1133 3.20 0.52 3.24 3.25 0.61 1.00 3.82 2.82 -0.77 0.40
## s_std 4 1133 0.00 1.00 0.01 0.06 1.06 -5.11 1.56 6.67 -0.60 0.49
## r_std 5 1133 0.00 1.00 0.12 0.10 1.13 -5.13 1.21 6.34 -0.73 0.28
## e_std 6 1133 0.00 1.00 0.06 0.10 1.18 -4.27 1.20 5.47 -0.77 0.40
## se
## s 0.02
## r 0.02
## e 0.02
## s_std 0.03
## r_std 0.03
## e_std 0.03
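# load previously estimated LPA objects (clustCombi output, BLRT, BIC, ICL, and models m1-m9);
# the time-consuming estimation calls are kept below as comments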
load(file="LPAv3.RData")
d_std <- subset(d2, select=c(X, s_std, r_std, e_std))
d_raw <- subset(d2, select=c(X, s, r, e))
#entropy plot - elbow indicates number of classes that should be considered
d <- subset(d_std, select=-c(1))
# output <- clustCombi(data = d)
output2 <- clustCombiOptim(output, plot=TRUE)
summary(output)
## ----------------------------------------------------
## Combining Gaussian mixture components for clustering
## ----------------------------------------------------
##
## Mclust model name: VII
## Number of components: 8
##
## Combining steps:
##
## Step | Classes combined at this step | Class labels after this step
## -------|-------------------------------|-----------------------------
## 0 | --- | 1 2 3 4 5 6 7 8
## 1 | 1 & 6 | 1 2 3 4 5 7 8
## 2 | 1 & 5 | 1 2 3 4 7 8
## 3 | 1 & 3 | 1 2 4 7 8
## 4 | 1 & 2 | 1 4 7 8
## 5 | 1 & 7 | 1 4 8
## 6 | 1 & 4 | 1 8
## 7 | 1 & 8 | 1
entPlot(output$MclustOutput$z, output$combiM, abc = c("normalized"))
entPlot(output$MclustOutput$z, output$combiM, abc = c("standard"))
#calculate and plot LRT scores
#BLRT (bootstrap likelihood ratio test): compares the same model with different numbers of classes; a significant result indicates that the model with more classes fits better than the one with fewer, so it should be significant
# lrt <- mclustBootstrapLRT(d, mclust.options("emModelNames"))
print(lrt)
## -------------------------------------------------------------
## Bootstrap sequential LRT for the number of mixture components
## -------------------------------------------------------------
## Model = EII
## Replications = 999
## LRTS bootstrap p-value
## 1 vs 2 4.504357e+02 0.001
## 2 vs 3 7.954769e+01 0.001
## 3 vs 4 1.784418e+02 0.001
## 4 vs 5 7.649047e+01 0.001
## 5 vs 6 5.237173e+01 0.001
## 6 vs 7 5.490722e+01 0.001
## 7 vs 8 4.971794e+01 0.001
## 8 vs 9 1.254765e-03 0.803
#calculate and plot BIC scores
#BIC (and the sample-adjusted BIC computed later): in the LPA literature (and in Mplus), the lowest value indicates the best-fitting model; mclust defines BIC so that higher values indicate better fit
# BIC <- mclustBIC(d, G = 1:9, mclust.options("emModelNames"))
plot.mclustBIC(BIC)
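# A quick check of the BIC sign convention described above (not part of the original
# pipeline): mclust defines BIC = 2*logLik - df*log(n), i.e. -1 times the textbook
# BIC = -2*logLik + df*log(n). The illustrative names ll, df_m, and n_obs hold values
# copied from the 2-profile summary printed further below.
ll <- -4285.58; df_m <- 17; n_obs <- 1133
-2 * ll + df_m * log(n_obs)  # ~8690.7, i.e. -1 * the BIC mclust reports (-8690.714)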
#calculate ICL scores
#ICL is BIC penalized by the estimated mean entropy; less suited to data with overlapping clusters; in mclust, higher ICL indicates better fit
# icl <- mclustICL(d, G = 1:9, mclust.options("emModelNames"))
plot.mclustICL(icl)
# # calculate fit indices for the model-comparison table
# n <- 1
# p <- 1
# q <- 1
# for (i in 1:9) {
# #run model
# varname <- paste("m",n,sep="")
# m <- Mclust(d, G = n, mclust.options("emModelNames"))
# assign(varname,m)
# #extract ll
# varname3 <- paste("logl",n,sep="")
# ll <- eval(parse(text=paste("m",n,"$loglik",sep = "")))
# assign(varname3,ll)
# #calc free parameters
# #dimension of the data = number of original variables in model
# #EII 1
# #VII G
# #EEI p
# #free covariance parameters, VEI p + G - 1
# #EVI pG - G + 1
# #VVI pG
# #EEE p(p + 1)/2
# #free covariance parameters, EEV Gp(p + 1)/2 - (G - 1)p
# #free covariance parameters, VEV Gp(p + 1)/2 - (G - 1)(p - 1)
# #VVV Gp(p + 1)/2
# #EVE p(p + 1)/2 + (G - 1)(p - 1)
# #VVE p(p + 1)/2 + (G - 1)p
# #VEE p(p + 1)/2 + (G - 1)
# #EVV Gp(p + 1)/2 - (G - 1)
# if (m$modelName == "EII") {
# freepar <- 1
# } else if (m$modelName == "VII") {
# freepar <- m$G
# } else if (m$modelName == "EEI") {
# freepar <- m$d
# } else if (m$modelName == "VEI") {
# freepar <- m$d + m$G - 1
# } else if (m$modelName == "EVI") {
# freepar <- m$d*m$G - m$G + 1
# } else if (m$modelName == "VVI") {
# freepar <- m$d*m$G
# } else if (m$modelName == "EEE") {
# freepar <- m$d*(m$d + 1)/2
# } else if (m$modelName == "EEV") {
# freepar <- m$G*m$d*(m$d + 1)/2 - (m$G - 1)*m$d
# } else if (m$modelName == "VEV") {
# freepar <- m$G*m$d*(m$d + 1)/2 - (m$G - 1)*(m$d - 1)
# } else if (m$modelName == "VVV") {
# freepar <- m$G*m$d*(m$d + 1)/2
# } else if (m$modelName == "XXX") {
# freepar <- "-"
# } else if (m$modelName == "EVE") {
# freepar <- m$d*(m$d + 1)/2 + (m$G - 1)*(m$d - 1)
# } else if (m$modelName == "VVE") {
# freepar <- m$d*(m$d + 1)/2 + (m$G - 1)*m$d
# } else if (m$modelName == "VEE") {
# freepar <- m$d*(m$d + 1)/2 + (m$G - 1)
# } else if (m$modelName == "EVV") {
# freepar <- m$G*m$d*(m$d + 1)/2 - (m$G - 1)
# } else {
# freepar <- 999
# }
# varname4 <- paste("freep",n,sep="")
# assign(varname4,freepar)
# #calculate posterior classification probabilities
# m <- eval(parse(text=paste("m",n,sep="")))
# uncer <- m$uncertainty
# posterior_prob <- 1 - round(uncer, 5)
# #calculate entropy (operationalized here as the mean posterior classification probability)
# ent <- round(mean(posterior_prob), 3)
# varname2 <- paste("ent",n,sep="")
# assign(varname2, ent)
# #AIC <- (2 * mcm$df - 2 * mcm$loglik)
# varname5 <- paste("aic",n,sep="")
# aic <- (2 * m$df - 2 * m$loglik)
# assign(varname5,aic)
# #CAIC <- (((log(mcm$n) + 1) * mcm$df) - 2 * mcm$loglik)
# varname6 <- paste("caic",n,sep="")
# caic <- (((log(m$n) + 1) * m$df) - 2 * m$loglik)
# assign(varname6,caic)
# #SABIC <- (mcm$df * log((mcm$n + 2)/24) - 2 * mcm$loglik)
# varname7 <- paste("sabic",n,sep="")
# sabic <- (m$df * log((m$n + 2)/24) - 2 * m$loglik)
# assign(varname7, sabic)
# #iterate
# n <- n+1
# p <- p+1
# #grab group sizes
# grps <- table(eval(parse(text=paste("m",q,"$classification",sep=""))))
# tg <- as.list(sort(unique(eval(parse(text=paste("m",q,"$classification",sep=""))))))
# t <- 1
# #calculate group percents
# for (j in tg) {
# perc <- grps[[t]] / (eval(parse(text=paste("m",q,"$n[1]",sep=""))))
# varname <- paste("m",q,"perc",t,sep="")
# assign(varname,perc)
# t <- t + 1
# }
# #combine all group percents into one vector
# mp <- mapply(get, grep(paste("m",q,"perc",sep=""), ls(), value=T))
# varname <- paste("allperc",q,sep="")
# assign(varname,mp)
# q <- q + 1
# }
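# A small worked check of the free-parameter formulas listed in the commented loop
# above (illustrative names p_ind and G_prof are mine): for a VEV model with
# p = 3 indicators and G = 2 profiles,
# Gp(p + 1)/2 - (G - 1)(p - 1) = 2*3*4/2 - 1*2 = 10,
# which matches the 10 free parameters reported for the 2-profile model in the table below.
p_ind <- 3; G_prof <- 2
G_prof * p_ind * (p_ind + 1) / 2 - (G_prof - 1) * (p_ind - 1)  # 10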
#assemble the fit indices into the comparison table
proflist <- sprintf("Profiles %d", 1:9)
modlist <- rbind(m1$modelName, m2$modelName, m3$modelName, m4$modelName, m5$modelName, m6$modelName, m7$modelName, m8$modelName, m9$modelName)
#fix BLRT columns
blank <- "-"
lrtobs <- "-"
lrtobs <- append(lrtobs, round(lrt$obs[1:7], digits = 2))
lrtobs <- append(lrtobs, blank)
lrtp <- "-"
lrtp <- append(lrtp, lrt$p.value[1:7])
lrtp <- append(lrtp, blank)
bicval <- round(BIC[1:9], digits = 2)
iclval <- round(icl[1:9], digits = 2)
listent <- round(rbind(ent1,ent2,ent3,ent4,ent5,ent6,ent7,ent8,ent9), digits = 2)
listlog <- round(rbind(logl1,logl2,logl3,logl4,logl5,logl6,logl7,logl8,logl9), digits = 2)
listfreep <- rbind(freep1,freep2,freep3,freep4,freep5,freep6,freep7,freep8,freep9)
listaic <- round(rbind(aic1,aic2,aic3,aic4,aic5,aic6,aic7,aic8,aic9), digits = 2)
listcaic <- round(rbind(caic1,caic2,caic3,caic4,caic5,caic6,caic7,caic8,caic9), digits = 2)
listsabic <- round(rbind(sabic1,sabic2,sabic3,sabic4,sabic5,sabic6,sabic7,sabic8,sabic9), digits = 2)
listperc <- rbind(
sum(allperc1 < .05),
sum(allperc2 < .05),
sum(allperc3 < .05),
sum(allperc4 < .05),
sum(allperc5 < .05),
sum(allperc6 < .05),
sum(allperc7 < .05),
sum(allperc8 < .05),
sum(allperc9 < .05))
indicestab <- cbind(modlist,listlog,listfreep,listaic,listcaic,bicval,listsabic,iclval,lrtobs,lrtp,listent,listperc)
colnames(indicestab) <- c("Model","LL","#FP","AIC","CAIC","BIC","SABIC","ICL","BLRTS","BLRTS p","Entropy","<5%")
rownames(indicestab) <- proflist
indicestab %>%
kbl() %>%
kable_styling(font_size = 11)
|  | Model | LL | #FP | AIC | CAIC | BIC | SABIC | ICL | BLRTS | BLRTS p | Entropy | <5% |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Profiles 1 | XXX | -4519.04 | - | 9056.07 | 9110.37 | -9671.07 | 9072.78 | -9671.07 | - | - | 1 | 0 |
| Profiles 2 | VEV | -4285.58 | 10 | 8605.16 | 8707.71 | -9248.77 | 8636.72 | -9541 | 450.44 | 0.001 | 0.98 | 0 |
| Profiles 3 | VEV | -4218.65 | 14 | 8487.3 | 8638.12 | -9197.35 | 8533.71 | -9535.1 | 79.55 | 0.001 | 0.84 | 0 |
| Profiles 4 | VEV | -4128.84 | 18 | 8323.68 | 8522.75 | -9047.04 | 8384.94 | -9529.21 | 178.44 | 0.001 | 0.77 | 0 |
| Profiles 5 | VEV | -4123.64 | 5 | 8295.29 | 8440.07 | -8998.68 | 8339.84 | -9456.41 | 76.49 | 0.001 | 0.85 | 1 |
| Profiles 6 | VEV | -4099.49 | 6 | 8256.98 | 8431.93 | -8974.44 | 8310.82 | -9407.92 | 52.37 | 0.001 | 0.84 | 2 |
| Profiles 7 | VEV | -4080.04 | 7 | 8228.07 | 8433.18 | -8947.66 | 8291.19 | -9448.65 | 54.91 | 0.001 | 0.79 | 2 |
| Profiles 8 | VEV | -4045.21 | 8 | 8168.42 | 8403.69 | -8926.07 | 8240.82 | -9508.31 | 49.72 | 0.001 | 0.8 | 3 |
| Profiles 9 | VEV | -4009.91 | 46 | 8181.82 | 8670.46 | -8954.2 | 8332.18 | -9648.95 | - | - | 0.76 | 2 |
# save.image(file="LPAv3.RData")
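# viz2(): prints the mclust summary for a fitted model, then plots class means of the standardized
# indicators (with SE bars), density plots of each indicator by class, per-class jittered
# scatterplots, and the density of posterior classification probabilities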
viz2 <- function(orig){
mcm <- orig
print(summary.Mclust(mcm))
d_std$class <- mcm$classification
df_long <- gather(d_std, key = "variable", value = "value", s_std, r_std, e_std)
summ <- summarySE(df_long, measurevar="value", groupvars = c("variable","class"), na.rm = T) #the summarySE() command creates a table with the information we need for our plot
# Create a barplot of means for each column by class
means <- ggplot(summ, aes(x = class, y = value, fill = variable)) +
geom_bar(stat = "summary", fun = "mean", position = "dodge") +
geom_errorbar(aes(ymin=value-se, ymax=value+se),
width=.2,
position=position_dodge(.9))
print(means)
df_long$class <- as.factor(df_long$class)
dist <- df_long %>%
ggplot(aes(x=value, fill=class)) +
geom_density(alpha=0.4, bw=.20) +
geom_vline(xintercept=0, size=.5, color="black") +
facet_wrap(~variable) +
xlim(-4.5,2)
print(dist)
vars <- unique(df_long$variable)
n <- 1
for (i in 1:length(vars)) {
print(vars[n])
print(subset(df_long, variable == vars[n]) %>%
ggplot(aes(x=value, fill=class)) +
geom_density(alpha=0.4, bw=.20) +
geom_vline(xintercept=0, size=.5, color="black") +
facet_wrap(~class) +
xlim(-4.5,2))
n <- n + 1
}
n <- 1
for (i in 1:length(unique(df_long$class))) {
df_long$class_spec <- 0
df_long$class_spec[d_std$class == n] <- 1
df_long$class_spec <- as.factor(df_long$class_spec)
scatter <- ggplot(df_long, aes(x = variable, y = value, color = class_spec)) +
geom_jitter()
print(paste("Class",n))
print(scatter)
n <- n + 1
}
class <- mcm$classification
uncer <- mcm$uncertainty
posterior_prob <- 1 - round(uncer, 5)
density <- plot(density(posterior_prob))
print(density)
}
viz2(m2)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 2 components:
##
## log-likelihood n df BIC ICL
## -4285.58 1133 17 -8690.714 -8738.077
##
## Clustering table:
## 1 2
## 1001 132
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## NULL
viz2(m3)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 3 components:
##
## log-likelihood n df BIC ICL
## -4218.65 1133 25 -8613.116 -9068.174
##
## Clustering table:
## 1 2 3
## 634 374 125
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## NULL
viz2(m4)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 4 components:
##
## log-likelihood n df BIC ICL
## -4128.839 1133 33 -8489.754 -9145.533
##
## Clustering table:
## 1 2 3 4
## 247 251 516 119
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## [1] "Class 4"
## NULL
viz2(m5)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VII (spherical, varying volume) model with 5 components:
##
## log-likelihood n df BIC ICL
## -4123.645 1133 24 -8416.073 -8812.255
##
## Clustering table:
## 1 2 3 4 5
## 226 179 630 65 33
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## [1] "Class 4"
## [1] "Class 5"
## NULL
viz2(m6)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VII (spherical, varying volume) model with 6 components:
##
## log-likelihood n df BIC ICL
## -4099.492 1133 29 -8402.93 -8845.879
##
## Clustering table:
## 1 2 3 4 5 6
## 231 198 554 78 39 33
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## [1] "Class 4"
## [1] "Class 5"
## [1] "Class 6"
## NULL
viz2(m7)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VII (spherical, varying volume) model with 7 components:
##
## log-likelihood n df BIC ICL
## -4080.037 1133 34 -8399.183 -9003.025
##
## Clustering table:
## 1 2 3 4 5 6 7
## 245 184 175 180 266 50 33
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## [1] "Class 4"
## [1] "Class 5"
## [1] "Class 6"
## [1] "Class 7"
## NULL
viz2(m8)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VII (spherical, varying volume) model with 8 components:
##
## log-likelihood n df BIC ICL
## -4045.21 1133 39 -8364.693 -8957.662
##
## Clustering table:
## 1 2 3 4 5 6 7 8
## 251 212 131 24 168 264 50 33
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## [1] "Class 4"
## [1] "Class 5"
## [1] "Class 6"
## [1] "Class 7"
## [1] "Class 8"
## NULL
The six-class model seems strongest so far: it balances the number of classes against entropy. It does have two groups with small numbers of participants, but they appear distinct, and the other fit indices are good.
d_std$class <- as.factor(m6$classification)
d_std$pp <- 1 - round(m6$uncertainty, 3)
d_fin <- d_std
d_fin$class_fin <- d_fin$class
# d_fin$class_fin <- "un"
# d_fin$class_fin[d_fin$pp > .69] <- d_fin$class[d_fin$pp > .69]
table(d_fin$class_fin, useNA = "always")
##
## 1 2 3 4 5 6 <NA>
## 231 198 554 78 39 33 0
table(d_fin$class_fin, d_fin$class, useNA = "always")
##
## 1 2 3 4 5 6 <NA>
## 1 231 0 0 0 0 0 0
## 2 0 198 0 0 0 0 0
## 3 0 0 554 0 0 0 0
## 4 0 0 0 78 0 0 0
## 5 0 0 0 0 39 0 0
## 6 0 0 0 0 0 33 0
## <NA> 0 0 0 0 0 0 0
table(d_fin$class_fin, round(d_fin$pp, 1), useNA = "always")
##
## 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1 <NA>
## 1 0 2 9 26 21 26 36 111 0
## 2 0 0 11 15 1 17 44 110 0
## 3 1 3 32 53 76 113 234 42 0
## 4 0 2 4 6 11 10 14 31 0
## 5 0 1 4 4 6 13 10 1 0
## 6 0 0 0 0 0 0 7 26 0
## <NA> 0 0 0 0 0 0 0 0 0
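# re-plot profile means and indicator distributions using the final class assignment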
df_long <- gather(d_fin, key = "variable", value = "value", s_std, r_std, e_std)
summ <- summarySE(df_long, measurevar="value", groupvars = c("variable","class_fin"), na.rm = T) #the summarySE() command creates a table with the information we need for our plot
# Create a barplot of means for each column by class
ggplot(summ, aes(x = class_fin, y = value, fill = variable)) +
geom_bar(stat = "summary", fun = "mean", position = "dodge") +
geom_errorbar(aes(ymin=value-se, ymax=value+se),
width=.2,
position=position_dodge(.9))
df_long$class_fin <- as.factor(df_long$class_fin)
df_long %>%
ggplot(aes(x=value, fill=class)) +
geom_density(alpha=0.4, bw=.20) +
geom_vline(xintercept=0, size=.5, color="black") +
facet_wrap(~variable) +
xlim(-4.5,2)
vars <- unique(df_long$variable)
n <- 1
for (i in 1:length(vars)) {
print(vars[n])
print(subset(df_long, variable == vars[n]) %>%
ggplot(aes(x=value, fill=class_fin)) +
geom_density(alpha=0.4, bw=.20) +
geom_vline(xintercept=0, size=.5, color="black") +
facet_wrap(~class_fin) +
xlim(-4.5,2))
n <- n + 1
}
## [1] "s_std"
## [1] "r_std"
## [1] "e_std"
n <- 1
for (i in 1:length(unique(df_long$class_fin))) {
df_long$class_spec <- 0
df_long$class_spec[df_long$class_fin == n] <- 1
df_long$class_spec <- as.factor(df_long$class_spec)
scatter <- ggplot(df_long, aes(x = variable, y = value, color = class_spec)) +
geom_jitter()
print(paste("Class",n))
print(scatter)
n <- n + 1
}
## [1] "Class 1"
## [1] "Class 2"
## [1] "Class 3"
## [1] "Class 4"
## [1] "Class 5"
## [1] "Class 6"
n <- 1
for (i in 1:length(unique(na.omit(df_long$class_fin)))) {
pp <- subset(d_fin, select=c(pp), class_fin == n)
density <- plot(density(pp$pp))
print(paste("Class",n))
print(density)
n <- n + 1
}
## [1] "Class 1"
## NULL
## [1] "Class 2"
## NULL
## [1] "Class 3"
## NULL
## [1] "Class 4"
## NULL
## [1] "Class 5"
## NULL
## [1] "Class 6"
## NULL
# classes <- cbind.data.frame(
# subset(d_fin, select=c(X, s_std,r_std,e_std)),
# subset(d_raw, select=c(s,r,e)),
# subset(d_fin, select=c(class,class_fin,pp)))
#
# write.csv(classes, file="classes.csv", row.names = F)