At present, MPlus is a widely-used tool to carry out Latent Profile Analysis, and there does not seem to be a widely-accepted or used way to carry out Latent Profile Analysis in R. This compares output from MPlus to output from the R package MCLUST, which is accessed through the package tidymixmod which I started to work on to make estimating common models used for LPA easier to carry out using MCLUST.
First, let’s load a few packages and setup.
library(tidyverse)
library(MplusAutomation)
library(stringr)
# if tidymixmod is not installed, can use the line below to install
# devtools::install_github("jrosen48/tidymixmod")
library(tidymixmod)
the_dir <- "/Users/joshuarosenberg/Dropbox/5_Professional/homepage/source/static/_media/data/"
Next, we’ll write a few functions to view MPlus summary statistics from the output.
extract_mplus_summary <- function(x) {
x$summaries[c("LL", "BIC", "Entropy")]
}
extract_mplus_paramaters <- function(x) {
y <- x$parameters[[1]]
y <- y[-nrow(y), ]
dplyr::select(y, param_name = paramHeader, var_name = param, est, se, class = LatentClass)
}
Here we are running MPlus models using MPlusAutomation. The specification of each of the three models is available from the post here.
runModels(paste0(the_dir, "2-iris-LPA_means.inp"))
runModels(paste0(the_dir, "2-iris-LPA_means_correlated.inp"))
runModels(paste0(the_dir, "2-iris-LPA_means_correlated_free_variances.inp"))
m1_mplus_out <- readModels(paste0(the_dir, "2-iris-LPA_means.out"))
m2_mplus_out <- readModels(paste0(the_dir, "2-iris-LPA_means_correlated.out"))
m3_mplus_out <- readModels(paste0(the_dir, "2-iris-LPA_means_correlated_free_variances.out"))
m1_mplus <- m1_mplus_out %>%
extract_mplus_paramaters() %>%
mutate(class = paste0("class_", class)) %>%
select(-se) %>%
spread(class, est)
m2_mplus <- m2_mplus_out %>%
extract_mplus_paramaters() %>%
mutate(class = paste0("class_", class)) %>%
select(-se) %>%
spread(class, est)
m3_mplus <- m3_mplus_out %>%
extract_mplus_paramaters() %>%
mutate(class = paste0("class_", class)) %>%
select(-se) %>%
spread(class, est)
Here we are running MCLUST models using the R package tidymixmod, which interfaces to the R package MCLUST.
m1_mclust <- create_profiles_mclust(iris[, -5], n_profiles = 2, model_name = "constrained_variance", to_return = "mclust")
m2_mclust <- create_profiles_mclust(iris[, -5], n_profiles = 2, model_name = "constrained_variance_and_covariance", to_return = "mclust")
m3_mclust <- create_profiles_mclust(iris[, -5], n_profiles = 2, model_name = "freed_variance_and_covariance", to_return = "mclust")
LL is log-likelihood
list(m1_mplus_out, m2_mplus_out, m3_mplus_out) %>%
map_df(extract_mplus_summary) %>%
mutate(model = paste0("m", 1:3),
method = "MPlus") %>%
select(method, model, everything())
## method model LL BIC Entropy
## 1 MPlus m1 -488.915 1042.968 0.991
## 2 MPlus m2 -296.448 688.097 1.000
## 3 MPlus m3 -214.355 574.018 1.000
list(m1_mclust, m2_mclust, m3_mclust) %>%
map_df(extract_mclust_summary) %>%
mutate(model = paste0("m", 1:3),
method = "MCLUST") %>%
select(method, model, everything())
## method model LL BIC Entropy
## 1 MCLUST m1 -488.915 1042.968 0.998
## 2 MCLUST m2 -296.448 688.097 1.000
## 3 MCLUST m3 -214.355 574.018 1.000
There is a slight difference in the entropy statistic for the first model (means with constrained variances), possibly due to a rounding error from the MPlus output.
m1_m <- extract_means(m1_mclust) %>%
mutate(class_1 = round(class_1, 3),
class_2 = round(class_2, 3))
m1_c1_v <- extract_variance(m1_mclust, 1)
m1_c2_v <- extract_variance(m1_mclust, 2)
# MPlus output
m1_mplus %>%
mutate(model = "MPlus") %>%
select(model, everything()) %>%
arrange(param_name, desc(class_1))
## model param_name var_name class_1 class_2
## 1 MPlus Means SEPAL_LENG 5.006 6.265
## 2 MPlus Means SEPAL_WIDT 3.423 2.873
## 3 MPlus Means PETAL_LENG 1.471 4.911
## 4 MPlus Means PETAL_WIDT 0.250 1.678
## 5 MPlus Variances PETAL_LENG 0.459 0.459
## 6 MPlus Variances SEPAL_LENG 0.328 0.328
## 7 MPlus Variances PETAL_WIDT 0.123 0.123
## 8 MPlus Variances SEPAL_WIDT 0.121 0.121
# MCLUST output
bind_rows(m1_c1_v, m1_c2_v) %>%
spread(class, est) %>%
bind_rows(m1_m) %>%
mutate(model = "MCLUST") %>%
select(model, everything()) %>%
arrange(param_name, desc(class_1)) %>%
mutate(var_name = toupper(var_name),
var_name = str_sub(var_name, end = 10),
var_name = str_replace(var_name, "\\.", "_"))
## # A tibble: 8 x 5
## model param_name var_name class_1 class_2
## <chr> <chr> <chr> <dbl> <dbl>
## 1 MCLUST Means SEPAL_LENG 5.006 6.265
## 2 MCLUST Means SEPAL_WIDT 3.423 2.873
## 3 MCLUST Means PETAL_LENG 1.471 4.911
## 4 MCLUST Means PETAL_WIDT 0.250 1.678
## 5 MCLUST Variances PETAL_LENG 0.459 0.459
## 6 MCLUST Variances SEPAL_LENG 0.328 0.328
## 7 MCLUST Variances PETAL_WIDT 0.123 0.123
## 8 MCLUST Variances SEPAL_WIDT 0.121 0.121
These seem to be identical.
m2_m <- extract_means(m2_mclust) %>%
mutate(class_1 = round(class_1, 3),
class_2 = round(class_2, 3))
m2_c1_v <- extract_variance(m2_mclust, 1)
m2_c2_v <- extract_variance(m2_mclust, 2)
m2_c1_cv <- extract_covariance(m2_mclust, 1) %>%
semi_join(m2_mplus, by = c("param_name", "var_name")) %>%
mutate(class = "class_1")
m2_c2_cv <- extract_covariance(m2_mclust, 2) %>%
semi_join(m2_mplus, by = c("param_name", "var_name")) %>%
mutate(class = "class_2")
m2_cv <- bind_rows(m2_c1_cv, m2_c2_cv) %>%
mutate(est = round(est, 3),
model = "MCLUST") %>%
spread(class, est)
# MPlus output
m2_mplus %>%
mutate(model = "MPlus") %>%
select(model, everything()) %>%
arrange(param_name, desc(class_1))
## model param_name var_name class_1 class_2
## 1 MPlus Means SEPAL_LENG 5.006 6.262
## 2 MPlus Means SEPAL_WIDT 3.428 2.872
## 3 MPlus Means PETAL_LENG 1.462 4.906
## 4 MPlus Means PETAL_WIDT 0.246 1.676
## 5 MPlus PETAL_LE.WITH PETAL_WIDT 0.193 0.193
## 6 MPlus SEPAL_LE.WITH PETAL_LENG 0.305 0.305
## 7 MPlus SEPAL_LE.WITH PETAL_WIDT 0.114 0.114
## 8 MPlus SEPAL_LE.WITH SEPAL_WIDT 0.113 0.113
## 9 MPlus SEPAL_WI.WITH PETAL_LENG 0.098 0.098
## 10 MPlus SEPAL_WI.WITH PETAL_WIDT 0.056 0.056
## 11 MPlus Variances PETAL_LENG 0.460 0.460
## 12 MPlus Variances SEPAL_LENG 0.331 0.331
## 13 MPlus Variances PETAL_WIDT 0.123 0.123
## 14 MPlus Variances SEPAL_WIDT 0.120 0.120
# MCLUST output
bind_rows(m2_c1_v, m2_c2_v) %>%
spread(class, est) %>%
bind_rows(m1_m) %>%
mutate(model = "MCLUST") %>%
select(model, everything()) %>%
bind_rows(m2_cv) %>%
arrange(param_name) %>%
arrange(param_name, desc(class_1)) %>%
mutate(var_name = toupper(var_name),
var_name = str_sub(var_name, end = 10),
var_name = str_replace(var_name, "\\.", "_"))
## # A tibble: 14 x 5
## model param_name var_name class_1 class_2
## <chr> <chr> <chr> <dbl> <dbl>
## 1 MCLUST Means SEPAL_LENG 5.006 6.265
## 2 MCLUST Means SEPAL_WIDT 3.423 2.873
## 3 MCLUST Means PETAL_LENG 1.471 4.911
## 4 MCLUST Means PETAL_WIDT 0.250 1.678
## 5 MCLUST PETAL_LE.WITH PETAL_WIDT 0.193 0.193
## 6 MCLUST SEPAL_LE.WITH PETAL_LENG 0.305 0.305
## 7 MCLUST SEPAL_LE.WITH PETAL_WIDT 0.114 0.114
## 8 MCLUST SEPAL_LE.WITH SEPAL_WIDT 0.113 0.113
## 9 MCLUST SEPAL_WI.WITH PETAL_LENG 0.098 0.098
## 10 MCLUST SEPAL_WI.WITH PETAL_WIDT 0.056 0.056
## 11 MCLUST Variances PETAL_LENG 0.460 0.460
## 12 MCLUST Variances SEPAL_LENG 0.331 0.331
## 13 MCLUST Variances PETAL_WIDT 0.123 0.123
## 14 MCLUST Variances SEPAL_WIDT 0.120 0.120
There seem to be a few differences in the hundreths place.
m3_m <- extract_means(m3_mclust) %>%
mutate(class_1 = round(class_1, 3),
class_2 = round(class_2, 3))
m3_c1_v <- extract_variance(m3_mclust, 1)
m3_c2_v <- extract_variance(m3_mclust, 2)
m3_c1_cv <- extract_covariance(m3_mclust, 1) %>%
semi_join(m3_mplus, by = c("param_name", "var_name")) %>%
mutate(class = "class_1")
m3_c2_cv <- extract_covariance(m3_mclust, 2) %>%
semi_join(m3_mplus, by = c("param_name", "var_name")) %>%
mutate(class = "class_2")
m3_cv <- bind_rows(m3_c1_cv, m3_c2_cv) %>%
mutate(est = round(est, 3),
model = "MCLUST") %>%
spread(class, est)
# MPlus output
m3_mplus %>%
mutate(model = "MPlus") %>%
select(model, everything()) %>%
arrange(param_name, desc(class_1))
## model param_name var_name class_1 class_2
## 1 MPlus Means SEPAL_LENG 5.006 6.262
## 2 MPlus Means SEPAL_WIDT 3.428 2.872
## 3 MPlus Means PETAL_LENG 1.462 4.906
## 4 MPlus Means PETAL_WIDT 0.246 1.676
## 5 MPlus PETAL_LE.WITH PETAL_WIDT 0.006 0.286
## 6 MPlus SEPAL_LE.WITH SEPAL_WIDT 0.097 0.121
## 7 MPlus SEPAL_LE.WITH PETAL_LENG 0.016 0.449
## 8 MPlus SEPAL_LE.WITH PETAL_WIDT 0.010 0.166
## 9 MPlus SEPAL_WI.WITH PETAL_LENG 0.011 0.141
## 10 MPlus SEPAL_WI.WITH PETAL_WIDT 0.009 0.079
## 11 MPlus Variances SEPAL_WIDT 0.141 0.110
## 12 MPlus Variances SEPAL_LENG 0.122 0.435
## 13 MPlus Variances PETAL_LENG 0.030 0.675
## 14 MPlus Variances PETAL_WIDT 0.011 0.179
# MCLUST output
bind_rows(m1_c1_v, m1_c2_v) %>%
spread(class, est) %>%
bind_rows(m1_m) %>%
mutate(model = "MCLUST") %>%
select(model, everything()) %>%
bind_rows(m3_cv) %>%
arrange(param_name) %>%
arrange(param_name, desc(class_1)) %>%
mutate(var_name = toupper(var_name),
var_name = str_sub(var_name, end = 10),
var_name = str_replace(var_name, "\\.", "_"))
## # A tibble: 14 x 5
## model param_name var_name class_1 class_2
## <chr> <chr> <chr> <dbl> <dbl>
## 1 MCLUST Means SEPAL_LENG 5.006 6.265
## 2 MCLUST Means SEPAL_WIDT 3.423 2.873
## 3 MCLUST Means PETAL_LENG 1.471 4.911
## 4 MCLUST Means PETAL_WIDT 0.250 1.678
## 5 MCLUST PETAL_LE.WITH PETAL_WIDT 0.006 0.286
## 6 MCLUST SEPAL_LE.WITH SEPAL_WIDT 0.097 0.121
## 7 MCLUST SEPAL_LE.WITH PETAL_LENG 0.016 0.449
## 8 MCLUST SEPAL_LE.WITH PETAL_WIDT 0.010 0.166
## 9 MCLUST SEPAL_WI.WITH PETAL_LENG 0.011 0.141
## 10 MCLUST SEPAL_WI.WITH PETAL_WIDT 0.009 0.079
## 11 MCLUST Variances PETAL_LENG 0.459 0.459
## 12 MCLUST Variances SEPAL_LENG 0.328 0.328
## 13 MCLUST Variances PETAL_WIDT 0.123 0.123
## 14 MCLUST Variances SEPAL_WIDT 0.121 0.121
These seem to be a bit different at the tenths or hundreths decimal point.
At least for these simple models, the output appears to be identical, with slight differences in the entropy statistic and some of the parameter estimates for the third models.
More complex models may be different:
Future directions include: