1. Data (Mplus example data set)
bdata<- read.table("ex7.4.dat")
colnames(bdata) <- c("u1", "u2", "u3", "u4","c")
str(bdata)
## 'data.frame': 500 obs. of 5 variables:
## $ u1: num 0 1 0 1 1 0 1 1 0 0 ...
## $ u2: num 1 0 0 1 1 1 1 1 0 0 ...
## $ u3: num 0 1 1 1 1 1 0 1 0 1 ...
## $ u4: num 1 0 1 0 0 0 0 0 0 1 ...
## $ c : int 2 2 1 2 2 1 2 2 1 1 ...
mydata <- bdata %>% dplyr::select(u1,u2,u3,u4)
str(mydata)
## 'data.frame': 500 obs. of 4 variables:
## $ u1: num 0 1 0 1 1 0 1 1 0 0 ...
## $ u2: num 1 0 0 1 1 1 1 1 0 0 ...
## $ u3: num 0 1 1 1 1 1 0 1 0 1 ...
## $ u4: num 1 0 1 0 0 0 0 0 0 1 ...
# poLCA expected lowest categry to be a 1, so adding 1 to each values so it is 1/2
mydata$u1<-mydata$u1+1
mydata$u2<-mydata$u2+1
mydata$u3<-mydata$u3+1
mydata$u4<-mydata$u4+1
str(mydata)
## 'data.frame': 500 obs. of 4 variables:
## $ u1: num 1 2 1 2 2 1 2 2 1 1 ...
## $ u2: num 2 1 1 2 2 2 2 2 1 1 ...
## $ u3: num 1 2 2 2 2 2 1 2 1 2 ...
## $ u4: num 2 1 2 1 1 1 1 1 1 2 ...
2. Run LCA iteratively
# https://statistics.ohlsen-web.de/latent-class-analysis-polca/
# these are the defaults of the poLCA command
#poLCA(formula, data, nclass=2, maxiter=1000, graphs=FALSE, tol=1e-10, na.rm=TRUE, probs.start=NULL, nrep=1, verbose=TRUE, calc.se=TRUE)
f<-with(mydata, cbind(u1,u2,u3,u4)~1)
f
## cbind(u1, u2, u3, u4) ~ 1
## <environment: 0x12b2a2038>
#lc1<-poLCA(f, data=mydata, nclass=1, na.rm = FALSE) #Loglinear independence model.
#lc2<-poLCA(f, data=mydata, nclass=2, na.rm = FALSE)
#lc3<-poLCA(f, data=mydata, nclass=3, na.rm = FALSE)
#------ run a sequence of models with 1-3 classes and print out the model with the lowest BIC
max_II <- -100000
min_bic <- 100000
for(i in 1:3){
lc <- poLCA(f, mydata, nclass=i, maxiter=9000,
tol=1e-5, na.rm=FALSE,
nrep=10, verbose=TRUE, calc.se=TRUE)
if(lc$bic < min_bic){
min_bic <- lc$bic
LCA_best_model<-lc
}
}
## Conditional item response (column) probabilities,
## by outcome variable, for each class (row)
##
## $u1
## Pr(1) Pr(2)
## class 1: 0.5 0.5
##
## $u2
## Pr(1) Pr(2)
## class 1: 0.502 0.498
##
## $u3
## Pr(1) Pr(2)
## class 1: 0.504 0.496
##
## $u4
## Pr(1) Pr(2)
## class 1: 0.554 0.446
##
## Estimated class population shares
## 1
##
## Predicted class memberships (by modal posterior prob.)
## 1
##
## =========================================================
## Fit for 1 latent classes:
## =========================================================
## number of observations: 500
## number of estimated parameters: 4
## residual degrees of freedom: 11
## maximum log-likelihood: -1383.353
##
## AIC(1): 2774.705
## BIC(1): 2791.564
## G^2(1): 129.0224 (Likelihood ratio/deviance statistic)
## X^2(1): 144.6614 (Chi-square goodness of fit)
##
## Model 1: llik = -1325.213 ... best llik = -1325.213
## Model 2: llik = -1325.213 ... best llik = -1325.213
## Model 3: llik = -1325.213 ... best llik = -1325.213
## Model 4: llik = -1325.213 ... best llik = -1325.213
## Model 5: llik = -1325.213 ... best llik = -1325.213
## Model 6: llik = -1325.213 ... best llik = -1325.213
## Model 7: llik = -1325.213 ... best llik = -1325.213
## Model 8: llik = -1325.213 ... best llik = -1325.213
## Model 9: llik = -1325.213 ... best llik = -1325.213
## Model 10: llik = -1325.213 ... best llik = -1325.213
## Conditional item response (column) probabilities,
## by outcome variable, for each class (row)
##
## $u1
## Pr(1) Pr(2)
## class 1: 0.2160 0.7840
## class 2: 0.7719 0.2281
##
## $u2
## Pr(1) Pr(2)
## class 1: 0.2459 0.7541
## class 2: 0.7472 0.2528
##
## $u3
## Pr(1) Pr(2)
## class 1: 0.7296 0.2704
## class 2: 0.2881 0.7119
##
## $u4
## Pr(1) Pr(2)
## class 1: 0.7406 0.2594
## class 2: 0.3754 0.6246
##
## Estimated class population shares
## 0.4891 0.5109
##
## Predicted class memberships (by modal posterior prob.)
## 0.456 0.544
##
## =========================================================
## Fit for 2 latent classes:
## =========================================================
## number of observations: 500
## number of estimated parameters: 9
## residual degrees of freedom: 6
## maximum log-likelihood: -1325.213
##
## AIC(2): 2668.426
## BIC(2): 2706.357
## G^2(2): 12.7426 (Likelihood ratio/deviance statistic)
## X^2(2): 12.60994 (Chi-square goodness of fit)
##
## Model 1: llik = -1320.351 ... best llik = -1320.351
## Model 2: llik = -1320.351 ... best llik = -1320.351
## Model 3: llik = -1321.814 ... best llik = -1320.351
## Model 4: llik = -1320.351 ... best llik = -1320.351
## Model 5: llik = -1320.351 ... best llik = -1320.351
## Model 6: llik = -1320.351 ... best llik = -1320.351
## Model 7: llik = -1321.814 ... best llik = -1320.351
## Model 8: llik = -1320.351 ... best llik = -1320.351
## Model 9: llik = -1320.351 ... best llik = -1320.351
## Model 10: llik = -1320.351 ... best llik = -1320.351
## Conditional item response (column) probabilities,
## by outcome variable, for each class (row)
##
## $u1
## Pr(1) Pr(2)
## class 1: 0.0011 0.9989
## class 2: 0.2581 0.7419
## class 3: 0.7237 0.2763
##
## $u2
## Pr(1) Pr(2)
## class 1: 0.0102 0.9898
## class 2: 0.2781 0.7219
## class 3: 0.7138 0.2862
##
## $u3
## Pr(1) Pr(2)
## class 1: 0.9999 0.0001
## class 2: 0.6715 0.3285
## class 3: 0.3253 0.6747
##
## $u4
## Pr(1) Pr(2)
## class 1: 0.2568 0.7432
## class 2: 1.0000 0.0000
## class 3: 0.3343 0.6657
##
## Estimated class population shares
## 0.0901 0.3406 0.5693
##
## Predicted class memberships (by modal posterior prob.)
## 0.076 0.336 0.588
##
## =========================================================
## Fit for 3 latent classes:
## =========================================================
## number of observations: 500
## number of estimated parameters: 14
## residual degrees of freedom: 1
## maximum log-likelihood: -1320.351
##
## AIC(3): 2668.702
## BIC(3): 2727.706
## G^2(3): 3.018618 (Likelihood ratio/deviance statistic)
## X^2(3): 3.064368 (Chi-square goodness of fit)
##
LCA_best_model
## Conditional item response (column) probabilities,
## by outcome variable, for each class (row)
##
## $u1
## Pr(1) Pr(2)
## class 1: 0.2160 0.7840
## class 2: 0.7719 0.2281
##
## $u2
## Pr(1) Pr(2)
## class 1: 0.2459 0.7541
## class 2: 0.7472 0.2528
##
## $u3
## Pr(1) Pr(2)
## class 1: 0.7296 0.2704
## class 2: 0.2881 0.7119
##
## $u4
## Pr(1) Pr(2)
## class 1: 0.7406 0.2594
## class 2: 0.3754 0.6246
##
## Estimated class population shares
## 0.4891 0.5109
##
## Predicted class memberships (by modal posterior prob.)
## 0.456 0.544
##
## =========================================================
## Fit for 2 latent classes:
## =========================================================
## number of observations: 500
## number of estimated parameters: 9
## residual degrees of freedom: 6
## maximum log-likelihood: -1325.213
##
## AIC(2): 2668.426
## BIC(2): 2706.357
## G^2(2): 12.7426 (Likelihood ratio/deviance statistic)
## X^2(2): 12.60994 (Chi-square goodness of fit)
##
3. Use seed for reproducibility results - seeds could be
anything
## models using seed to get reproducible results each time:
#visualize how people will respond to each class membership and mirror image of item endorsemetn
#less people are to endorse 1 and 2 in a 3 factor solution
set.seed(052221)
lc1<-poLCA(f, data=mydata, nclass=1, na.rm = FALSE, nrep=10, maxiter=3000) #Loglinear independence model.
## Conditional item response (column) probabilities,
## by outcome variable, for each class (row)
##
## $u1
## Pr(1) Pr(2)
## class 1: 0.5 0.5
##
## $u2
## Pr(1) Pr(2)
## class 1: 0.502 0.498
##
## $u3
## Pr(1) Pr(2)
## class 1: 0.504 0.496
##
## $u4
## Pr(1) Pr(2)
## class 1: 0.554 0.446
##
## Estimated class population shares
## 1
##
## Predicted class memberships (by modal posterior prob.)
## 1
##
## =========================================================
## Fit for 1 latent classes:
## =========================================================
## number of observations: 500
## number of estimated parameters: 4
## residual degrees of freedom: 11
## maximum log-likelihood: -1383.353
##
## AIC(1): 2774.705
## BIC(1): 2791.564
## G^2(1): 129.0224 (Likelihood ratio/deviance statistic)
## X^2(1): 144.6614 (Chi-square goodness of fit)
##
lc2<-poLCA(f, data=mydata, nclass=2, na.rm = FALSE, nrep=10, maxiter=3000, graph=T)
## Model 1: llik = -1325.213 ... best llik = -1325.213
## Model 2: llik = -1325.213 ... best llik = -1325.213
## Model 3: llik = -1325.213 ... best llik = -1325.213
## Model 4: llik = -1325.213 ... best llik = -1325.213
## Model 5: llik = -1325.213 ... best llik = -1325.213
## Model 6: llik = -1325.213 ... best llik = -1325.213
## Model 7: llik = -1325.213 ... best llik = -1325.213
## Model 8: llik = -1325.213 ... best llik = -1325.213
## Model 9: llik = -1325.213 ... best llik = -1325.213
## Model 10: llik = -1325.213 ... best llik = -1325.213

## Conditional item response (column) probabilities,
## by outcome variable, for each class (row)
##
## $u1
## Pr(1) Pr(2)
## class 1: 0.7722 0.2278
## class 2: 0.2164 0.7836
##
## $u2
## Pr(1) Pr(2)
## class 1: 0.7476 0.2524
## class 2: 0.2462 0.7538
##
## $u3
## Pr(1) Pr(2)
## class 1: 0.2878 0.7122
## class 2: 0.7292 0.2708
##
## $u4
## Pr(1) Pr(2)
## class 1: 0.3750 0.6250
## class 2: 0.7404 0.2596
##
## Estimated class population shares
## 0.5102 0.4898
##
## Predicted class memberships (by modal posterior prob.)
## 0.544 0.456
##
## =========================================================
## Fit for 2 latent classes:
## =========================================================
## number of observations: 500
## number of estimated parameters: 9
## residual degrees of freedom: 6
## maximum log-likelihood: -1325.213
##
## AIC(2): 2668.425
## BIC(2): 2706.357
## G^2(2): 12.74247 (Likelihood ratio/deviance statistic)
## X^2(2): 12.61124 (Chi-square goodness of fit)
##
lc3<-poLCA(f, data=mydata, nclass=3, na.rm = FALSE, nrep=10, maxiter=10000, graph=T)
## Model 1: llik = -1320.349 ... best llik = -1320.349
## Model 2: llik = -1320.349 ... best llik = -1320.349
## Model 3: llik = -1320.349 ... best llik = -1320.349
## Model 4: llik = -1321.813 ... best llik = -1320.349
## Model 5: llik = -1320.349 ... best llik = -1320.349
## Model 6: llik = -1320.349 ... best llik = -1320.349
## Model 7: llik = -1320.349 ... best llik = -1320.349
## Model 8: llik = -1320.349 ... best llik = -1320.349
## Model 9: llik = -1321.813 ... best llik = -1320.349
## Model 10: llik = -1320.349 ... best llik = -1320.349

## Conditional item response (column) probabilities,
## by outcome variable, for each class (row)
##
## $u1
## Pr(1) Pr(2)
## class 1: 0.7227 0.2773
## class 2: 0.0000 1.0000
## class 3: 0.2512 0.7488
##
## $u2
## Pr(1) Pr(2)
## class 1: 0.7130 0.2870
## class 2: 0.0035 0.9965
## class 3: 0.2724 0.7276
##
## $u3
## Pr(1) Pr(2)
## class 1: 0.3261 0.6739
## class 2: 1.0000 0.0000
## class 3: 0.6787 0.3213
##
## $u4
## Pr(1) Pr(2)
## class 1: 0.3374 0.6626
## class 2: 0.2188 0.7812
## class 3: 1.0000 0.0000
##
## Estimated class population shares
## 0.5729 0.085 0.3421
##
## Predicted class memberships (by modal posterior prob.)
## 0.588 0.076 0.336
##
## =========================================================
## Fit for 3 latent classes:
## =========================================================
## number of observations: 500
## number of estimated parameters: 14
## residual degrees of freedom: 1
## maximum log-likelihood: -1320.349
##
## AIC(3): 2668.698
## BIC(3): 2727.703
## G^2(3): 3.015552 (Likelihood ratio/deviance statistic)
## X^2(3): 3.069606 (Chi-square goodness of fit)
##
4. Table of Fits
# generate dataframe with fit-values
# See https://statistics.ohlsen-web.de/latent-class-analysis-polca/
#code and calucate the entropy by hand
results <- data.frame(Modell=c("Model 1"),
log_likelihood=lc1$llik,
df = lc1$resid.df,
BIC=lc1$bic)
results$Modell<-as.integer(results$Modell)
## Warning: NAs introduced by coercion
results[1,1]<-c("Model 1")
results[2,1]<-c("Model 2")
results[3,1]<-c("Model 3")
results[2,2]<-lc2$llik
results[3,2]<-lc3$llik
results[2,3]<-lc2$resid.df
results[3,3]<-lc3$resid.df
results[2,4]<-lc2$bic
results[3,4]<-lc3$bic
entropy<-function (p) sum(-p*log(p))
results$R2_entropy
## NULL
results[1,5]<-c("-")
error_prior<-entropy(lc2$P) # class proportions model 2
error_post<-mean(apply(lc2$posterior,1, entropy),na.rm = TRUE)
results[2,5]<-round(((error_prior-error_post) / error_prior),3)
error_prior<-entropy(lc3$P) # class proportions model 3
error_post<-mean(apply(lc3$posterior,1, entropy),na.rm = TRUE)
results[3,5]<-round(((error_prior-error_post) / error_prior),3)
# combining results to a dataframe
colnames(results)<-c("Model","log-likelihood","resid. df","BIC","Entropy")
lca_results<-results
lca_results
## Model log-likelihood resid. df BIC Entropy
## 1 Model 1 -1383.353 11 2791.564 -
## 2 Model 2 -1325.213 6 2706.357 0.503
## 3 Model 3 -1320.349 1 2727.703 0.644
#Note. MPlus gives higher entropy but otherwise similar results (.701)?!
5. Class memberships
#2 class
#population-shares of the classes, in %
round(colMeans(lc2$posterior)*100,2)
## [1] 51.02 48.98
# estimated class memberships
table(lc2$predclass)
##
## 1 2
## 272 228
#3 class
#population-shares of the classes, in %
round(colMeans(lc3$posterior)*100,2)
## [1] 57.29 8.50 34.21
# estimated class memberships
table(lc3$predclass)
##
## 1 2 3
## 294 38 168