1. Data (Mplus example data set)

bdata<- read.table("ex7.4.dat")
colnames(bdata) <- c("u1", "u2", "u3", "u4","c")
str(bdata)
## 'data.frame':    500 obs. of  5 variables:
##  $ u1: num  0 1 0 1 1 0 1 1 0 0 ...
##  $ u2: num  1 0 0 1 1 1 1 1 0 0 ...
##  $ u3: num  0 1 1 1 1 1 0 1 0 1 ...
##  $ u4: num  1 0 1 0 0 0 0 0 0 1 ...
##  $ c : int  2 2 1 2 2 1 2 2 1 1 ...
mydata <- bdata %>% dplyr::select(u1,u2,u3,u4)
str(mydata)
## 'data.frame':    500 obs. of  4 variables:
##  $ u1: num  0 1 0 1 1 0 1 1 0 0 ...
##  $ u2: num  1 0 0 1 1 1 1 1 0 0 ...
##  $ u3: num  0 1 1 1 1 1 0 1 0 1 ...
##  $ u4: num  1 0 1 0 0 0 0 0 0 1 ...
# poLCA expected lowest categry to be a 1, so adding 1 to each values so it is 1/2
mydata$u1<-mydata$u1+1
mydata$u2<-mydata$u2+1
mydata$u3<-mydata$u3+1
mydata$u4<-mydata$u4+1
str(mydata)
## 'data.frame':    500 obs. of  4 variables:
##  $ u1: num  1 2 1 2 2 1 2 2 1 1 ...
##  $ u2: num  2 1 1 2 2 2 2 2 1 1 ...
##  $ u3: num  1 2 2 2 2 2 1 2 1 2 ...
##  $ u4: num  2 1 2 1 1 1 1 1 1 2 ...

2. Run LCA iteratively

# https://statistics.ohlsen-web.de/latent-class-analysis-polca/
# these are the defaults of the poLCA command
#poLCA(formula, data, nclass=2, maxiter=1000, graphs=FALSE, tol=1e-10, na.rm=TRUE, probs.start=NULL, nrep=1, verbose=TRUE, calc.se=TRUE)
f<-with(mydata, cbind(u1,u2,u3,u4)~1) 
f
## cbind(u1, u2, u3, u4) ~ 1
## <environment: 0x12b2a2038>
#lc1<-poLCA(f, data=mydata, nclass=1, na.rm = FALSE) #Loglinear independence model.
#lc2<-poLCA(f, data=mydata, nclass=2, na.rm = FALSE)
#lc3<-poLCA(f, data=mydata, nclass=3, na.rm = FALSE)

#------ run a sequence of models with 1-3 classes and print out the model with the lowest BIC
max_II <- -100000
min_bic <- 100000
for(i in 1:3){
  lc <- poLCA(f, mydata, nclass=i, maxiter=9000, 
              tol=1e-5, na.rm=FALSE,  
              nrep=10, verbose=TRUE, calc.se=TRUE)
  if(lc$bic < min_bic){
    min_bic <- lc$bic
    LCA_best_model<-lc
  }
}       
## Conditional item response (column) probabilities,
##  by outcome variable, for each class (row) 
##  
## $u1
##           Pr(1) Pr(2)
## class 1:    0.5   0.5
## 
## $u2
##           Pr(1) Pr(2)
## class 1:  0.502 0.498
## 
## $u3
##           Pr(1) Pr(2)
## class 1:  0.504 0.496
## 
## $u4
##           Pr(1) Pr(2)
## class 1:  0.554 0.446
## 
## Estimated class population shares 
##  1 
##  
## Predicted class memberships (by modal posterior prob.) 
##  1 
##  
## ========================================================= 
## Fit for 1 latent classes: 
## ========================================================= 
## number of observations: 500 
## number of estimated parameters: 4 
## residual degrees of freedom: 11 
## maximum log-likelihood: -1383.353 
##  
## AIC(1): 2774.705
## BIC(1): 2791.564
## G^2(1): 129.0224 (Likelihood ratio/deviance statistic) 
## X^2(1): 144.6614 (Chi-square goodness of fit) 
##  
## Model 1: llik = -1325.213 ... best llik = -1325.213
## Model 2: llik = -1325.213 ... best llik = -1325.213
## Model 3: llik = -1325.213 ... best llik = -1325.213
## Model 4: llik = -1325.213 ... best llik = -1325.213
## Model 5: llik = -1325.213 ... best llik = -1325.213
## Model 6: llik = -1325.213 ... best llik = -1325.213
## Model 7: llik = -1325.213 ... best llik = -1325.213
## Model 8: llik = -1325.213 ... best llik = -1325.213
## Model 9: llik = -1325.213 ... best llik = -1325.213
## Model 10: llik = -1325.213 ... best llik = -1325.213
## Conditional item response (column) probabilities,
##  by outcome variable, for each class (row) 
##  
## $u1
##            Pr(1)  Pr(2)
## class 1:  0.2160 0.7840
## class 2:  0.7719 0.2281
## 
## $u2
##            Pr(1)  Pr(2)
## class 1:  0.2459 0.7541
## class 2:  0.7472 0.2528
## 
## $u3
##            Pr(1)  Pr(2)
## class 1:  0.7296 0.2704
## class 2:  0.2881 0.7119
## 
## $u4
##            Pr(1)  Pr(2)
## class 1:  0.7406 0.2594
## class 2:  0.3754 0.6246
## 
## Estimated class population shares 
##  0.4891 0.5109 
##  
## Predicted class memberships (by modal posterior prob.) 
##  0.456 0.544 
##  
## ========================================================= 
## Fit for 2 latent classes: 
## ========================================================= 
## number of observations: 500 
## number of estimated parameters: 9 
## residual degrees of freedom: 6 
## maximum log-likelihood: -1325.213 
##  
## AIC(2): 2668.426
## BIC(2): 2706.357
## G^2(2): 12.7426 (Likelihood ratio/deviance statistic) 
## X^2(2): 12.60994 (Chi-square goodness of fit) 
##  
## Model 1: llik = -1320.351 ... best llik = -1320.351
## Model 2: llik = -1320.351 ... best llik = -1320.351
## Model 3: llik = -1321.814 ... best llik = -1320.351
## Model 4: llik = -1320.351 ... best llik = -1320.351
## Model 5: llik = -1320.351 ... best llik = -1320.351
## Model 6: llik = -1320.351 ... best llik = -1320.351
## Model 7: llik = -1321.814 ... best llik = -1320.351
## Model 8: llik = -1320.351 ... best llik = -1320.351
## Model 9: llik = -1320.351 ... best llik = -1320.351
## Model 10: llik = -1320.351 ... best llik = -1320.351
## Conditional item response (column) probabilities,
##  by outcome variable, for each class (row) 
##  
## $u1
##            Pr(1)  Pr(2)
## class 1:  0.0011 0.9989
## class 2:  0.2581 0.7419
## class 3:  0.7237 0.2763
## 
## $u2
##            Pr(1)  Pr(2)
## class 1:  0.0102 0.9898
## class 2:  0.2781 0.7219
## class 3:  0.7138 0.2862
## 
## $u3
##            Pr(1)  Pr(2)
## class 1:  0.9999 0.0001
## class 2:  0.6715 0.3285
## class 3:  0.3253 0.6747
## 
## $u4
##            Pr(1)  Pr(2)
## class 1:  0.2568 0.7432
## class 2:  1.0000 0.0000
## class 3:  0.3343 0.6657
## 
## Estimated class population shares 
##  0.0901 0.3406 0.5693 
##  
## Predicted class memberships (by modal posterior prob.) 
##  0.076 0.336 0.588 
##  
## ========================================================= 
## Fit for 3 latent classes: 
## ========================================================= 
## number of observations: 500 
## number of estimated parameters: 14 
## residual degrees of freedom: 1 
## maximum log-likelihood: -1320.351 
##  
## AIC(3): 2668.702
## BIC(3): 2727.706
## G^2(3): 3.018618 (Likelihood ratio/deviance statistic) 
## X^2(3): 3.064368 (Chi-square goodness of fit) 
## 
LCA_best_model
## Conditional item response (column) probabilities,
##  by outcome variable, for each class (row) 
##  
## $u1
##            Pr(1)  Pr(2)
## class 1:  0.2160 0.7840
## class 2:  0.7719 0.2281
## 
## $u2
##            Pr(1)  Pr(2)
## class 1:  0.2459 0.7541
## class 2:  0.7472 0.2528
## 
## $u3
##            Pr(1)  Pr(2)
## class 1:  0.7296 0.2704
## class 2:  0.2881 0.7119
## 
## $u4
##            Pr(1)  Pr(2)
## class 1:  0.7406 0.2594
## class 2:  0.3754 0.6246
## 
## Estimated class population shares 
##  0.4891 0.5109 
##  
## Predicted class memberships (by modal posterior prob.) 
##  0.456 0.544 
##  
## ========================================================= 
## Fit for 2 latent classes: 
## ========================================================= 
## number of observations: 500 
## number of estimated parameters: 9 
## residual degrees of freedom: 6 
## maximum log-likelihood: -1325.213 
##  
## AIC(2): 2668.426
## BIC(2): 2706.357
## G^2(2): 12.7426 (Likelihood ratio/deviance statistic) 
## X^2(2): 12.60994 (Chi-square goodness of fit) 
## 

3. Use seed for reproducibility results - seeds could be anything

## models using seed to get reproducible results each time:
#visualize how people will respond to each class membership and mirror image of item endorsemetn 
#less people are to endorse 1 and 2 in a 3 factor solution 
set.seed(052221)
lc1<-poLCA(f, data=mydata, nclass=1, na.rm = FALSE, nrep=10, maxiter=3000) #Loglinear independence model.
## Conditional item response (column) probabilities,
##  by outcome variable, for each class (row) 
##  
## $u1
##           Pr(1) Pr(2)
## class 1:    0.5   0.5
## 
## $u2
##           Pr(1) Pr(2)
## class 1:  0.502 0.498
## 
## $u3
##           Pr(1) Pr(2)
## class 1:  0.504 0.496
## 
## $u4
##           Pr(1) Pr(2)
## class 1:  0.554 0.446
## 
## Estimated class population shares 
##  1 
##  
## Predicted class memberships (by modal posterior prob.) 
##  1 
##  
## ========================================================= 
## Fit for 1 latent classes: 
## ========================================================= 
## number of observations: 500 
## number of estimated parameters: 4 
## residual degrees of freedom: 11 
## maximum log-likelihood: -1383.353 
##  
## AIC(1): 2774.705
## BIC(1): 2791.564
## G^2(1): 129.0224 (Likelihood ratio/deviance statistic) 
## X^2(1): 144.6614 (Chi-square goodness of fit) 
## 
lc2<-poLCA(f, data=mydata, nclass=2, na.rm = FALSE, nrep=10, maxiter=3000, graph=T)
## Model 1: llik = -1325.213 ... best llik = -1325.213
## Model 2: llik = -1325.213 ... best llik = -1325.213
## Model 3: llik = -1325.213 ... best llik = -1325.213
## Model 4: llik = -1325.213 ... best llik = -1325.213
## Model 5: llik = -1325.213 ... best llik = -1325.213
## Model 6: llik = -1325.213 ... best llik = -1325.213
## Model 7: llik = -1325.213 ... best llik = -1325.213
## Model 8: llik = -1325.213 ... best llik = -1325.213
## Model 9: llik = -1325.213 ... best llik = -1325.213
## Model 10: llik = -1325.213 ... best llik = -1325.213

## Conditional item response (column) probabilities,
##  by outcome variable, for each class (row) 
##  
## $u1
##            Pr(1)  Pr(2)
## class 1:  0.7722 0.2278
## class 2:  0.2164 0.7836
## 
## $u2
##            Pr(1)  Pr(2)
## class 1:  0.7476 0.2524
## class 2:  0.2462 0.7538
## 
## $u3
##            Pr(1)  Pr(2)
## class 1:  0.2878 0.7122
## class 2:  0.7292 0.2708
## 
## $u4
##            Pr(1)  Pr(2)
## class 1:  0.3750 0.6250
## class 2:  0.7404 0.2596
## 
## Estimated class population shares 
##  0.5102 0.4898 
##  
## Predicted class memberships (by modal posterior prob.) 
##  0.544 0.456 
##  
## ========================================================= 
## Fit for 2 latent classes: 
## ========================================================= 
## number of observations: 500 
## number of estimated parameters: 9 
## residual degrees of freedom: 6 
## maximum log-likelihood: -1325.213 
##  
## AIC(2): 2668.425
## BIC(2): 2706.357
## G^2(2): 12.74247 (Likelihood ratio/deviance statistic) 
## X^2(2): 12.61124 (Chi-square goodness of fit) 
## 
lc3<-poLCA(f, data=mydata, nclass=3, na.rm = FALSE, nrep=10, maxiter=10000, graph=T)
## Model 1: llik = -1320.349 ... best llik = -1320.349
## Model 2: llik = -1320.349 ... best llik = -1320.349
## Model 3: llik = -1320.349 ... best llik = -1320.349
## Model 4: llik = -1321.813 ... best llik = -1320.349
## Model 5: llik = -1320.349 ... best llik = -1320.349
## Model 6: llik = -1320.349 ... best llik = -1320.349
## Model 7: llik = -1320.349 ... best llik = -1320.349
## Model 8: llik = -1320.349 ... best llik = -1320.349
## Model 9: llik = -1321.813 ... best llik = -1320.349
## Model 10: llik = -1320.349 ... best llik = -1320.349

## Conditional item response (column) probabilities,
##  by outcome variable, for each class (row) 
##  
## $u1
##            Pr(1)  Pr(2)
## class 1:  0.7227 0.2773
## class 2:  0.0000 1.0000
## class 3:  0.2512 0.7488
## 
## $u2
##            Pr(1)  Pr(2)
## class 1:  0.7130 0.2870
## class 2:  0.0035 0.9965
## class 3:  0.2724 0.7276
## 
## $u3
##            Pr(1)  Pr(2)
## class 1:  0.3261 0.6739
## class 2:  1.0000 0.0000
## class 3:  0.6787 0.3213
## 
## $u4
##            Pr(1)  Pr(2)
## class 1:  0.3374 0.6626
## class 2:  0.2188 0.7812
## class 3:  1.0000 0.0000
## 
## Estimated class population shares 
##  0.5729 0.085 0.3421 
##  
## Predicted class memberships (by modal posterior prob.) 
##  0.588 0.076 0.336 
##  
## ========================================================= 
## Fit for 3 latent classes: 
## ========================================================= 
## number of observations: 500 
## number of estimated parameters: 14 
## residual degrees of freedom: 1 
## maximum log-likelihood: -1320.349 
##  
## AIC(3): 2668.698
## BIC(3): 2727.703
## G^2(3): 3.015552 (Likelihood ratio/deviance statistic) 
## X^2(3): 3.069606 (Chi-square goodness of fit) 
## 

4. Table of Fits

# generate dataframe with fit-values
# See https://statistics.ohlsen-web.de/latent-class-analysis-polca/
#code and calucate the entropy by hand 
results <- data.frame(Modell=c("Model 1"),
                      log_likelihood=lc1$llik,
                      df = lc1$resid.df,
                      BIC=lc1$bic)
results$Modell<-as.integer(results$Modell)
## Warning: NAs introduced by coercion
results[1,1]<-c("Model 1")
results[2,1]<-c("Model 2")
results[3,1]<-c("Model 3")

results[2,2]<-lc2$llik
results[3,2]<-lc3$llik
results[2,3]<-lc2$resid.df
results[3,3]<-lc3$resid.df
results[2,4]<-lc2$bic
results[3,4]<-lc3$bic

entropy<-function (p) sum(-p*log(p))

results$R2_entropy
## NULL
results[1,5]<-c("-")

error_prior<-entropy(lc2$P) # class proportions model 2
error_post<-mean(apply(lc2$posterior,1, entropy),na.rm = TRUE)
results[2,5]<-round(((error_prior-error_post) / error_prior),3)

error_prior<-entropy(lc3$P) # class proportions model 3
error_post<-mean(apply(lc3$posterior,1, entropy),na.rm = TRUE)
results[3,5]<-round(((error_prior-error_post) / error_prior),3)

# combining results to a dataframe
colnames(results)<-c("Model","log-likelihood","resid. df","BIC","Entropy")
lca_results<-results
lca_results
##     Model log-likelihood resid. df      BIC Entropy
## 1 Model 1      -1383.353        11 2791.564       -
## 2 Model 2      -1325.213         6 2706.357   0.503
## 3 Model 3      -1320.349         1 2727.703   0.644
#Note. MPlus gives higher entropy but otherwise similar results (.701)?!

5. Class memberships

#2 class
#population-shares of the classes, in %
round(colMeans(lc2$posterior)*100,2)
## [1] 51.02 48.98
# estimated class memberships
table(lc2$predclass)
## 
##   1   2 
## 272 228
#3 class
#population-shares of the classes, in %
round(colMeans(lc3$posterior)*100,2)
## [1] 57.29  8.50 34.21
# estimated class memberships
table(lc3$predclass)
## 
##   1   2   3 
## 294  38 168