d <- read_csv("ds.csv")
## Parsed with column specification:
## cols(
##   x6c_16_1 = col_double(),
##   x6c_6_2 = col_double(),
##   x7c_12_2 = col_double(),
##   x7c_8_3 = col_double(),
##   x8b_6_2 = col_double(),
##   flag = col_double()
## )
m3 <- read_rds("m3.rds")
m4 <- read_rds("m4.rds")
post_probs <- m3$posterior %>% as.data.frame() %>% setNames(paste0("C", 1:3, "_prob"))
df <- bind_cols(d, post_probs)
df$class <- m3$predclass
df3 <- df %>% mutate_if(is.numeric, round, 3)
post_probs <- m4$posterior %>% as.data.frame() %>% setNames(paste0("C", 1:4, "_prob"))
df <- bind_cols(d, post_probs)
df$class <- m4$predclass
df4 <- df %>% mutate_if(is.numeric, round, 3)

Comparing the entropy for 3 and 4 class solutions

A number taking a minumum value of 0 (representing complete concentration of probability on one cell) and a maximum value equal to the logarithm of the total number of cells in the fitted cross-classfication table (representing complete dispersion, or equal probability for outcomes across every cell).

poLCA.entropy(m3)
## [1] 4.905256
poLCA.entropy(m4)
## [1] 4.834834

Mean classification probability - 4 class solution

By class

mat <- df4 %>% dplyr::select(C1_prob:C4_prob, class)
mat %>% group_by(class) %>% summarize_all(funs(mean)) %>% dplyr::select(-class) %>% as.matrix() %>% diag()
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.
## [1] 0.8789825 0.7264737 0.7325273 0.8079130

Overall

mat %>% group_by(class) %>% summarize_all(funs(mean)) %>% dplyr::select(-class) %>% as.matrix() %>% diag() %>% mean()
## [1] 0.7864741

Mean classification probability - 3 class solution

By class

mat <- df3 %>% dplyr::select(C1_prob:C3_prob, class)
mat %>% group_by(class) %>% summarize_all(funs(mean)) %>% dplyr::select(-class) %>% as.matrix() %>% diag()
## [1] 0.8584536 0.8954091 0.7340893

Overall

mat %>% group_by(class) %>% summarize_all(funs(mean)) %>% dplyr::select(-class) %>% as.matrix() %>% diag() %>% mean()
## [1] 0.8293173

Are certain codes more likely across classes?

For the code 1 in the 4-class solution

dfa <- df %>% 
    dplyr::select(-x8b_6_2) %>% 
    gather(time, code, -c(C1_prob:class)) %>% 
    dplyr::select(time, code, everything()) %>% 
    mutate(`1` = if_else(code == 1, 1, 0))
    
dummies <- as_tibble(dummies::dummy(dfa$code))

dfa <- bind_cols(dfa, dummies)

summary(aov(`1` ~ as.factor(class), data = dfa))
##                   Df Sum Sq Mean Sq F value Pr(>F)    
## as.factor(class)   3  21.45   7.149    31.6 <2e-16 ***
## Residuals        920 208.15   0.226                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 171 observations deleted due to missingness
TukeyHSD(aov(`1` ~ as.factor(class), data = dfa))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = `1` ~ as.factor(class), data = dfa)
## 
## $`as.factor(class)`
##            diff         lwr          upr     p adj
## 2-1  0.18387097  0.05772167  0.310020270 0.0010706
## 3-1  0.05064935 -0.06218897  0.163487673 0.6552554
## 4-1  0.36442953  0.25825138  0.470607682 0.0000000
## 3-2 -0.13322162 -0.26033158 -0.006111655 0.0357653
## 4-2  0.18055856  0.05932220  0.301794925 0.0007782
## 4-3  0.31378018  0.20646245  0.421097913 0.0000000

Are certain codes in one class more likely at a specific timepoint?

For code 1 for class 1 in the 4-class solution

dfa_class1 <- filter(dfa, class == 1, time != "x8b_6_2")

summary(aov(`1` ~ time, data = dfa_class1))
##              Df Sum Sq Mean Sq F value Pr(>F)    
## time          4  40.18  10.046   231.1 <2e-16 ***
## Residuals   235  10.22   0.043                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 45 observations deleted due to missingness
TukeyHSD(aov(`1` ~ time, data = dfa_class1))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = `1` ~ time, data = dfa_class1)
## 
## $time
##                            diff        lwr         upr     p adj
## x6c_16_1-flag     -9.245283e-01 -1.0339057 -0.81515093 0.0000000
## x6c_6_2-flag      -5.925926e-01 -0.7265069 -0.45867832 0.0000000
## x7c_12_2-flag     -1.000000e+00 -1.1099210 -0.89007900 0.0000000
## x7c_8_3-flag      -1.000000e+00 -1.1104831 -0.88951689 0.0000000
## x6c_6_2-x6c_16_1   3.319357e-01  0.1964069  0.46746454 0.0000000
## x7c_12_2-x6c_16_1 -7.547170e-02 -0.1873540  0.03641065 0.3450687
## x7c_8_3-x6c_16_1  -7.547170e-02 -0.1879063  0.03696295 0.3501576
## x7c_12_2-x6c_6_2  -4.074074e-01 -0.5433753 -0.27143947 0.0000000
## x7c_8_3-x6c_6_2   -4.074074e-01 -0.5438302 -0.27098464 0.0000000
## x7c_8_3-x7c_12_2   1.110223e-15 -0.1129636  0.11296356 1.0000000

Homogeneity and Separation

When latent class c is highly homogeneous, members of are likely to provide the same observed response pattern, implying that one response pattern is characteristic of the class. If homogeneity is low, no single response pattern stands out.

When latent class separation is good, the pattern of item-response probabiltiies across indicator variables clearly differentiates among the latent classes. When there is high separation, a response pattern that enmerges as characteristic of a particular latent class will be characteristic of that class only (and will not be characteristic of any of the other latent classes).

3 class solution - mini

df %>% 
    dplyr::select(-class) %>% 
    gather(class, prob, C1_prob, C2_prob, C3_prob, C4_prob) %>% 
    mutate_if(is.integer, replace_na, 0) %>% 
    unite(response_pattern, x6c_16_1, x6c_6_2, x7c_12_2, x7c_8_3, sep = "-") %>% 
    group_by(class, response_pattern) %>% 
    summarize(mean_post_prob = mean(prob),
              n = n()) %>% 
    ungroup() %>% 
    spread(class, mean_post_prob) %>% 
    mutate_if(is.double, round, 3) %>% 
    arrange(desc(n))
## # A tibble: 143 x 6
##    response_pattern     n C1_prob C2_prob C3_prob C4_prob
##    <chr>            <int>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 2-NA-2-2             8   0.917   0       0.083   0    
##  2 2-NA-2-4             5   0.873   0       0.127   0    
##  3 2-NA-4-4             5   0.488   0       0.512   0    
##  4 4-NA-4-4             5   0.253   0.747   0       0    
##  5 2-2-1-1              4   0       0       0.176   0.824
##  6 2-2-4-2              4   0.449   0       0.551   0    
##  7 2-NA-1-2             4   0       0       0.145   0.855
##  8 1-NA-1-1             3   0       0.145   0.096   0.759
##  9 1-NA-1-2             3   0       0       0.133   0.867
## 10 1-NA-1-NA            3   0       0.154   0.15    0.695
## # … with 133 more rows

4 class solution

df %>% 
    dplyr::select(-class) %>% 
    gather(class, prob, C1_prob, C2_prob, C3_prob, C4_prob) %>% 
    mutate_if(is.integer, replace_na, 0) %>% 
    unite(response_pattern, x6c_16_1, x6c_6_2, x7c_12_2, x7c_8_3, sep = "-") %>% 
    mutate(n_NA = str_count(response_pattern, "NA")) %>% 
    filter(n_NA <= 2) %>% 
    group_by(class, response_pattern) %>% 
    summarize(mean_post_prob = mean(prob),
              n = n()) %>% 
    ungroup() %>% 
    spread(class, mean_post_prob) %>% 
    mutate_if(is.double, round, 3) %>% 
    arrange(desc(n))
## # A tibble: 143 x 6
##    response_pattern     n C1_prob C2_prob C3_prob C4_prob
##    <chr>            <int>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 2-NA-2-2             8   0.917   0       0.083   0    
##  2 2-NA-2-4             5   0.873   0       0.127   0    
##  3 2-NA-4-4             5   0.488   0       0.512   0    
##  4 4-NA-4-4             5   0.253   0.747   0       0    
##  5 2-2-1-1              4   0       0       0.176   0.824
##  6 2-2-4-2              4   0.449   0       0.551   0    
##  7 2-NA-1-2             4   0       0       0.145   0.855
##  8 1-NA-1-1             3   0       0.145   0.096   0.759
##  9 1-NA-1-2             3   0       0       0.133   0.867
## 10 1-NA-1-NA            3   0       0.154   0.15    0.695
## # … with 133 more rows