d <- read_csv("d.csv")
## Parsed with column specification:
## cols(
##   x6c_16_1 = col_double(),
##   x6c_6_2 = col_double(),
##   x7c_12_2 = col_double(),
##   x7c_8_3 = col_double(),
##   x8b_6_2 = col_double()
## )
m3 <- read_rds("m3.rds")
m4 <- read_rds("m4.rds")
# Append the 3-class model's posterior probabilities and modal class assignments to the data
post_probs <- m3$posterior %>% as.data.frame() %>% setNames(paste0("C", 1:3, "_prob"))
df <- bind_cols(d, post_probs)
df$class <- m3$predclass
df3 <- df %>% mutate_if(is.numeric, round, 3)
# Same for the 4-class model
post_probs <- m4$posterior %>% as.data.frame() %>% setNames(paste0("C", 1:4, "_prob"))
df <- bind_cols(d, post_probs)
df$class <- m4$predclass
df4 <- df %>% mutate_if(is.numeric, round, 3)
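The class column is poLCA's modal ("hard") assignment: for each observation, the class with the largest posterior probability. A minimal check of that relationship for the 4-class model (a sketch, not part of the original analysis; modal_class is an illustrative name):
# Modal assignment: the column of the posterior matrix with the highest
# probability in each row; this should reproduce m4$predclass
modal_class <- apply(m4$posterior, 1, which.max)
all.equal(as.integer(modal_class), as.integer(m4$predclass))  # expect TRUE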
poLCA.entropy() returns a number with a minimum value of 0 (representing complete concentration of probability in one cell) and a maximum equal to the logarithm of the total number of cells in the fitted cross-classification table (representing complete dispersion, or equal probability across every cell).
poLCA.entropy(m3)
## [1] 4.914149
poLCA.entropy(m4)
## [1] 4.846396
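The quantity being reported is the Shannon entropy of the fitted cell probabilities. A minimal sketch of that calculation, assuming p is the vector of fitted probabilities over all cells of the cross-classification table, summing to 1 (the helper name cell_entropy and the argument p are illustrative, not from the original code):
# Shannon entropy of the fitted cell probabilities: -sum(p * log(p))
cell_entropy <- function(p) {
  p <- p[p > 0]        # drop zero-probability cells to avoid log(0)
  -sum(p * log(p))
}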
For code 1 in the 4-class solution (comparing the code 1 indicator across classes):
# Reshape to long format: one row per observation per time point, with the observed
# code, keeping the 4-class posterior probabilities and modal class;
# `1` is an indicator for code 1
dfa <- df %>%
  dplyr::select(-x8b_6_2) %>%
  gather(time, code, -c(C1_prob:class)) %>%
  dplyr::select(time, code, everything()) %>%
  mutate(`1` = if_else(code == 1, 1, 0))
# Indicator (dummy) columns for each code value
dummies <- as_tibble(dummies::dummy(dfa$code))
dfa <- bind_cols(dfa, dummies)
summary(aov(`1` ~ as.factor(class), data = dfa))
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(class) 3 98.79 32.93 221.7 <2e-16 ***
## Residuals 1464 217.45 0.15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1552 observations deleted due to missingness
TukeyHSD(aov(`1` ~ as.factor(class), data = dfa))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = `1` ~ as.factor(class), data = dfa)
##
## $`as.factor(class)`
## diff lwr upr p adj
## 2-1 0.19379941 0.12186920 0.2657296 0.0000000
## 3-1 0.22242235 0.14091185 0.3039328 0.0000000
## 4-1 0.64585937 0.57901766 0.7127011 0.0000000
## 3-2 0.02862294 -0.05526239 0.1125083 0.8164891
## 4-2 0.45205996 0.38234195 0.5217780 0.0000000
## 4-3 0.42343703 0.34387191 0.5030021 0.0000000
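The diff column is the pairwise difference in the mean of the code 1 indicator (i.e. the proportion of code 1) between classes. A quick sketch of those group means (illustrative, not part of the original output):
# Proportion of code 1 by modal class; pairwise differences of these means
# are what TukeyHSD reports in the diff column
dfa %>%
  group_by(class) %>%
  summarise(prop_code1 = mean(`1`, na.rm = TRUE))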
For code 1 within class 1 of the 4-class solution (comparing across time points):
dfa_class1 <- filter(dfa, class == 1, time != "x8b_6_2")
summary(aov(`1` ~ time, data = dfa_class1))
## Df Sum Sq Mean Sq F value Pr(>F)
## time 3 0.471 0.15689 6.876 0.000158 ***
## Residuals 407 9.286 0.02282
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 385 observations deleted due to missingness
TukeyHSD(aov(`1` ~ time, data = dfa_class1))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = `1` ~ time, data = dfa_class1)
##
## $time
## diff lwr upr p adj
## x6c_6_2-x6c_16_1 6.344086e-02 0.002496708 0.12438501 0.0376807
## x7c_12_2-x6c_16_1 -3.333333e-02 -0.085272320 0.01860565 0.3487762
## x7c_8_3-x6c_16_1 -3.333333e-02 -0.083330198 0.01666353 0.3146182
## x7c_12_2-x6c_6_2 -9.677419e-02 -0.159074284 -0.03447410 0.0004236
## x7c_8_3-x6c_6_2 -9.677419e-02 -0.157464632 -0.03608375 0.0002748
## x7c_8_3-x7c_12_2 -1.734723e-17 -0.051641050 0.05164105 1.0000000
Frequencies are used here rather than likelihoods, since these are based on the 'hard' (modal) classifications.
mat <- dfa %>%
filter(code != 5) %>%
count(code, class) %>%
spread(class, n) %>%
dplyr::select(-code)
# code is rows, class is columns
c <- chisq.test(mat)
c
##
## Pearson's Chi-squared test
##
## data: mat
## X-squared = 1245.8, df = 9, p-value < 2.2e-16
c$stdres
## 1 2 3 4
## [1,] -14.912776 -4.454544 -2.400090 20.271464
## [2,] 19.999495 -12.432497 -4.828079 -4.083248
## [3,] -5.961178 -4.476029 17.821331 -4.065776
## [4,] -1.280305 20.539901 -4.403082 -14.122745
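The values in stdres are adjusted (standardized) Pearson residuals: the difference between observed and expected counts, scaled by its estimated standard error. A minimal sketch of that calculation from the frequency table above (the names obs, expd, row_p, and col_p are illustrative):
# Standardized Pearson residuals by hand: (observed - expected) scaled by
# sqrt(expected * (1 - row proportion) * (1 - column proportion))
obs   <- as.matrix(mat)
n     <- sum(obs)
expd  <- outer(rowSums(obs), colSums(obs)) / n
row_p <- rowSums(obs) / n
col_p <- colSums(obs) / n
stdres_by_hand <- (obs - expd) / sqrt(expd * outer(1 - row_p, 1 - col_p))
all.equal(unname(stdres_by_hand), unname(c$stdres))  # expect TRUE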