test-lca.knit

library(haven)

## Warning: package 'haven' was built under R version 4.4.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(poLCA)

## Warning: package 'poLCA' was built under R version 4.4.3

## Loading required package: scatterplot3d

## Loading required package: MASS

## Warning: package 'MASS' was built under R version 4.4.3

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(stringr)
library(psych)
library(dplyr)
library(tidyr)
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.4.3

## corrplot 0.95 loaded

library(tibble)
library(reshape2)

## Warning: package 'reshape2' was built under R version 4.4.3

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

library(tidyLPA)

## Warning: package 'tidyLPA' was built under R version 4.4.3

## You can use the function citation('tidyLPA') to create a citation for the use of {tidyLPA}.
## Mplus is installed; you can use package = 'MplusAutomation' when calling estimate_profiles().

library(poLCAExtra)

## 
## Attaching package: 'poLCAExtra'

## The following object is masked from 'package:poLCA':
## 
##     poLCA

amf_df <- read_sav("files/WIDE_AMF_webbqps_loose.prepared_Merged with Users.sav")
amf_df <- as_factor(amf_df)
label_map <- c(
  "fia1A" = "Support",
  "fia2A" = "Support",
  "fia3A" = "Support",
  "fia4A" = "Support",
  "fia5A" = "Support",
  "fia1B" = "Support",
  "fia2B" = "Support",
  "fia3B" = "Support",
  "fia4B" = "Support",
  "fia5B" = "Support",
  "fia6A" = "Demands",
  "fia7A" = "Demands",
  "fia8A" = "Demands",
  "fia9A" = "Demands",
  "fia6B" = "Demands",
  "fia7B" = "Demands",
  "fia8B" = "Demands",
  "fia9B" = "Demands",
  "fia10A" = "Sense of Coherence",
  "fia11A" = "Sense of Coherence",
  "fia12A" = "Sense of Coherence",
  "fia10B" = "Sense of Coherence",
  "fia11B" = "Sense of Coherence",
  "fia12B" = "Sense of Coherence",
  "fia14A" = "Clarity",
  "fia15A" = "Clarity",
  "fia14B" = "Clarity",
  "fia15B" = "Clarity",
  "fia16A" = "Control",
  "fia17A" = "Control",
  "fia18A" = "Control",
  "fia19A" = "Control",
  "fia16B" = "Control",
  "fia17B" = "Control",
  "fia18B" = "Control",
  "fia19B" = "Control"
)

entropy_LCA <- function(model) {
  K.j <- sapply(model$probs, ncol)
  if(length(unique(K.j))==1) {
    fullcell <- expand.grid(data.frame(sapply(K.j, seq, from=1)))
  } else {
    fullcell <- expand.grid(sapply(K.j, seq, from = 1))
  }
  P.c <- poLCA.predcell(model, fullcell)
  return(-sum(P.c * log(P.c), na.rm = TRUE))
}

relative_entropy <- function(model) {
  en <- -sum(model$posterior * log(model$posterior), na.rm=T)
  e <- 1 - en / (nrow(model$posterior) * log(ncol(model$posterior)))
  return(e)
}

lca_execute <- function(df, lca_formula, maxiter) {
  results <- list()
  fit_stats <- data.frame()
  N <- nrow(df) 
  
  for (k in 2:maxiter) {
    set.seed(100)
    model <- poLCA(lca_formula, data = df, nclass = k, graphs = FALSE)
    post_probs <- model$posterior
   
    relative_entro <- poLCA.relentropy(model)
    
    aBIC <- -2 * model$llik + log((N + 2) / 24) * model$npar  
    
    if(k > 2) {
      base <- results[[paste0("Class_", k-1)]]
    } else {
      base <- poLCA(lca_formula, data = df, nclass = 1, graphs = FALSE)
    }
    
    lmr <- poLCA.lrt(model, base)
    
    results[[paste0("Class_", k)]] <- model
    fit_stats <- rbind(fit_stats,
                       data.frame(
                         Classes = k,
                         LL = model$llik,
                         AIC = model$aic,
                         BIC = model$bic,
                         aBIC = aBIC,
                         Relative_Entropy = relative_entro,
                         LMR_LRT_p_val = round(lmr$lmr.p,8)
                       ))
  }
  print(fit_stats, row.names=FALSE)
  
  fit_melt <- melt(fit_stats, id.vars = "Classes", 
                 measure.vars = c("LL", "AIC", "BIC", "aBIC", "Relative_Entropy"))


  plot_ep <- ggplot(fit_melt, aes(x = Classes, y = value)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  facet_wrap(~ variable, scales = "free_y", ncol=1, strip.position = "right") 
  theme_minimal() +
  labs(title = "Elbow Plot for LCA Fit Statistics",
       x = "Number of Classes",
       y = "Fit Statistic") +
  theme(plot.title = element_text(hjust = 0.5))
 
  print(plot_ep)
  return(results)
}

filter_Columns_by_regex <- function(df, col_filtering_regex) {
  cols_to_keep <- grep(col_filtering_regex, names(df), perl = TRUE, value = TRUE)
  filtered_df <- df %>%
    filter(sysConsentYN == "Yes")
  filtered_df <- filtered_df[, cols_to_keep]
  filtered_df <- na.omit(filtered_df)
  return(filtered_df)
}


get_lca_long_format <- function(best_model, label_map) {
  probs <- best_model$probs
  
  prob_df <- lapply(names(probs), function(var) {
    df <- as.data.frame(probs[[var]])
    df$class <- rownames(df)
    df$variable <- var
    df_long <- df %>%
      tidyr::pivot_longer(
        cols = starts_with("Pr"),
        names_to = "response",
        values_to = "probability"
      )
    return(df_long)
  }) %>% dplyr::bind_rows()
  
  prob_df <- prob_df %>%
    dplyr::mutate(
      clean_var = sub("\\.$", "", variable),
      label_prefix = label_map[clean_var],
      label = ifelse(is.na(label_prefix), variable, paste(label_prefix, "(", variable, ")"))
    ) %>%
    dplyr::arrange(label_prefix, variable) %>%
    dplyr::mutate(label = factor(label, levels = unique(label)))
  
  
  return(prob_df)
}

seg_barp_lca <- function(prob_df) {
  classes <- unique(prob_df$class)
  for (cls in classes) {
    df_subset <- dplyr::filter(prob_df, class == cls)
    
    p <- ggplot2::ggplot(df_subset, ggplot2::aes(x = label, y = probability, fill = response)) +
      ggplot2::geom_bar(stat = "identity") +
      ggplot2::geom_text(
        ggplot2::aes(label = ifelse(probability > 0.05, round(probability, 2), "")),
        position = ggplot2::position_stack(vjust = 0.5),
        size = 3, color = "black"
      ) +
      ggplot2::coord_flip() +
      ggplot2::labs(
        title = paste("LCA -", cls),
        x = "Variable",
        y = "Probability"
      ) +
      ggplot2::theme_minimal() +
      ggplot2::scale_fill_manual(values=c("#71C055", "#B8DB70","#F9F18C", "#FF8686", "#D73343"))
    
    print(p)
  }
}

time_point <- 1
cat_variables <- c('wh_conflict', 'hw_conflict')
existing_vars <- names(amf_df)
filtered_df <- amf_df %>% filter(sysConsentYN == "Yes")
vars_to_keep <- unlist(lapply(cat_variables, function(v) {
  candidates <- c(v, paste0(v, ".", time_point))
  candidates[candidates %in% existing_vars]
}))

lca_df <- filtered_df[, vars_to_keep, drop = FALSE]
f <- as.formula(paste("cbind(", paste(names(lca_df), collapse = ","), ") ~ 1"))
print(f)

## cbind(wh_conflict.1, hw_conflict.1) ~ 1

lca_results <- lca_execute(df = lca_df, lca_formula = f, maxiter = 7)

##  Classes        LL      AIC      BIC     aBIC Relative_Entropy LMR_LRT_p_val
##        2 -3301.646 6637.291 6724.973 6670.972        0.6985533    0.00000000
##        3 -3265.271 6582.542 6716.643 6634.054        0.5768910    0.00000000
##        4 -3241.218 6552.436 6732.957 6621.780        0.5894423    0.00000061
##        5 -3229.228 6546.456 6773.396 6633.630        0.5787646    0.00639346
##        6 -3229.228 6564.456 6837.816 6669.461        0.4881743    1.00000000
##        7 -3229.228 6582.456 6902.235 6705.292        0.4629860    1.00000000

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

prob_df <- get_lca_long_format(best_model = lca_results[["Class_3"]], label_map)
write.csv(prob_df, paste("probabilities/WLB_", time_point, ".csv"), row.names = FALSE)
print(prob_df)

## # A tibble: 30 × 7
##    class       variable      response probability clean_var   label_prefix label
##    <chr>       <chr>         <chr>          <dbl> <chr>       <chr>        <fct>
##  1 "class 1: " hw_conflict.1 Pr(1)      8.64e-  1 hw_conflic… <NA>         hw_c…
##  2 "class 1: " hw_conflict.1 Pr(2)      9.46e-  2 hw_conflic… <NA>         hw_c…
##  3 "class 1: " hw_conflict.1 Pr(3)      3.94e-  2 hw_conflic… <NA>         hw_c…
##  4 "class 1: " hw_conflict.1 Pr(4)      2.00e-  3 hw_conflic… <NA>         hw_c…
##  5 "class 1: " hw_conflict.1 Pr(5)      3.59e-274 hw_conflic… <NA>         hw_c…
##  6 "class 2: " hw_conflict.1 Pr(1)      9.94e-  2 hw_conflic… <NA>         hw_c…
##  7 "class 2: " hw_conflict.1 Pr(2)      6.54e-  1 hw_conflic… <NA>         hw_c…
##  8 "class 2: " hw_conflict.1 Pr(3)      1.46e-  1 hw_conflic… <NA>         hw_c…
##  9 "class 2: " hw_conflict.1 Pr(4)      5.34e-  2 hw_conflic… <NA>         hw_c…
## 10 "class 2: " hw_conflict.1 Pr(5)      4.65e-  2 hw_conflic… <NA>         hw_c…
## # ℹ 20 more rows

print(lca_results[["Class_3"]]$predcell)

##    wh_conflict.1 hw_conflict.1 observed expected
## 1              1             1      191  190.988
## 2              1             2       23   23.063
## 3              1             3       12   11.922
## 4              1             4        1    1.027
## 5              2             1      103  103.540
## 6              2             2      212  198.926
## 7              2             3       53   55.380
## 8              2             4       17   17.124
## 9              2             5        3   13.030
## 10             3             1       87   87.485
## 11             3             2      173  166.622
## 12             3             3      180  171.264
## 13             3             4       19   32.867
## 14             3             5        3    3.763
## 15             4             1       36   35.368
## 16             4             2       54   61.809
## 17             4             3       48   56.890
## 18             4             4       26   11.178
## 19             4             5        3    1.756
## 20             5             1       10    9.620
## 21             5             2       10   21.580
## 22             5             3        8    5.545
## 23             5             4        1    1.805
## 24             5             5       11    1.451

class_colors <- c("#71C055", "#56B4E9", "#FBA51A", "#CE93D9", "#D73343")

prob_df %>%
  ggplot(aes(x = response, y = probability, group = class, color = class)) +
  geom_line(size = 1.1) +
  geom_point(size = 2) +
  facet_wrap(~ clean_var, scales = "fixed") +
  scale_y_continuous(limits = c(0,1)) +
  scale_color_manual(values = class_colors) +  
  labs(
    title = "Conditional Response Probabilities by Latent Class",
    x = "Response Category",
    y = "Probability",
    color = "Latent Class"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    strip.text = element_text(face = "bold", size = 10),
    legend.position = "bottom"
  )

colors <- c("#71C055", "#56B4E9", "#FBA51A", "#CE93D9", "#D73343")

ggplot(prob_df, aes(x = label, y = probability, fill = response)) +
  geom_bar(stat = "identity", color = "white") +
  geom_text(
    aes(label = ifelse(probability > 0.05, round(probability, 2), "")),
    position = position_stack(vjust = 0.5),
    size = 3,
    color = "black"
  ) +
  coord_flip() +
  facet_wrap(~ class, ncol = 1, strip.position = "right") +   
  scale_fill_manual(values = colors) +                        
  labs(
    title = "Conditional Response Probabilities by Latent Class",
    x = "Variable",
    y = "Probability",
    fill = "Response Category"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5),
    strip.text = element_text(face = "bold", size = 10),
    axis.text.y = element_text(size = 9),
    legend.position = "bottom"
  )

time_point <- 1
cat_variables <- c('wh_conflict', 'hw_conflict')
other_variables <- c('sex', 'edu')
filtered_df <- amf_df %>% filter(sysConsentYN == "Yes")
cat_vars_wtime <- grep(
  paste0("^(", paste(cat_variables, collapse = "|"), ")\\.", time_point, "$"),
  names(filtered_df),
  value = TRUE
)
vars_to_keep <- c(other_variables, cat_vars_wtime)
vars_to_keep <- as.character(vars_to_keep)
lca_df <- filtered_df[, vars_to_keep, drop = FALSE]
f <- as.formula(paste("cbind(", paste(cat_vars_wtime, collapse = ","), ") ~ ", paste(other_variables, collapse = "+")))
print(f)

## cbind(hw_conflict.1, wh_conflict.1) ~ sex + edu

lca_results <- lca_execute(df = lca_df, lca_formula = f, maxiter = 7)

## 
##  ALERT: covariates not allowed when nclass=1; will be ignored. 
##  
##  Classes        LL      AIC      BIC     aBIC Relative_Entropy LMR_LRT_p_val
##        2 -3285.783 6613.566 6721.878 6655.172        0.8265238    0.00000000
##        3 -3305.089 6678.178 6853.541 6745.541        0.8936882    1.00000000
##        4 -3438.426 6970.853 7213.266 7063.971        0.9986964    1.00000000
##        5 -3286.924 6693.848 7003.312 6812.723        0.9095319    0.00000000
##        6 -3449.341 7044.682 7421.196 7189.312        0.9022418    1.00000000
##        7 -3431.373 7034.746 7478.311 7205.133        0.9988629    0.00106967

prob_df <- get_lca_long_format(best_model = lca_results[["Class_3"]], label_map)
write.csv(prob_df, paste("probabilities/WLB_", time_point, ".csv"), row.names = FALSE)
print(prob_df)

## # A tibble: 30 × 7
##    class       variable      response probability clean_var   label_prefix label
##    <chr>       <chr>         <chr>          <dbl> <chr>       <chr>        <fct>
##  1 "class 1: " hw_conflict.1 Pr(1)       4.76e- 1 hw_conflic… <NA>         hw_c…
##  2 "class 1: " hw_conflict.1 Pr(2)       2.62e- 1 hw_conflic… <NA>         hw_c…
##  3 "class 1: " hw_conflict.1 Pr(3)       2.14e- 1 hw_conflic… <NA>         hw_c…
##  4 "class 1: " hw_conflict.1 Pr(4)       4.76e- 2 hw_conflic… <NA>         hw_c…
##  5 "class 1: " hw_conflict.1 Pr(5)       0        hw_conflic… <NA>         hw_c…
##  6 "class 2: " hw_conflict.1 Pr(1)       1.00e+ 0 hw_conflic… <NA>         hw_c…
##  7 "class 2: " hw_conflict.1 Pr(2)       1.95e- 9 hw_conflic… <NA>         hw_c…
##  8 "class 2: " hw_conflict.1 Pr(3)       2.24e-93 hw_conflic… <NA>         hw_c…
##  9 "class 2: " hw_conflict.1 Pr(4)       0        hw_conflic… <NA>         hw_c…
## 10 "class 2: " hw_conflict.1 Pr(5)       0        hw_conflic… <NA>         hw_c…
## # ℹ 20 more rows

print(lca_results[["Class_3"]]$predcell)

##    hw_conflict.1 wh_conflict.1 observed expected
## 1              1             1      191  177.365
## 2              1             2      103  106.642
## 3              1             3       87   95.374
## 4              1             4       36   36.853
## 5              1             5       10   10.766
## 6              2             1       23   26.912
## 7              2             2      212  155.009
## 8              2             3      173  202.194
## 9              2             4       54   71.744
## 10             2             5       10   16.140
## 11             3             1       12   17.917
## 12             3             2       53   98.764
## 13             3             3      180  128.458
## 14             3             4       48   45.637
## 15             3             5        8   10.223
## 16             4             1        1    3.843
## 17             4             2       17   20.996
## 18             4             3       19   27.292
## 19             4             4       26    9.699
## 20             4             5        1    2.171
## 21             5             2        3    6.589
## 22             5             3        3    8.681
## 23             5             4        3    3.067
## 24             5             5       11    0.700

class_colors <- c("#71C055", "#56B4E9", "#FBA51A", "#CE93D9", "#D73343","black",'grey')

prob_df %>%
  ggplot(aes(x = response, y = probability, group = class, color = class)) +
  geom_line(size = 1.1) +
  geom_point(size = 2) +
  facet_wrap(~ clean_var, scales = "fixed") +
  scale_y_continuous(limits = c(0,1)) +
  scale_color_manual(values = class_colors) +  
  labs(
    title = "Conditional Response Probabilities by Latent Class",
    x = "Response Category",
    y = "Probability",
    color = "Latent Class"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    strip.text = element_text(face = "bold", size = 10),
    legend.position = "bottom"
  )

colors <- c("#71C055", "#56B4E9", "#FBA51A", "#CE93D9", "#D73343")

ggplot(prob_df, aes(x = label, y = probability, fill = response)) +
  geom_bar(stat = "identity", color = "white") +
  geom_text(
    aes(label = ifelse(probability > 0.05, round(probability, 2), "")),
    position = position_stack(vjust = 0.5),
    size = 3,
    color = "black"
  ) +
  coord_flip() +
  facet_wrap(~ class, ncol = 1, strip.position = "right") +   
  scale_fill_manual(values = colors) +                        
  labs(
    title = "Conditional Response Probabilities by Latent Class",
    x = "Variable",
    y = "Probability",
    fill = "Response Category"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5),
    strip.text = element_text(face = "bold", size = 10),
    axis.text.y = element_text(size = 9),
    legend.position = "bottom"
  )

exp(lca_results[["Class_3"]]$coeff)

##                                         [,1]         [,2]
## (Intercept)                     4.819154e-07 2.531551e-07
## sexMan                          4.120208e-01 5.130656e-01
## eduSenior high school           4.489604e+12 2.308020e+13
## eduAcademic degree (BA/MA)      8.876283e+11 8.425918e+12
## eduHigher academic degree (PhD) 1.520127e+04 2.026670e+15

odds_ratios <- exp(lca_results[["Class_3"]]$coeff)

df <- as.data.frame(odds_ratios)
df$Variable <- rownames(df)

df_long <- df %>%
  pivot_longer(cols = -Variable, names_to = "Class", values_to = "OddsRatio")

ggplot(df_long, aes(x = OddsRatio, y = reorder(Variable, OddsRatio), fill = Class)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_vline(xintercept = 1, linetype = "dashed", color = "black") +
  labs(x = "Odds Ratio", y = "", title = "Odds Ratios by Variable and Class") +
  theme_minimal() +
  theme(legend.position = "bottom")