# Load required libraries
library(tidyverse)
library(tidyLPA)
library(gt)
library(ggplot2)
library(corrr)
library(plotly)
library(RColorBrewer)
library(scales)
library(patchwork)

library(afcommon)
source("af_common_add_ons.R")

# Source helper functions
source("af_gauge_indices.R")
source("af_lpa.R")

1 Introduction

1.1 Research Objective

This analysis aims to identify distinct latent profiles within our sample based on multiple continuous variables. Following profile identification, we examine how these profiles correlate with various binary indicators to understand the broader behavioral and attitudinal patterns associated with different profiles.

1.2 What is Latent Profile Analysis?

Latent Profile Analysis (LPA) is a person-centered statistical approach that identifies unobserved subgroups (latent classes) within a population based on patterns of responses across multiple continuous variables. Unlike variable-centered approaches that examine relationships between variables, LPA focuses on identifying distinct groups of individuals who share similar response patterns.

1.2.1 Key Benefits of LPA:

  • Person-centered approach: Groups individuals rather than examining variable relationships
  • Heterogeneity detection: Identifies subpopulations that may be masked in aggregate analyses
  • Profile interpretation: Provides meaningful, interpretable group characteristics
  • Probabilistic classification: Accounts for uncertainty in group membership
  • Model comparison: Statistical criteria help determine optimal number of profiles

1.2.2 LPA Process:

  1. Model specification: Define the number of profiles and model constraints
  2. Model fitting: Estimate parameters using maximum likelihood estimation
  3. Model selection: Compare models using information criteria (AIC, BIC, SABIC)
  4. Profile interpretation: Examine mean differences across profiles
  5. Classification: Assign individuals to most likely profile membership

2 LPA Configuration

# =============================================================================
# LPA CONFIGURATION SECTION - MODIFY ALL PARAMETERS HERE
# =============================================================================

# Profile variables (continuous indicators for LPA)
profile_vars <- c("pe_ideology", "pe_violence", "pe_intolerance")

# Variable labels for display (must match order of profile_vars)
profile_var_labels <- c("Cognitive", "Behavioral", "Social")

# Binary correlation variables  
binary_vars <- c("i_er1", "i_er2", "i_er3")

# Binary variable labels for display (must match order of binary_vars)
binary_var_labels <- c("ER1", "ER2", "ER3")

# Number of profiles to test (range)
min_profiles <- 3
max_profiles <- 5

# Model specifications to test (tidyLPA models)
models_to_test <- c(1, 2, 3, 6)  # c(1, 2, 3, 6)
                                 # 1=equal variances/equal covariances, 
                                 # 2=varying variances/equal covariances,
                                 # 3=equal variances/varying covariances, 
                                 # 6=varying variances/varying covariances

# Analysis title and subtitle for reports
analysis_title <- "Latent Profile Analysis Results"
analysis_subtitle <- "Identification and characterization of latent profiles"

# Variable interpretation direction (TRUE = higher values are "more extreme/higher")
# Set to FALSE if lower values represent higher levels of the construct
var_direction <- c(TRUE, TRUE, TRUE)  # Must match length of profile_vars

# Significance level for interpretations
alpha_level <- 0.05

# Color palette for visualizations
profile_colors <- viridis::viridis(max_profiles)

# Random seed for reproducibility
set.seed(123)

# Create named vectors for easier reference
names(profile_var_labels) <- profile_vars
names(binary_var_labels) <- binary_vars
names(var_direction) <- profile_vars
# Load and prepare data
df <- as.data.frame(readRDS("Israel Survey/data/il_pe.RDS"))
indices <- af_gauge_indices(df, pop_var1 = "Wave", comm_var1 = "pe_left_center_right", 
                            threshold_type = "MAD", k_factor = 1.5)

# Analyze data of one wave 
data <- indices$df # %>% filter(Wave == "First")

# Validate data structure
data_is_valid <- validate_data(data, profile_vars, binary_vars)

# Prepare analysis data
required_vars <- c(profile_vars, binary_vars)
missing_vars <- setdiff(required_vars, names(data))
if(length(missing_vars) > 0) {
  stop(paste("Missing variables:", paste(missing_vars, collapse = ", ")))
}

# Remove cases with missing values on profile variables
complete_cases <- complete.cases(data[profile_vars])
analysis_data <- data[complete_cases, ]
  • Original sample size: 7436
  • Analysis sample size:” 7436
  • Cases removed due to missing data: 0
  • Profile variables:” pe_ideology, pe_violence, pe_intolerance
  • Binary variables: i_er1, i_er2, i_er3

3 LPA Model Selection and Fitting

# Fit multiple LPA models
model_results <- fit_lpa_models(analysis_data, profile_vars, 
                                min_profiles, max_profiles, models_to_test)

# Create model comparison table
model_comparison_table <- create_model_comparison_table(model_results)

model_comparison_table %>%
  gt() %>%
  tab_header(
    title = "LPA Model Comparison",
    subtitle = "Information Criteria for Model Selection"
  ) %>%
  tab_spanner(
    label = "Information Criteria",
    columns = c(AIC, BIC, SABIC)
  ) %>%
  fmt_number(
    columns = c(AIC, BIC, SABIC, Entropy),
    decimals = 2
  ) %>%
  tab_style(
    style = list(
      cell_fill(color = "lightblue"),
      cell_text(weight = "bold")
    ),
    locations = cells_body(
      rows = which.min(model_comparison_table$BIC)
    )
  ) %>%
  tab_footnote(
    footnote = "Highlighted row indicates best-fitting model based on BIC",
    locations = cells_column_labels(columns = BIC)
  ) %>%
  cols_align(align = "center")
LPA Model Comparison
Information Criteria for Model Selection
Profiles ModelType Information Criteria Entropy
AIC BIC1 SABIC
5 6 59,729.73 60,068.52 59,912.80 0.75
5 3 71,296.63 71,469.48 71,390.04 0.75
5 1 71,364.03 71,516.14 71,446.23 0.74
3 3 72,016.60 72,134.14 72,080.12 0.83
4 1 72,179.79 72,304.24 72,247.04 0.62
4 3 72,179.02 72,324.22 72,257.49 0.66
3 1 72,365.61 72,462.41 72,417.92 0.78
1 Highlighted row indicates best-fitting model based on BIC
# Select best model based on BIC
best_model_info <- select_best_model(model_results)
best_model <- best_model_info$model
n_profiles_selected <- best_model_info$n_profiles
model_type_selected <- best_model_info$model_type
  • Selected Model: 6 with 5 profiles
  • BIC: 6.006852^{4}
  • Entropy: 0.754

4 Profile Analysis and Interpretation

# Extract profile means and create summary table
profile_means <- get_profile_means(best_model, profile_vars, profile_var_labels)

profile_means %>%
  gt() %>%
  tab_header(
    title = paste("Profile Means for", n_profiles_selected, "Profile Solution"),
    subtitle = "Mean values across profiling variables by latent profile"
  ) %>%
  fmt_number(
    columns = all_of(profile_vars),
    decimals = 3
  ) %>%
  data_color(
    columns = all_of(profile_vars),
    colors = scales::col_numeric(
      palette = c("lightblue", "white", "lightcoral"),
      domain = NULL
    )
  ) %>%
  cols_align(align = "center") %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels()
  ) %>%
  cols_label(.list = setNames(as.list(profile_var_labels[profile_vars]), profile_vars))
Profile Means for 5 Profile Solution
Mean values across profiling variables by latent profile
Profile Cognitive Behavioral Social
Profile 1 4.046 1.540 4.447
Profile 2 4.058 1.041 3.477
Profile 3 3.310 3.240 4.399
Profile 4 4.954 1.184 2.171
Profile 5 1.790 1.160 4.205
# Profile sizes and proportions
profile_assignments <- get_profile_assignments(best_model)
profile_sizes <- create_profile_size_table(profile_assignments)

profile_sizes %>%
  gt() %>%
  tab_header(
    title = "Profile Membership Distribution",
    subtitle = "Sample sizes and proportions by profile"
  ) %>%
  fmt_number(
    columns = Proportion,
    decimals = 3
  ) %>%
  fmt_percent(
    columns = Percentage,
    decimals = 1,
    scale_values = FALSE  # your column already has 21.5, 10.9, etc.
  ) %>%
  cols_align(columns = everything(), align = "center") %>%
  grand_summary_rows(
    columns = N,
    fns = list(Total = ~sum(., na.rm = TRUE)),
    fmt = ~fmt_number(.)   # <- use `fmt`, not `formatter`
  )
Profile Membership Distribution
Sample sizes and proportions by profile
Profile N Proportion Percentage
Profile 1 1362 0.183 18.3%
Profile 2 3347 0.450 45.0%
Profile 3 1256 0.169 16.9%
Profile 4 459 0.062 6.2%
Profile 5 1012 0.136 13.6%
Total 7,436.00
# Create profile plot
profile_plot <- create_profile_plot(profile_means, profile_vars, profile_var_labels, 
                                   profile_colors, analysis_title)
print(profile_plot)

# Generate automated interpretation
interpretation_text <- interpret_profiles(profile_means, profile_vars, profile_var_labels, 
                                         var_direction)

4.1 Profile Interpretation

4.1.1 Profile 1

  • Shows high levels of Cognitive (M = 4.046)
  • Shows moderate levels of Behavioral (M = 1.54)
  • Shows high levels of Social (M = 4.447)

Overall characterization: Moderate profile - average or mixed levels across dimensions

4.1.2 Profile 2

  • Shows high levels of Cognitive (M = 4.058)
  • Shows low levels of Behavioral (M = 1.041)
  • Shows low levels of Social (M = 3.477)

Overall characterization: Moderate profile - average or mixed levels across dimensions

4.1.3 Profile 3

  • Shows low levels of Cognitive (M = 3.31)
  • Shows high levels of Behavioral (M = 3.24)
  • Shows high levels of Social (M = 4.399)

Overall characterization: High-scoring profile - elevated across multiple dimensions

4.1.4 Profile 4

  • Shows high levels of Cognitive (M = 4.954)
  • Shows low levels of Behavioral (M = 1.184)
  • Shows low levels of Social (M = 2.171)

Overall characterization: Low-scoring profile - reduced levels across multiple dimensions

4.1.5 Profile 5

  • Shows low levels of Cognitive (M = 1.79)
  • Shows low levels of Behavioral (M = 1.16)
  • Shows high levels of Social (M = 4.205)

Overall characterization: Low-scoring profile - reduced levels across multiple dimensions


5 Correlation Analysis with Binary Variables

These two tables work together to provide both statistical evidence and practical understanding of profile differences. The correlation table reveals the strength and statistical significance of relationships, while the proportion table shows the actual rates within each group.

The Profile-Binary Variable Correlations table displays point-biserial correlations between being in each profile and having each binary characteristic. These correlation coefficients range from -1.00 to +1.00, where positive correlations indicate that being in a particular profile makes it more likely to have that characteristic, while negative correlations mean being in that profile makes it less likely to have the characteristic. The magnitude of these correlations helps interpret the strength of relationships: values between 0.10-0.29 represent small effects, 0.30-0.49 indicate medium effects, and values of 0.50 or higher suggest large effects. Each correlation coefficient is accompanied by a p-value that indicates statistical significance. P-values less than 0.05 appear highlighted in green and represent statistically significant relationships.

# Merge profile assignments with binary variables  
correlation_data <- merge_profile_binary_data(analysis_data, profile_assignments, binary_vars)

# Use actual number of profiles (not theoretical)
actual_n_profiles <- length(unique(profile_assignments$Class))

# Calculate correlations using actual number
correlation_results <- calculate_profile_correlations(correlation_data, actual_n_profiles, 
                                                     binary_vars, binary_var_labels)
# Create correlation table
correlation_results$correlation_table %>%
  gt() %>%
  tab_header(
    title = "Profile-Binary Variable Correlations",
    subtitle = "Point-biserial correlations between profile membership and binary indicators"
  ) %>%
  fmt_number(
    columns = ends_with(c("_r", "_p")),
    decimals = 3
  ) %>%
  data_color(
    columns = ends_with("_p"),
    colors = function(x) ifelse(x < alpha_level, "lightgreen", "white"),
    alpha = 0.8
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(
      columns = ends_with("_p"),
      rows = which(correlation_results$correlation_table %>% 
                   select(ends_with("_p")) %>% 
                   apply(1, function(x) any(x < alpha_level)))
    )
  ) %>%
  cols_align(align = "center") %>%
  tab_footnote(
    footnote = paste("Green highlighting indicates p <", alpha_level),
    locations = cells_column_labels(columns = ends_with("_p"))
  ) %>%
  cols_label(Variable = "Binary Variable")
Profile-Binary Variable Correlations
Point-biserial correlations between profile membership and binary indicators
Binary Variable Profile_1_r Profile_1_p1 Profile_2_r Profile_2_p1 Profile_3_r Profile_3_p1 Profile_4_r Profile_4_p1 Profile_5_r Profile_5_p1
ER1 0.327 0.000 −0.404 0.000 0.418 0.000 −0.059 0.000 −0.199 0.000
ER2 0.197 0.000 −0.238 0.000 0.269 0.000 −0.046 0.000 −0.139 0.000
ER3 0.067 0.000 −0.121 0.000 0.162 0.000 −0.034 0.003 −0.053 0.000
1 Green highlighting indicates p < 0.05

The Binary Variable Proportions by Profile table complements the correlation analysis by showing the actual percentage of people in each profile who possess each characteristic (where the binary variable equals 1). This table provides concrete, interpretable percentages that help understand the practical meaning of the correlations. For instance, the proportion table reveals that 92.0% of Profile 1 members actually have this characteristic compared to 0.0% of Profile 2 members. Similarly, for ER1, 93.4% of Profile 1 members have this characteristic while only 28.6% of Profile 2 members do.

# Analyze proportions of binary variables by profile
proportion_results <- analyze_binary_proportions(correlation_data, n_profiles_selected, 
                                                binary_vars, binary_var_labels)

proportion_results %>%
  gt() %>%
  tab_header(
    title = "Binary Variable Proportions by Profile",
    subtitle = "Proportion of individuals with each characteristic within each profile"
  ) %>%
  fmt_percent(
    columns = starts_with("Profile"),
    decimals = 1
  ) %>%
  data_color(
    columns = starts_with("Profile"),
    colors = scales::col_numeric(
      palette = c("white", "darkblue"),
      domain = c(0, 1)
    )
  ) %>%
  cols_align(align = "center") %>%
  cols_label(Variable = "Binary Variable")
Binary Variable Proportions by Profile
Proportion of individuals with each characteristic within each profile
Binary Variable Profile_1 Profile_2 Profile_3 Profile_4 Profile_5
ER1 88.2% 31.5% 100.0% 42.3% 28.8%
ER2 26.1% 3.8% 32.1% 6.5% 0.9%
ER3 3.6% 0.0% 6.4% 0.0% 0.0%
# Create correlation heatmap
correlation_plot <- create_correlation_heatmap(correlation_results$correlation_matrix, 
                                             profile_colors, n_profiles_selected,
                                             binary_var_labels)
print(correlation_plot)

# Generate correlation interpretation
correlation_interpretation <- interpret_correlations(correlation_results, alpha_level, 
                                                    n_profiles_selected, binary_var_labels)

5.1 Correlation Interpretation

5.1.1 Profile 1 Associations

  • moderately positively associated with ER1 (r = 0.327, p = 0)
  • weakly positively associated with ER2 (r = 0.197, p = 0)
  • very weakly positively associated with ER3 (r = 0.067, p = 0)

5.1.2 Profile 2 Associations

  • moderately negatively associated with ER1 (r = -0.404, p = 0)
  • weakly negatively associated with ER2 (r = -0.238, p = 0)
  • weakly negatively associated with ER3 (r = -0.121, p = 0)

5.1.3 Profile 3 Associations

  • moderately positively associated with ER1 (r = 0.418, p = 0)
  • weakly positively associated with ER2 (r = 0.269, p = 0)
  • weakly positively associated with ER3 (r = 0.162, p = 0)

5.1.4 Profile 4 Associations

  • very weakly negatively associated with ER1 (r = -0.059, p = 0)
  • very weakly negatively associated with ER2 (r = -0.046, p = 0)
  • very weakly negatively associated with ER3 (r = -0.034, p = 0.003)

5.1.5 Profile 5 Associations

  • weakly negatively associated with ER1 (r = -0.199, p = 0)
  • weakly negatively associated with ER2 (r = -0.139, p = 0)
  • very weakly negatively associated with ER3 (r = -0.053, p = 0)

6 Summary and Conclusions

# Create LPA Summary Table combining multiple result tables
af_create_lpa_summary_table <- function(include_correlations = TRUE) {
  # Get profile numbers from the actual results
  n_profiles_actual <- length(unique(profile_assignments$Class))
  
  # Initialize summary table with profile information
  summary_table <- profile_sizes %>%
    select(Profile, N) %>%
    arrange(Profile)
  
  # Add profile means (Cognitive, Behavioral, Social)
  means_data <- profile_means %>%
    select(Profile, all_of(profile_vars)) %>%
    arrange(Profile)
  
  summary_table <- summary_table %>%
    left_join(means_data, by = "Profile")
  
  # Conditionally add correlation data for ER1, ER2, ER3 with significance asterisks
  if(include_correlations) {
    er_vars <- c("ER1", "ER2", "ER3")
    
    for(i in 1:n_profiles_actual) {
      profile_name <- paste0("Profile_", i)
      
      for(er_var in er_vars) {
        # Get correlation coefficient and p-value
        r_col <- paste0(profile_name, "_r")
        p_col <- paste0(profile_name, "_p")
        
        if(r_col %in% names(correlation_results$correlation_table) && 
           er_var %in% correlation_results$correlation_table$Variable) {
          
          r_val <- correlation_results$correlation_table %>%
            filter(Variable == er_var) %>%
            pull(!!sym(r_col))
          
          p_val <- correlation_results$correlation_table %>%
            filter(Variable == er_var) %>%
            pull(!!sym(p_col))
          
          # Format with asterisks for significance
          asterisks <- case_when(
            p_val < 0.001 ~ "***",
            p_val < 0.01 ~ "**", 
            p_val < 0.05 ~ "*",
            TRUE ~ ""
          )
          
          corr_formatted <- paste0(sprintf("%.3f", r_val), asterisks)
          
          # Add to summary table
          corr_col_name <- paste0(er_var, "_r")
          summary_table[i, corr_col_name] <- corr_formatted
        }
      }
    }
  }
  
  # Add proportion data for ER1, ER2, ER3
  er_vars <- c("ER1", "ER2", "ER3")
  for(er_var in er_vars) {
    if(er_var %in% proportion_results$Variable) {
      prop_data <- proportion_results %>%
        filter(Variable == er_var) %>%
        select(-Variable) %>%
        t() %>%
        as.data.frame() %>%
        rownames_to_column("Profile") %>%
        setNames(c("Profile", paste0(er_var, "_prop"))) %>%
        mutate(Profile = gsub("Profile_", "Profile ", Profile)) %>%
        arrange(Profile)
      
      summary_table <- summary_table %>%
        left_join(prop_data, by = "Profile")
    }
  }
  
  return(list(table = summary_table, include_correlations = include_correlations))
}

# Set flag for including correlations (change to FALSE to omit)
include_correlations_flag <- FALSE

# Generate the summary table
lpa_summary_result <- af_create_lpa_summary_table(include_correlations = include_correlations_flag)
lpa_summary <- lpa_summary_result$table

# Create the formatted table with gt
formatted_table <- lpa_summary %>%
  gt() %>%
  tab_header(
    title = "LPA Summary Table",
    subtitle = "Comprehensive profile characteristics and associations"
  ) %>%
  # Add column spanners for grouping
  tab_spanner(
    label = "Profile Size",
    columns = c(N)
  ) %>%
  tab_spanner(
    label = "Profile Means",
    columns = all_of(profile_vars)
  )

# Conditionally add correlation spanner and columns
if(lpa_summary_result$include_correlations) {
  formatted_table <- formatted_table %>%
    tab_spanner(
      label = "Correlations with Binary Variables",
      columns = c(ER1_r, ER2_r, ER3_r)
    )
}

# Add proportions spanner
formatted_table <- formatted_table %>%
  tab_spanner(
    label = "Proportions with Binary Characteristics",
    columns = c(ER1_prop, ER2_prop, ER3_prop)
  ) %>%
  # Format numeric columns
  fmt_number(
    columns = all_of(profile_vars),
    decimals = 3
  ) %>%
  fmt_percent(
    columns = c(ER1_prop, ER2_prop, ER3_prop),
    decimals = 1
  )

# Create column labels list
col_labels <- setNames(as.list(profile_var_labels[profile_vars]), profile_vars)
col_labels <- c(col_labels, list(ER1_prop = "ER1", ER2_prop = "ER2", ER3_prop = "ER3"))

if(lpa_summary_result$include_correlations) {
  col_labels <- c(col_labels, list(ER1_r = "ER1", ER2_r = "ER2", ER3_r = "ER3"))
}

formatted_table <- formatted_table %>%
  cols_label(.list = col_labels) %>%
  # Center align most columns
  cols_align(
    align = "center",
    columns = -c(Profile)
  )

# Conditionally add correlation-specific footnotes
if(lpa_summary_result$include_correlations) {
  formatted_table <- formatted_table %>%
    tab_footnote(
      footnote = "* p < 0.05, ** p < 0.01, *** p < 0.001",
      locations = cells_column_spanners(spanners = "Correlations with Binary Variables")
    ) %>%
    tab_footnote(
      footnote = "Point-biserial correlations between profile membership and binary indicators",
      locations = cells_column_spanners(spanners = "Correlations with Binary Variables")
    )
}

# Add proportions footnote and styling
formatted_table <- formatted_table %>%
  tab_footnote(
    footnote = "Percentage of individuals within each profile having the characteristic",
    locations = cells_column_spanners(spanners = "Proportions with Binary Characteristics")
  ) %>%
  # Style the table
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels()
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_spanners()
  )

# Display the table
formatted_table
LPA Summary Table
Comprehensive profile characteristics and associations
Profile Profile Size Profile Means Proportions with Binary Characteristics1
N Cognitive Behavioral Social ER1 ER2 ER3
Profile 1 1362 4.046 1.540 4.447 88.2% 26.1% 3.6%
Profile 2 3347 4.058 1.041 3.477 31.5% 3.8% 0.0%
Profile 3 1256 3.310 3.240 4.399 100.0% 32.1% 6.4%
Profile 4 459 4.954 1.184 2.171 42.3% 6.5% 0.0%
Profile 5 1012 1.790 1.160 4.205 28.8% 0.9% 0.0%
1 Percentage of individuals within each profile having the characteristic
# Create final summary
final_summary <- create_final_summary(best_model_info, profile_means, 
                                     correlation_results, alpha_level,
                                     profile_var_labels, binary_var_labels)

6.1 Key Findings

This LPA analysis identified 5 distinct profiles based on the following variables: Cognitive, Behavioral, Social with good classification quality (Entropy = 0.754). The profiles show meaningful differentiation across the analyzed dimensions.

6.1.1 Correlation Analysis Results

  • Binary variables analyzed: ER1, ER2, ER3
  • Total correlation tests: 15
  • Significant associations (p < 0.05): 15
  • Proportion significant: 100%

6.1.2 Methodological Notes

  • Model selection based on Bayesian Information Criterion (BIC)
  • Point-biserial correlations used for profile-binary variable associations
  • All analyses conducted on cases with complete data on profile variables
  • Results should be interpreted within the context of the specific sample and measures used

6.2 Model Quality Assessment

# Additional model diagnostics
entropy_value <- best_model_info$entropy
min_profile_size <- min(profile_sizes$N)
max_profile_size <- max(profile_sizes$N)

if(entropy_value > 0.8) {
  classification_quality <- "Good (entropy > 0.8)"
} else if(entropy_value > 0.6) {
  classification_quality <- "Acceptable (entropy > 0.6)"
} else {
  classification_quality <- "Poor (entropy < 0.6) - consider fewer profiles"
}

profile_size_warning <- NULL
if(min_profile_size/nrow(analysis_data) < 0.05) {
  profile_size_warning <- "- **Warning**: Smallest profile contains < 5% of sample. Consider fewer profiles"
}
  • Entropy: 0.754 (values > 0.8 indicate good classification quality)
  • Smallest profile: 459 individuals (6.2%)
  • Largest profile: 3347 individuals (45%)
  • Classification quality: Acceptable (entropy > 0.6)