# Load required libraries
library(tidyverse)
library(tidyLPA)
library(gt)
library(ggplot2)
library(corrr)
library(plotly)
library(RColorBrewer)
library(scales)
library(patchwork)

library(afcommon)
source("af_common_add_ons.R")

# Source helper functions
source("af_gauge_indices.R")
source("af_lpa.R")

1 Introduction

1.1 Research Objective

This analysis aims to identify distinct latent profiles within our sample based on multiple continuous variables. Following profile identification, we examine how these profiles correlate with various binary indicators to understand the broader behavioral and attitudinal patterns associated with different profiles.

1.2 What is Latent Profile Analysis?

Latent Profile Analysis (LPA) is a person-centered statistical approach that identifies unobserved subgroups (latent classes) within a population based on patterns of responses across multiple continuous variables. Unlike variable-centered approaches that examine relationships between variables, LPA focuses on identifying distinct groups of individuals who share similar response patterns.

1.2.1 Key Benefits of LPA:

Person-centered approach: Groups individuals rather than examining variable relationships
Heterogeneity detection: Identifies subpopulations that may be masked in aggregate analyses
Profile interpretation: Provides meaningful, interpretable group characteristics
Probabilistic classification: Accounts for uncertainty in group membership
Model comparison: Statistical criteria help determine optimal number of profiles

1.2.2 LPA Process:

Model specification: Define the number of profiles and model constraints
Model fitting: Estimate parameters using maximum likelihood estimation
Model selection: Compare models using information criteria (AIC, BIC, SABIC)
Profile interpretation: Examine mean differences across profiles
Classification: Assign individuals to most likely profile membership

2 LPA Configuration

# =============================================================================
# LPA CONFIGURATION SECTION - MODIFY ALL PARAMETERS HERE
# =============================================================================

# Profile variables (continuous indicators for LPA)
profile_vars <- c("pe_ideology", "pe_violence", "pe_intolerance")

# Variable labels for display (must match order of profile_vars)
profile_var_labels <- c("Cognitive", "Behavioral", "Social")

# Binary correlation variables  
binary_vars <- c("i_er1", "i_er2", "i_er3")

# Binary variable labels for display (must match order of binary_vars)
binary_var_labels <- c("ER1", "ER2", "ER3")

# Number of profiles to test (range)
min_profiles <- 3
max_profiles <- 5

# Model specifications to test (tidyLPA models)
models_to_test <- c(1, 2, 3, 6)  # c(1, 2, 3, 6)
                                 # 1=equal variances/equal covariances, 
                                 # 2=varying variances/equal covariances,
                                 # 3=equal variances/varying covariances, 
                                 # 6=varying variances/varying covariances

# Analysis title and subtitle for reports
analysis_title <- "Latent Profile Analysis Results"
analysis_subtitle <- "Identification and characterization of latent profiles"

# Variable interpretation direction (TRUE = higher values are "more extreme/higher")
# Set to FALSE if lower values represent higher levels of the construct
var_direction <- c(TRUE, TRUE, TRUE)  # Must match length of profile_vars

# Significance level for interpretations
alpha_level <- 0.05

# Color palette for visualizations
profile_colors <- viridis::viridis(max_profiles)

# Random seed for reproducibility
set.seed(123)

# Create named vectors for easier reference
names(profile_var_labels) <- profile_vars
names(binary_var_labels) <- binary_vars
names(var_direction) <- profile_vars

# Load and prepare data
df <- as.data.frame(readRDS("Israel Survey/data/il_pe.RDS"))
indices <- af_gauge_indices(df, pop_var1 = "Wave", comm_var1 = "pe_left_center_right", 
                            threshold_type = "MAD", k_factor = 1.5)

# Analyze data of one wave 
data <- indices$df # %>% filter(Wave == "First")

# Validate data structure
data_is_valid <- validate_data(data, profile_vars, binary_vars)

# Prepare analysis data
required_vars <- c(profile_vars, binary_vars)
missing_vars <- setdiff(required_vars, names(data))
if(length(missing_vars) > 0) {
  stop(paste("Missing variables:", paste(missing_vars, collapse = ", ")))
}

# Remove cases with missing values on profile variables
complete_cases <- complete.cases(data[profile_vars])
analysis_data <- data[complete_cases, ]

Original sample size: 7436
Analysis sample size:” 7436
Cases removed due to missing data: 0
Profile variables:” pe_ideology, pe_violence, pe_intolerance
Binary variables: i_er1, i_er2, i_er3

3 LPA Model Selection and Fitting

# Fit multiple LPA models
model_results <- fit_lpa_models(analysis_data, profile_vars, 
                                min_profiles, max_profiles, models_to_test)

# Create model comparison table
model_comparison_table <- create_model_comparison_table(model_results)

model_comparison_table %>%
  gt() %>%
  tab_header(
    title = "LPA Model Comparison",
    subtitle = "Information Criteria for Model Selection"
  ) %>%
  tab_spanner(
    label = "Information Criteria",
    columns = c(AIC, BIC, SABIC)
  ) %>%
  fmt_number(
    columns = c(AIC, BIC, SABIC, Entropy),
    decimals = 2
  ) %>%
  tab_style(
    style = list(
      cell_fill(color = "lightblue"),
      cell_text(weight = "bold")
    ),
    locations = cells_body(
      rows = which.min(model_comparison_table$BIC)
    )
  ) %>%
  tab_footnote(
    footnote = "Highlighted row indicates best-fitting model based on BIC",
    locations = cells_column_labels(columns = BIC)
  ) %>%
  cols_align(align = "center")

Profiles	ModelType	Information Criteria			Entropy
LPA Model Comparison
Information Criteria for Model Selection
Profiles	ModelType	AIC	BIC¹	SABIC	Entropy
5	6	59,729.73	60,068.52	59,912.80	0.75
5	3	71,296.63	71,469.48	71,390.04	0.75
5	1	71,364.03	71,516.14	71,446.23	0.74
3	3	72,016.60	72,134.14	72,080.12	0.83
4	1	72,179.79	72,304.24	72,247.04	0.62
4	3	72,179.02	72,324.22	72,257.49	0.66
3	1	72,365.61	72,462.41	72,417.92	0.78
¹ Highlighted row indicates best-fitting model based on BIC

# Select best model based on BIC
best_model_info <- select_best_model(model_results)
best_model <- best_model_info$model
n_profiles_selected <- best_model_info$n_profiles
model_type_selected <- best_model_info$model_type

Selected Model: 6 with 5 profiles
BIC: 6.006852^{4}
Entropy: 0.754

4 Profile Analysis and Interpretation

# Extract profile means and create summary table
profile_means <- get_profile_means(best_model, profile_vars, profile_var_labels)

profile_means %>%
  gt() %>%
  tab_header(
    title = paste("Profile Means for", n_profiles_selected, "Profile Solution"),
    subtitle = "Mean values across profiling variables by latent profile"
  ) %>%
  fmt_number(
    columns = all_of(profile_vars),
    decimals = 3
  ) %>%
  data_color(
    columns = all_of(profile_vars),
    colors = scales::col_numeric(
      palette = c("lightblue", "white", "lightcoral"),
      domain = NULL
    )
  ) %>%
  cols_align(align = "center") %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels()
  ) %>%
  cols_label(.list = setNames(as.list(profile_var_labels[profile_vars]), profile_vars))

Profile	Cognitive	Behavioral	Social
Profile Means for 5 Profile Solution
Mean values across profiling variables by latent profile
Profile 1	4.046	1.540	4.447
Profile 2	4.058	1.041	3.477
Profile 3	3.310	3.240	4.399
Profile 4	4.954	1.184	2.171
Profile 5	1.790	1.160	4.205

# Profile sizes and proportions
profile_assignments <- get_profile_assignments(best_model)
profile_sizes <- create_profile_size_table(profile_assignments)

profile_sizes %>%
  gt() %>%
  tab_header(
    title = "Profile Membership Distribution",
    subtitle = "Sample sizes and proportions by profile"
  ) %>%
  fmt_number(
    columns = Proportion,
    decimals = 3
  ) %>%
  fmt_percent(
    columns = Percentage,
    decimals = 1,
    scale_values = FALSE  # your column already has 21.5, 10.9, etc.
  ) %>%
  cols_align(columns = everything(), align = "center") %>%
  grand_summary_rows(
    columns = N,
    fns = list(Total = ~sum(., na.rm = TRUE)),
    fmt = ~fmt_number(.)   # <- use `fmt`, not `formatter`
  )

	Profile	N	Proportion	Percentage
Profile Membership Distribution
Sample sizes and proportions by profile
	Profile 1	1362	0.183	18.3%
	Profile 2	3347	0.450	45.0%
	Profile 3	1256	0.169	16.9%
	Profile 4	459	0.062	6.2%
	Profile 5	1012	0.136	13.6%
Total	—	7,436.00	—	—

# Create profile plot
profile_plot <- create_profile_plot(profile_means, profile_vars, profile_var_labels, 
                                   profile_colors, analysis_title)
print(profile_plot)

# Generate automated interpretation
interpretation_text <- interpret_profiles(profile_means, profile_vars, profile_var_labels, 
                                         var_direction)

4.1 Profile Interpretation

4.1.1 Profile 1

Shows high levels of Cognitive (M = 4.046)
Shows moderate levels of Behavioral (M = 1.54)
Shows high levels of Social (M = 4.447)

Overall characterization: Moderate profile - average or mixed levels across dimensions

4.1.2 Profile 2

Shows high levels of Cognitive (M = 4.058)
Shows low levels of Behavioral (M = 1.041)
Shows low levels of Social (M = 3.477)

Overall characterization: Moderate profile - average or mixed levels across dimensions

4.1.3 Profile 3

Shows low levels of Cognitive (M = 3.31)
Shows high levels of Behavioral (M = 3.24)
Shows high levels of Social (M = 4.399)

Overall characterization: High-scoring profile - elevated across multiple dimensions

4.1.4 Profile 4

Shows high levels of Cognitive (M = 4.954)
Shows low levels of Behavioral (M = 1.184)
Shows low levels of Social (M = 2.171)

Overall characterization: Low-scoring profile - reduced levels across multiple dimensions

4.1.5 Profile 5

Shows low levels of Cognitive (M = 1.79)
Shows low levels of Behavioral (M = 1.16)
Shows high levels of Social (M = 4.205)

Overall characterization: Low-scoring profile - reduced levels across multiple dimensions

5 Correlation Analysis with Binary Variables

These two tables work together to provide both statistical evidence and practical understanding of profile differences. The correlation table reveals the strength and statistical significance of relationships, while the proportion table shows the actual rates within each group.

The Profile-Binary Variable Correlations table displays point-biserial correlations between being in each profile and having each binary characteristic. These correlation coefficients range from -1.00 to +1.00, where positive correlations indicate that being in a particular profile makes it more likely to have that characteristic, while negative correlations mean being in that profile makes it less likely to have the characteristic. The magnitude of these correlations helps interpret the strength of relationships: values between 0.10-0.29 represent small effects, 0.30-0.49 indicate medium effects, and values of 0.50 or higher suggest large effects. Each correlation coefficient is accompanied by a p-value that indicates statistical significance. P-values less than 0.05 appear highlighted in green and represent statistically significant relationships.

# Merge profile assignments with binary variables  
correlation_data <- merge_profile_binary_data(analysis_data, profile_assignments, binary_vars)

# Use actual number of profiles (not theoretical)
actual_n_profiles <- length(unique(profile_assignments$Class))

# Calculate correlations using actual number
correlation_results <- calculate_profile_correlations(correlation_data, actual_n_profiles, 
                                                     binary_vars, binary_var_labels)
# Create correlation table
correlation_results$correlation_table %>%
  gt() %>%
  tab_header(
    title = "Profile-Binary Variable Correlations",
    subtitle = "Point-biserial correlations between profile membership and binary indicators"
  ) %>%
  fmt_number(
    columns = ends_with(c("_r", "_p")),
    decimals = 3
  ) %>%
  data_color(
    columns = ends_with("_p"),
    colors = function(x) ifelse(x < alpha_level, "lightgreen", "white"),
    alpha = 0.8
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(
      columns = ends_with("_p"),
      rows = which(correlation_results$correlation_table %>% 
                   select(ends_with("_p")) %>% 
                   apply(1, function(x) any(x < alpha_level)))
    )
  ) %>%
  cols_align(align = "center") %>%
  tab_footnote(
    footnote = paste("Green highlighting indicates p <", alpha_level),
    locations = cells_column_labels(columns = ends_with("_p"))
  ) %>%
  cols_label(Variable = "Binary Variable")

Binary Variable	Profile_1_r	Profile_1_p¹	Profile_2_r	Profile_2_p¹	Profile_3_r	Profile_3_p¹	Profile_4_r	Profile_4_p¹	Profile_5_r	Profile_5_p¹
Profile-Binary Variable Correlations
Point-biserial correlations between profile membership and binary indicators
ER1	0.327	0.000	−0.404	0.000	0.418	0.000	−0.059	0.000	−0.199	0.000
ER2	0.197	0.000	−0.238	0.000	0.269	0.000	−0.046	0.000	−0.139	0.000
ER3	0.067	0.000	−0.121	0.000	0.162	0.000	−0.034	0.003	−0.053	0.000
¹ Green highlighting indicates p < 0.05

The Binary Variable Proportions by Profile table complements the correlation analysis by showing the actual percentage of people in each profile who possess each characteristic (where the binary variable equals 1). This table provides concrete, interpretable percentages that help understand the practical meaning of the correlations. For instance, the proportion table reveals that 92.0% of Profile 1 members actually have this characteristic compared to 0.0% of Profile 2 members. Similarly, for ER1, 93.4% of Profile 1 members have this characteristic while only 28.6% of Profile 2 members do.

# Analyze proportions of binary variables by profile
proportion_results <- analyze_binary_proportions(correlation_data, n_profiles_selected, 
                                                binary_vars, binary_var_labels)

proportion_results %>%
  gt() %>%
  tab_header(
    title = "Binary Variable Proportions by Profile",
    subtitle = "Proportion of individuals with each characteristic within each profile"
  ) %>%
  fmt_percent(
    columns = starts_with("Profile"),
    decimals = 1
  ) %>%
  data_color(
    columns = starts_with("Profile"),
    colors = scales::col_numeric(
      palette = c("white", "darkblue"),
      domain = c(0, 1)
    )
  ) %>%
  cols_align(align = "center") %>%
  cols_label(Variable = "Binary Variable")

Binary Variable	Profile_1	Profile_2	Profile_3	Profile_4	Profile_5
Binary Variable Proportions by Profile
Proportion of individuals with each characteristic within each profile
ER1	88.2%	31.5%	100.0%	42.3%	28.8%
ER2	26.1%	3.8%	32.1%	6.5%	0.9%
ER3	3.6%	0.0%	6.4%	0.0%	0.0%

# Create correlation heatmap
correlation_plot <- create_correlation_heatmap(correlation_results$correlation_matrix, 
                                             profile_colors, n_profiles_selected,
                                             binary_var_labels)
print(correlation_plot)

# Generate correlation interpretation
correlation_interpretation <- interpret_correlations(correlation_results, alpha_level, 
                                                    n_profiles_selected, binary_var_labels)

5.1 Correlation Interpretation

5.1.1 Profile 1 Associations

moderately positively associated with ER1 (r = 0.327, p = 0)
weakly positively associated with ER2 (r = 0.197, p = 0)
very weakly positively associated with ER3 (r = 0.067, p = 0)

5.1.2 Profile 2 Associations

moderately negatively associated with ER1 (r = -0.404, p = 0)
weakly negatively associated with ER2 (r = -0.238, p = 0)
weakly negatively associated with ER3 (r = -0.121, p = 0)

5.1.3 Profile 3 Associations

moderately positively associated with ER1 (r = 0.418, p = 0)
weakly positively associated with ER2 (r = 0.269, p = 0)
weakly positively associated with ER3 (r = 0.162, p = 0)

5.1.4 Profile 4 Associations

very weakly negatively associated with ER1 (r = -0.059, p = 0)
very weakly negatively associated with ER2 (r = -0.046, p = 0)
very weakly negatively associated with ER3 (r = -0.034, p = 0.003)

5.1.5 Profile 5 Associations

weakly negatively associated with ER1 (r = -0.199, p = 0)
weakly negatively associated with ER2 (r = -0.139, p = 0)
very weakly negatively associated with ER3 (r = -0.053, p = 0)

6 Summary and Conclusions

# Create LPA Summary Table combining multiple result tables
af_create_lpa_summary_table <- function(include_correlations = TRUE) {
  # Get profile numbers from the actual results
  n_profiles_actual <- length(unique(profile_assignments$Class))
  
  # Initialize summary table with profile information
  summary_table <- profile_sizes %>%
    select(Profile, N) %>%
    arrange(Profile)
  
  # Add profile means (Cognitive, Behavioral, Social)
  means_data <- profile_means %>%
    select(Profile, all_of(profile_vars)) %>%
    arrange(Profile)
  
  summary_table <- summary_table %>%
    left_join(means_data, by = "Profile")
  
  # Conditionally add correlation data for ER1, ER2, ER3 with significance asterisks
  if(include_correlations) {
    er_vars <- c("ER1", "ER2", "ER3")
    
    for(i in 1:n_profiles_actual) {
      profile_name <- paste0("Profile_", i)
      
      for(er_var in er_vars) {
        # Get correlation coefficient and p-value
        r_col <- paste0(profile_name, "_r")
        p_col <- paste0(profile_name, "_p")
        
        if(r_col %in% names(correlation_results$correlation_table) && 
           er_var %in% correlation_results$correlation_table$Variable) {
          
          r_val <- correlation_results$correlation_table %>%
            filter(Variable == er_var) %>%
            pull(!!sym(r_col))
          
          p_val <- correlation_results$correlation_table %>%
            filter(Variable == er_var) %>%
            pull(!!sym(p_col))
          
          # Format with asterisks for significance
          asterisks <- case_when(
            p_val < 0.001 ~ "***",
            p_val < 0.01 ~ "**", 
            p_val < 0.05 ~ "*",
            TRUE ~ ""
          )
          
          corr_formatted <- paste0(sprintf("%.3f", r_val), asterisks)
          
          # Add to summary table
          corr_col_name <- paste0(er_var, "_r")
          summary_table[i, corr_col_name] <- corr_formatted
        }
      }
    }
  }
  
  # Add proportion data for ER1, ER2, ER3
  er_vars <- c("ER1", "ER2", "ER3")
  for(er_var in er_vars) {
    if(er_var %in% proportion_results$Variable) {
      prop_data <- proportion_results %>%
        filter(Variable == er_var) %>%
        select(-Variable) %>%
        t() %>%
        as.data.frame() %>%
        rownames_to_column("Profile") %>%
        setNames(c("Profile", paste0(er_var, "_prop"))) %>%
        mutate(Profile = gsub("Profile_", "Profile ", Profile)) %>%
        arrange(Profile)
      
      summary_table <- summary_table %>%
        left_join(prop_data, by = "Profile")
    }
  }
  
  return(list(table = summary_table, include_correlations = include_correlations))
}

# Set flag for including correlations (change to FALSE to omit)
include_correlations_flag <- FALSE

# Generate the summary table
lpa_summary_result <- af_create_lpa_summary_table(include_correlations = include_correlations_flag)
lpa_summary <- lpa_summary_result$table

# Create the formatted table with gt
formatted_table <- lpa_summary %>%
  gt() %>%
  tab_header(
    title = "LPA Summary Table",
    subtitle = "Comprehensive profile characteristics and associations"
  ) %>%
  # Add column spanners for grouping
  tab_spanner(
    label = "Profile Size",
    columns = c(N)
  ) %>%
  tab_spanner(
    label = "Profile Means",
    columns = all_of(profile_vars)
  )

# Conditionally add correlation spanner and columns
if(lpa_summary_result$include_correlations) {
  formatted_table <- formatted_table %>%
    tab_spanner(
      label = "Correlations with Binary Variables",
      columns = c(ER1_r, ER2_r, ER3_r)
    )
}

# Add proportions spanner
formatted_table <- formatted_table %>%
  tab_spanner(
    label = "Proportions with Binary Characteristics",
    columns = c(ER1_prop, ER2_prop, ER3_prop)
  ) %>%
  # Format numeric columns
  fmt_number(
    columns = all_of(profile_vars),
    decimals = 3
  ) %>%
  fmt_percent(
    columns = c(ER1_prop, ER2_prop, ER3_prop),
    decimals = 1
  )

# Create column labels list
col_labels <- setNames(as.list(profile_var_labels[profile_vars]), profile_vars)
col_labels <- c(col_labels, list(ER1_prop = "ER1", ER2_prop = "ER2", ER3_prop = "ER3"))

if(lpa_summary_result$include_correlations) {
  col_labels <- c(col_labels, list(ER1_r = "ER1", ER2_r = "ER2", ER3_r = "ER3"))
}

formatted_table <- formatted_table %>%
  cols_label(.list = col_labels) %>%
  # Center align most columns
  cols_align(
    align = "center",
    columns = -c(Profile)
  )

# Conditionally add correlation-specific footnotes
if(lpa_summary_result$include_correlations) {
  formatted_table <- formatted_table %>%
    tab_footnote(
      footnote = "* p < 0.05, ** p < 0.01, *** p < 0.001",
      locations = cells_column_spanners(spanners = "Correlations with Binary Variables")
    ) %>%
    tab_footnote(
      footnote = "Point-biserial correlations between profile membership and binary indicators",
      locations = cells_column_spanners(spanners = "Correlations with Binary Variables")
    )
}

# Add proportions footnote and styling
formatted_table <- formatted_table %>%
  tab_footnote(
    footnote = "Percentage of individuals within each profile having the characteristic",
    locations = cells_column_spanners(spanners = "Proportions with Binary Characteristics")
  ) %>%
  # Style the table
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels()
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_spanners()
  )

# Display the table
formatted_table

Profile	Profile Size	Profile Means			Proportions with Binary Characteristics¹
LPA Summary Table
Comprehensive profile characteristics and associations
Profile	N	Cognitive	Behavioral	Social	ER1	ER2	ER3
Profile 1	1362	4.046	1.540	4.447	88.2%	26.1%	3.6%
Profile 2	3347	4.058	1.041	3.477	31.5%	3.8%	0.0%
Profile 3	1256	3.310	3.240	4.399	100.0%	32.1%	6.4%
Profile 4	459	4.954	1.184	2.171	42.3%	6.5%	0.0%
Profile 5	1012	1.790	1.160	4.205	28.8%	0.9%	0.0%
¹ Percentage of individuals within each profile having the characteristic

# Create final summary
final_summary <- create_final_summary(best_model_info, profile_means, 
                                     correlation_results, alpha_level,
                                     profile_var_labels, binary_var_labels)

6.1 Key Findings

This LPA analysis identified 5 distinct profiles based on the following variables: Cognitive, Behavioral, Social with good classification quality (Entropy = 0.754). The profiles show meaningful differentiation across the analyzed dimensions.

6.1.1 Correlation Analysis Results

Binary variables analyzed: ER1, ER2, ER3
Total correlation tests: 15
Significant associations (p < 0.05): 15
Proportion significant: 100%

6.1.2 Methodological Notes

Model selection based on Bayesian Information Criterion (BIC)
Point-biserial correlations used for profile-binary variable associations
All analyses conducted on cases with complete data on profile variables
Results should be interpreted within the context of the specific sample and measures used

6.2 Model Quality Assessment

# Additional model diagnostics
entropy_value <- best_model_info$entropy
min_profile_size <- min(profile_sizes$N)
max_profile_size <- max(profile_sizes$N)

if(entropy_value > 0.8) {
  classification_quality <- "Good (entropy > 0.8)"
} else if(entropy_value > 0.6) {
  classification_quality <- "Acceptable (entropy > 0.6)"
} else {
  classification_quality <- "Poor (entropy < 0.6) - consider fewer profiles"
}

profile_size_warning <- NULL
if(min_profile_size/nrow(analysis_data) < 0.05) {
  profile_size_warning <- "- **Warning**: Smallest profile contains < 5% of sample. Consider fewer profiles"
}

Entropy: 0.754 (values > 0.8 indicate good classification quality)
Smallest profile: 459 individuals (6.2%)
Largest profile: 3347 individuals (45%)
Classification quality: Acceptable (entropy > 0.6)

Latent Profile Analysis

IL Political Extremism

Amir Freund

2025-08-29