# Load required libraries
library(tidyverse)
library(tidyLPA)
library(gt)
library(ggplot2)
library(corrr)
library(plotly)
library(RColorBrewer)
library(scales)
library(patchwork)
library(afcommon)
source("af_common_add_ons.R")
# Source helper functions
source("af_gauge_indices.R")
source("af_lpa.R")
This analysis aims to identify distinct latent profiles within our sample based on multiple continuous variables. Following profile identification, we examine how these profiles correlate with various binary indicators to understand the broader behavioral and attitudinal patterns associated with different profiles.
Latent Profile Analysis (LPA) is a person-centered statistical approach that identifies unobserved subgroups (latent classes) within a population based on patterns of responses across multiple continuous variables. Unlike variable-centered approaches that examine relationships between variables, LPA focuses on identifying distinct groups of individuals who share similar response patterns.
# =============================================================================
# LPA CONFIGURATION SECTION - MODIFY ALL PARAMETERS HERE
# =============================================================================
# Profile variables (continuous indicators for LPA)
profile_vars <- c("pe_ideology", "pe_violence", "pe_intolerance")
# Variable labels for display (must match order of profile_vars)
profile_var_labels <- c("Cognitive", "Behavioral", "Social")
# Binary correlation variables
binary_vars <- c("i_er1", "i_er2", "i_er3")
# Binary variable labels for display (must match order of binary_vars)
binary_var_labels <- c("ER1", "ER2", "ER3")
# Number of profiles to test (range)
min_profiles <- 3
max_profiles <- 5
# Model specifications to test (tidyLPA models)
models_to_test <- c(1, 2, 3, 6) # c(1, 2, 3, 6)
# 1=equal variances/equal covariances,
# 2=varying variances/equal covariances,
# 3=equal variances/varying covariances,
# 6=varying variances/varying covariances
# Analysis title and subtitle for reports
analysis_title <- "Latent Profile Analysis Results"
analysis_subtitle <- "Identification and characterization of latent profiles"
# Variable interpretation direction (TRUE = higher values are "more extreme/higher")
# Set to FALSE if lower values represent higher levels of the construct
var_direction <- c(TRUE, TRUE, TRUE) # Must match length of profile_vars
# Significance level for interpretations
alpha_level <- 0.05
# Color palette for visualizations
profile_colors <- viridis::viridis(max_profiles)
# Random seed for reproducibility
set.seed(123)
# Create named vectors for easier reference
names(profile_var_labels) <- profile_vars
names(binary_var_labels) <- binary_vars
names(var_direction) <- profile_vars
# Load and prepare data
df <- as.data.frame(readRDS("Israel Survey/data/il_pe.RDS"))
indices <- af_gauge_indices(df, pop_var1 = "Wave", comm_var1 = "pe_left_center_right",
threshold_type = "MAD", k_factor = 1.5)
# Analyze data of one wave
data <- indices$df # %>% filter(Wave == "First")
# Validate data structure
data_is_valid <- validate_data(data, profile_vars, binary_vars)
# Prepare analysis data
required_vars <- c(profile_vars, binary_vars)
missing_vars <- setdiff(required_vars, names(data))
if(length(missing_vars) > 0) {
stop(paste("Missing variables:", paste(missing_vars, collapse = ", ")))
}
# Remove cases with missing values on profile variables
complete_cases <- complete.cases(data[profile_vars])
analysis_data <- data[complete_cases, ]
# Fit multiple LPA models
model_results <- fit_lpa_models(analysis_data, profile_vars,
min_profiles, max_profiles, models_to_test)
# Create model comparison table
model_comparison_table <- create_model_comparison_table(model_results)
model_comparison_table %>%
gt() %>%
tab_header(
title = "LPA Model Comparison",
subtitle = "Information Criteria for Model Selection"
) %>%
tab_spanner(
label = "Information Criteria",
columns = c(AIC, BIC, SABIC)
) %>%
fmt_number(
columns = c(AIC, BIC, SABIC, Entropy),
decimals = 2
) %>%
tab_style(
style = list(
cell_fill(color = "lightblue"),
cell_text(weight = "bold")
),
locations = cells_body(
rows = which.min(model_comparison_table$BIC)
)
) %>%
tab_footnote(
footnote = "Highlighted row indicates best-fitting model based on BIC",
locations = cells_column_labels(columns = BIC)
) %>%
cols_align(align = "center")
LPA Model Comparison | |||||
Information Criteria for Model Selection | |||||
Profiles | ModelType | Information Criteria | Entropy | ||
---|---|---|---|---|---|
AIC | BIC1 | SABIC | |||
5 | 6 | 59,729.73 | 60,068.52 | 59,912.80 | 0.75 |
5 | 3 | 71,296.63 | 71,469.48 | 71,390.04 | 0.75 |
5 | 1 | 71,364.03 | 71,516.14 | 71,446.23 | 0.74 |
3 | 3 | 72,016.60 | 72,134.14 | 72,080.12 | 0.83 |
4 | 1 | 72,179.79 | 72,304.24 | 72,247.04 | 0.62 |
4 | 3 | 72,179.02 | 72,324.22 | 72,257.49 | 0.66 |
3 | 1 | 72,365.61 | 72,462.41 | 72,417.92 | 0.78 |
1 Highlighted row indicates best-fitting model based on BIC |
# Select best model based on BIC
best_model_info <- select_best_model(model_results)
best_model <- best_model_info$model
n_profiles_selected <- best_model_info$n_profiles
model_type_selected <- best_model_info$model_type
# Extract profile means and create summary table
profile_means <- get_profile_means(best_model, profile_vars, profile_var_labels)
profile_means %>%
gt() %>%
tab_header(
title = paste("Profile Means for", n_profiles_selected, "Profile Solution"),
subtitle = "Mean values across profiling variables by latent profile"
) %>%
fmt_number(
columns = all_of(profile_vars),
decimals = 3
) %>%
data_color(
columns = all_of(profile_vars),
colors = scales::col_numeric(
palette = c("lightblue", "white", "lightcoral"),
domain = NULL
)
) %>%
cols_align(align = "center") %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels()
) %>%
cols_label(.list = setNames(as.list(profile_var_labels[profile_vars]), profile_vars))
Profile Means for 5 Profile Solution | |||
Mean values across profiling variables by latent profile | |||
Profile | Cognitive | Behavioral | Social |
---|---|---|---|
Profile 1 | 4.046 | 1.540 | 4.447 |
Profile 2 | 4.058 | 1.041 | 3.477 |
Profile 3 | 3.310 | 3.240 | 4.399 |
Profile 4 | 4.954 | 1.184 | 2.171 |
Profile 5 | 1.790 | 1.160 | 4.205 |
# Profile sizes and proportions
profile_assignments <- get_profile_assignments(best_model)
profile_sizes <- create_profile_size_table(profile_assignments)
profile_sizes %>%
gt() %>%
tab_header(
title = "Profile Membership Distribution",
subtitle = "Sample sizes and proportions by profile"
) %>%
fmt_number(
columns = Proportion,
decimals = 3
) %>%
fmt_percent(
columns = Percentage,
decimals = 1,
scale_values = FALSE # your column already has 21.5, 10.9, etc.
) %>%
cols_align(columns = everything(), align = "center") %>%
grand_summary_rows(
columns = N,
fns = list(Total = ~sum(., na.rm = TRUE)),
fmt = ~fmt_number(.) # <- use `fmt`, not `formatter`
)
Profile Membership Distribution | ||||
Sample sizes and proportions by profile | ||||
Profile | N | Proportion | Percentage | |
---|---|---|---|---|
Profile 1 | 1362 | 0.183 | 18.3% | |
Profile 2 | 3347 | 0.450 | 45.0% | |
Profile 3 | 1256 | 0.169 | 16.9% | |
Profile 4 | 459 | 0.062 | 6.2% | |
Profile 5 | 1012 | 0.136 | 13.6% | |
Total | — | 7,436.00 | — | — |
# Create profile plot
profile_plot <- create_profile_plot(profile_means, profile_vars, profile_var_labels,
profile_colors, analysis_title)
print(profile_plot)
# Generate automated interpretation
interpretation_text <- interpret_profiles(profile_means, profile_vars, profile_var_labels,
var_direction)
Overall characterization: Moderate profile - average or mixed levels across dimensions
Overall characterization: Moderate profile - average or mixed levels across dimensions
Overall characterization: High-scoring profile - elevated across multiple dimensions
Overall characterization: Low-scoring profile - reduced levels across multiple dimensions
Overall characterization: Low-scoring profile - reduced levels across multiple dimensions
These two tables work together to provide both statistical evidence and practical understanding of profile differences. The correlation table reveals the strength and statistical significance of relationships, while the proportion table shows the actual rates within each group.
The Profile-Binary Variable Correlations table displays point-biserial correlations between being in each profile and having each binary characteristic. These correlation coefficients range from -1.00 to +1.00, where positive correlations indicate that being in a particular profile makes it more likely to have that characteristic, while negative correlations mean being in that profile makes it less likely to have the characteristic. The magnitude of these correlations helps interpret the strength of relationships: values between 0.10-0.29 represent small effects, 0.30-0.49 indicate medium effects, and values of 0.50 or higher suggest large effects. Each correlation coefficient is accompanied by a p-value that indicates statistical significance. P-values less than 0.05 appear highlighted in green and represent statistically significant relationships.
# Merge profile assignments with binary variables
correlation_data <- merge_profile_binary_data(analysis_data, profile_assignments, binary_vars)
# Use actual number of profiles (not theoretical)
actual_n_profiles <- length(unique(profile_assignments$Class))
# Calculate correlations using actual number
correlation_results <- calculate_profile_correlations(correlation_data, actual_n_profiles,
binary_vars, binary_var_labels)
# Create correlation table
correlation_results$correlation_table %>%
gt() %>%
tab_header(
title = "Profile-Binary Variable Correlations",
subtitle = "Point-biserial correlations between profile membership and binary indicators"
) %>%
fmt_number(
columns = ends_with(c("_r", "_p")),
decimals = 3
) %>%
data_color(
columns = ends_with("_p"),
colors = function(x) ifelse(x < alpha_level, "lightgreen", "white"),
alpha = 0.8
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(
columns = ends_with("_p"),
rows = which(correlation_results$correlation_table %>%
select(ends_with("_p")) %>%
apply(1, function(x) any(x < alpha_level)))
)
) %>%
cols_align(align = "center") %>%
tab_footnote(
footnote = paste("Green highlighting indicates p <", alpha_level),
locations = cells_column_labels(columns = ends_with("_p"))
) %>%
cols_label(Variable = "Binary Variable")
Profile-Binary Variable Correlations | ||||||||||
Point-biserial correlations between profile membership and binary indicators | ||||||||||
Binary Variable | Profile_1_r | Profile_1_p1 | Profile_2_r | Profile_2_p1 | Profile_3_r | Profile_3_p1 | Profile_4_r | Profile_4_p1 | Profile_5_r | Profile_5_p1 |
---|---|---|---|---|---|---|---|---|---|---|
ER1 | 0.327 | 0.000 | −0.404 | 0.000 | 0.418 | 0.000 | −0.059 | 0.000 | −0.199 | 0.000 |
ER2 | 0.197 | 0.000 | −0.238 | 0.000 | 0.269 | 0.000 | −0.046 | 0.000 | −0.139 | 0.000 |
ER3 | 0.067 | 0.000 | −0.121 | 0.000 | 0.162 | 0.000 | −0.034 | 0.003 | −0.053 | 0.000 |
1 Green highlighting indicates p < 0.05 |
The Binary Variable Proportions by Profile table complements the correlation analysis by showing the actual percentage of people in each profile who possess each characteristic (where the binary variable equals 1). This table provides concrete, interpretable percentages that help understand the practical meaning of the correlations. For instance, the proportion table reveals that 92.0% of Profile 1 members actually have this characteristic compared to 0.0% of Profile 2 members. Similarly, for ER1, 93.4% of Profile 1 members have this characteristic while only 28.6% of Profile 2 members do.
# Analyze proportions of binary variables by profile
proportion_results <- analyze_binary_proportions(correlation_data, n_profiles_selected,
binary_vars, binary_var_labels)
proportion_results %>%
gt() %>%
tab_header(
title = "Binary Variable Proportions by Profile",
subtitle = "Proportion of individuals with each characteristic within each profile"
) %>%
fmt_percent(
columns = starts_with("Profile"),
decimals = 1
) %>%
data_color(
columns = starts_with("Profile"),
colors = scales::col_numeric(
palette = c("white", "darkblue"),
domain = c(0, 1)
)
) %>%
cols_align(align = "center") %>%
cols_label(Variable = "Binary Variable")
Binary Variable Proportions by Profile | |||||
Proportion of individuals with each characteristic within each profile | |||||
Binary Variable | Profile_1 | Profile_2 | Profile_3 | Profile_4 | Profile_5 |
---|---|---|---|---|---|
ER1 | 88.2% | 31.5% | 100.0% | 42.3% | 28.8% |
ER2 | 26.1% | 3.8% | 32.1% | 6.5% | 0.9% |
ER3 | 3.6% | 0.0% | 6.4% | 0.0% | 0.0% |
# Create correlation heatmap
correlation_plot <- create_correlation_heatmap(correlation_results$correlation_matrix,
profile_colors, n_profiles_selected,
binary_var_labels)
print(correlation_plot)
# Generate correlation interpretation
correlation_interpretation <- interpret_correlations(correlation_results, alpha_level,
n_profiles_selected, binary_var_labels)
# Create LPA Summary Table combining multiple result tables
af_create_lpa_summary_table <- function(include_correlations = TRUE) {
# Get profile numbers from the actual results
n_profiles_actual <- length(unique(profile_assignments$Class))
# Initialize summary table with profile information
summary_table <- profile_sizes %>%
select(Profile, N) %>%
arrange(Profile)
# Add profile means (Cognitive, Behavioral, Social)
means_data <- profile_means %>%
select(Profile, all_of(profile_vars)) %>%
arrange(Profile)
summary_table <- summary_table %>%
left_join(means_data, by = "Profile")
# Conditionally add correlation data for ER1, ER2, ER3 with significance asterisks
if(include_correlations) {
er_vars <- c("ER1", "ER2", "ER3")
for(i in 1:n_profiles_actual) {
profile_name <- paste0("Profile_", i)
for(er_var in er_vars) {
# Get correlation coefficient and p-value
r_col <- paste0(profile_name, "_r")
p_col <- paste0(profile_name, "_p")
if(r_col %in% names(correlation_results$correlation_table) &&
er_var %in% correlation_results$correlation_table$Variable) {
r_val <- correlation_results$correlation_table %>%
filter(Variable == er_var) %>%
pull(!!sym(r_col))
p_val <- correlation_results$correlation_table %>%
filter(Variable == er_var) %>%
pull(!!sym(p_col))
# Format with asterisks for significance
asterisks <- case_when(
p_val < 0.001 ~ "***",
p_val < 0.01 ~ "**",
p_val < 0.05 ~ "*",
TRUE ~ ""
)
corr_formatted <- paste0(sprintf("%.3f", r_val), asterisks)
# Add to summary table
corr_col_name <- paste0(er_var, "_r")
summary_table[i, corr_col_name] <- corr_formatted
}
}
}
}
# Add proportion data for ER1, ER2, ER3
er_vars <- c("ER1", "ER2", "ER3")
for(er_var in er_vars) {
if(er_var %in% proportion_results$Variable) {
prop_data <- proportion_results %>%
filter(Variable == er_var) %>%
select(-Variable) %>%
t() %>%
as.data.frame() %>%
rownames_to_column("Profile") %>%
setNames(c("Profile", paste0(er_var, "_prop"))) %>%
mutate(Profile = gsub("Profile_", "Profile ", Profile)) %>%
arrange(Profile)
summary_table <- summary_table %>%
left_join(prop_data, by = "Profile")
}
}
return(list(table = summary_table, include_correlations = include_correlations))
}
# Set flag for including correlations (change to FALSE to omit)
include_correlations_flag <- FALSE
# Generate the summary table
lpa_summary_result <- af_create_lpa_summary_table(include_correlations = include_correlations_flag)
lpa_summary <- lpa_summary_result$table
# Create the formatted table with gt
formatted_table <- lpa_summary %>%
gt() %>%
tab_header(
title = "LPA Summary Table",
subtitle = "Comprehensive profile characteristics and associations"
) %>%
# Add column spanners for grouping
tab_spanner(
label = "Profile Size",
columns = c(N)
) %>%
tab_spanner(
label = "Profile Means",
columns = all_of(profile_vars)
)
# Conditionally add correlation spanner and columns
if(lpa_summary_result$include_correlations) {
formatted_table <- formatted_table %>%
tab_spanner(
label = "Correlations with Binary Variables",
columns = c(ER1_r, ER2_r, ER3_r)
)
}
# Add proportions spanner
formatted_table <- formatted_table %>%
tab_spanner(
label = "Proportions with Binary Characteristics",
columns = c(ER1_prop, ER2_prop, ER3_prop)
) %>%
# Format numeric columns
fmt_number(
columns = all_of(profile_vars),
decimals = 3
) %>%
fmt_percent(
columns = c(ER1_prop, ER2_prop, ER3_prop),
decimals = 1
)
# Create column labels list
col_labels <- setNames(as.list(profile_var_labels[profile_vars]), profile_vars)
col_labels <- c(col_labels, list(ER1_prop = "ER1", ER2_prop = "ER2", ER3_prop = "ER3"))
if(lpa_summary_result$include_correlations) {
col_labels <- c(col_labels, list(ER1_r = "ER1", ER2_r = "ER2", ER3_r = "ER3"))
}
formatted_table <- formatted_table %>%
cols_label(.list = col_labels) %>%
# Center align most columns
cols_align(
align = "center",
columns = -c(Profile)
)
# Conditionally add correlation-specific footnotes
if(lpa_summary_result$include_correlations) {
formatted_table <- formatted_table %>%
tab_footnote(
footnote = "* p < 0.05, ** p < 0.01, *** p < 0.001",
locations = cells_column_spanners(spanners = "Correlations with Binary Variables")
) %>%
tab_footnote(
footnote = "Point-biserial correlations between profile membership and binary indicators",
locations = cells_column_spanners(spanners = "Correlations with Binary Variables")
)
}
# Add proportions footnote and styling
formatted_table <- formatted_table %>%
tab_footnote(
footnote = "Percentage of individuals within each profile having the characteristic",
locations = cells_column_spanners(spanners = "Proportions with Binary Characteristics")
) %>%
# Style the table
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels()
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_spanners()
)
# Display the table
formatted_table
LPA Summary Table | |||||||
Comprehensive profile characteristics and associations | |||||||
Profile | Profile Size | Profile Means | Proportions with Binary Characteristics1 | ||||
---|---|---|---|---|---|---|---|
N | Cognitive | Behavioral | Social | ER1 | ER2 | ER3 | |
Profile 1 | 1362 | 4.046 | 1.540 | 4.447 | 88.2% | 26.1% | 3.6% |
Profile 2 | 3347 | 4.058 | 1.041 | 3.477 | 31.5% | 3.8% | 0.0% |
Profile 3 | 1256 | 3.310 | 3.240 | 4.399 | 100.0% | 32.1% | 6.4% |
Profile 4 | 459 | 4.954 | 1.184 | 2.171 | 42.3% | 6.5% | 0.0% |
Profile 5 | 1012 | 1.790 | 1.160 | 4.205 | 28.8% | 0.9% | 0.0% |
1 Percentage of individuals within each profile having the characteristic |
# Create final summary
final_summary <- create_final_summary(best_model_info, profile_means,
correlation_results, alpha_level,
profile_var_labels, binary_var_labels)
This LPA analysis identified 5 distinct profiles based on the following variables: Cognitive, Behavioral, Social with good classification quality (Entropy = 0.754). The profiles show meaningful differentiation across the analyzed dimensions.
# Additional model diagnostics
entropy_value <- best_model_info$entropy
min_profile_size <- min(profile_sizes$N)
max_profile_size <- max(profile_sizes$N)
if(entropy_value > 0.8) {
classification_quality <- "Good (entropy > 0.8)"
} else if(entropy_value > 0.6) {
classification_quality <- "Acceptable (entropy > 0.6)"
} else {
classification_quality <- "Poor (entropy < 0.6) - consider fewer profiles"
}
profile_size_warning <- NULL
if(min_profile_size/nrow(analysis_data) < 0.05) {
profile_size_warning <- "- **Warning**: Smallest profile contains < 5% of sample. Consider fewer profiles"
}