library(tidyverse)
library(psych)
library(naniar)
library(readxl)
library(janitor)
library(skimr)
library(Hmisc)
library(paran)
library(R.utils)
library(writexl)
library(mice)
library(kableExtra)
This Supplementary Material documents the robust principal component analysis (PCA) workflow used to derive the IMuSSA composite structure. We provide a fully reproducible pipeline that: (i) preprocesses the municipal indicator set, including descriptive summaries and missing-data diagnostics; (ii) evaluates the missingness mechanism using Little’s MCAR test; (iii) performs multiple imputation via MICE followed by z-score standardization; (iv) determines the number of components through parallel analysis based on Spearman correlations; (v) implements iterative, listwise PCA with ProMax rotation and explicit rules for item retention (loading thresholds, cross-loadings, weak components); and (vi) assesses the internal consistency of the final rotated components using reliability coefficients. Together, these steps provide a transparent audit trail for the dimensionality-reduction strategy underlying the IMuSSA index.
This section defines the central configuration object that governs all subsequent analyses. We specify the indicator set used in the PCA, the rotation method (ProMax), and the decision rules for item retention (minimum loading, cross-loading tolerance, and minimum number of items per component), as well as the tuning parameters for parallel analysis and MICE imputation. The same configuration also sets directory paths and a fixed random seed, ensuring that all outputs are reproducible and internally consistent across the Supplementary Material.
CFG <- list(
BASE_DIR = ".",
DATA_FILE = "./database/basePCAcompleta.xlsx",
DIR_OUT = "./output",
# Variables
indicators = c(
"b3","b4","b5","c2","c3","d1","f1","f2","h1","h2","i1",
"ias2","ias4","ias6","ias7","ias8","ias9","ig4","ig8",
"ig51","ig52","ig53","ii1","ii3","ii5","ii6","ii10","ii11",
"im1","n1","n2","n3","o6","p1","pa1","pa3","q1","r2",
"s1","s2","v3","v4","v6","w1","w2","x1","z4"
),
# PCA and rotation
PROMAX_M = 4,
# Decision rules
carga_min = 0.30,
cross_delta = 0.10,
min_items_comp = 3,
corr_thr = 0.70,
# Parallel analysis
ap_iter = 6000,
ap_centile = 95,
ap_max_rows = 5570,
# MICE
mice_m = 5,
mice_maxit = 20,
mice_method = "pmm",
# Reproducibility
SEED = 35
)
DIR_OUT <- CFG$DIR_OUT
dir.create(DIR_OUT, recursive = TRUE, showWarnings = FALSE)
set.seed(CFG$SEED)
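As a quick sanity check on the configuration (illustrative only, not part of the audited outputs), the number of indicators listed in CFG and the availability of the input workbook can be confirmed directly; the count should match the 47 variables summarized in the descriptive tables below.
# Illustrative consistency checks on the configuration object
length(CFG$indicators)      # expected: 47 indicators
file.exists(CFG$DATA_FILE)  # TRUE if the input workbook exists at the configured path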
This section defines auxiliary functions used throughout the PCA workflow. They round numeric outputs before export, automate multi-sheet Excel export, coerce matrices to the loadings class expected by the rotation routines, and provide a robust wrapper for ProMax rotation with a base-R fallback, ensuring stable and consistent computation.
# Round numeric columns to 6 decimals before writing
round_df6 <- function(df) {
df <- as.data.frame(df)
num_cols <- vapply(df, is.numeric, logical(1))
if (any(num_cols)) {
df[num_cols] <- lapply(df[num_cols], round, 6)
}
df
}
# Write a named list of data frames to a multi-sheet .xlsx, rounding numeric columns to 6 decimals
write_xlsx_6 <- function(path, sheets_named_list) {
dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE)
out <- lapply(sheets_named_list, round_df6)
writexl::write_xlsx(out, path = path)
}
# Coerce a matrix to the "loadings" class expected by the rotation routines
to_loadings <- function(x) {
x <- as.matrix(x)
class(x) <- "loadings"
x
}
# ProMax rotation wrapper: tries psych::Promax (with, then without, Kaiser normalization)
# and falls back to stats::promax; returns the rotated pattern matrix and Phi
promax_rotate <- function(L_unrot, m = CFG$PROMAX_M) {
pro <- try(psych::Promax(L_unrot, m = m, normalize = TRUE), silent = TRUE)
if (inherits(pro, "try-error") || is.null(pro$loadings)) {
pro <- try(psych::Promax(L_unrot, m = m), silent = TRUE)
}
if (!inherits(pro, "try-error") && !is.null(pro$loadings)) {
Lrot <- pro$loadings
Phi <- if (!is.null(pro$Phi)) pro$Phi else diag(ncol(Lrot))
return(list(L = to_loadings(Lrot), Phi = Phi))
}
spro <- stats::promax(to_loadings(L_unrot), m = m)
Lrot <- if (!is.null(spro$loadings)) spro$loadings else spro
list(L = to_loadings(Lrot), Phi = diag(ncol(as.matrix(Lrot))))
}
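To illustrate the rotation wrapper, the following minimal sketch applies promax_rotate() to a small unrotated loading matrix; the values are hypothetical and serve only to show the returned objects, not IMuSSA estimates.
# Toy 4 x 2 unrotated loading matrix (hypothetical values, for illustration only)
L_toy <- matrix(
  c(0.70, 0.10,
    0.65, 0.05,
    0.15, 0.72,
    0.08, 0.68),
  ncol = 2, byrow = TRUE,
  dimnames = list(paste0("item", 1:4), c("PC1", "PC2"))
)
rot_toy <- promax_rotate(L_toy, m = CFG$PROMAX_M)
round(unclass(rot_toy$L), 3)  # rotated pattern loadings
round(rot_toy$Phi, 3)         # component correlation matrix (Phi)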
This section prepares the IMuSSA indicator dataset for PCA and characterizes its empirical properties. We construct cleaned numeric variables, report summary statistics and distributional plots, map missing-data patterns, and examine pairwise Spearman correlations to flag highly collinear indicators. We then implement a robust version of Little’s MCAR test, with automated pruning of near-collinear variables, to assess whether missingness can be treated as approximately random for subsequent analyses.
raw <- readxl::read_excel(CFG$DATA_FILE, sheet = "Planilha1") %>%
janitor::clean_names()
data0 <- raw %>%
dplyr::select(dplyr::all_of(CFG$indicators)) %>%
dplyr::mutate(dplyr::across(dplyr::everything(), as.numeric))
dir_desc <- file.path(DIR_OUT, "01_descriptive")
dir.create(dir_desc, recursive = TRUE, showWarnings = FALSE)
descr_psych <- psych::describe(data0) %>%
as.data.frame() %>%
tibble::rownames_to_column("variable")
write_xlsx_6(
file.path(dir_desc, "descriptive_psych.xlsx"),
list(psych_describe = descr_psych)
)
descr_psych_print <- descr_psych %>%
dplyr::select(
Variable = variable,
N = n,
Mean = mean,
SD = sd,
Median = median,
Min = min,
Max = max,
Skewness = skew,
Kurtosis = kurtosis
)
kbl(
descr_psych_print,
caption = "Summary statistics for indicators",
align = "lcccccccc",
digits = 2,
escape = FALSE
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center",
font_size = 12
)
| Variable | N | Mean | SD | Median | Min | Max | Skewness | Kurtosis |
|---|---|---|---|---|---|---|---|---|
| b3 | 5558 | 7.19 | 8.33 | 4.11 | 0.00 | 74.43 | 1.73 | 4.26 |
| b4 | 5558 | 6.16 | 7.22 | 3.82 | 0.00 | 87.50 | 2.40 | 9.25 |
| b5 | 5563 | 0.91 | 13.00 | 0.00 | 0.00 | 912.00 | 62.76 | 4342.62 |
| c2 | 5570 | 0.11 | 0.13 | 0.06 | 0.00 | 1.00 | 2.18 | 5.79 |
| c3 | 5570 | 0.09 | 0.13 | 0.04 | 0.00 | 1.00 | 2.54 | 8.32 |
| d1 | 5502 | 0.42 | 0.28 | 0.38 | 0.00 | 1.00 | 0.44 | -0.54 |
| f1 | 5570 | 0.27 | 0.15 | 0.26 | 0.00 | 0.86 | 0.44 | -0.29 |
| f2 | 5570 | 0.07 | 0.07 | 0.04 | 0.00 | 0.51 | 1.72 | 3.60 |
| h1 | 5570 | 0.23 | 0.14 | 0.21 | 0.00 | 0.79 | 0.64 | -0.08 |
| h2 | 5570 | 0.06 | 0.05 | 0.05 | 0.00 | 0.46 | 1.74 | 4.71 |
| i1 | 5570 | 0.46 | 0.16 | 0.44 | 0.00 | 1.00 | 0.45 | 0.05 |
| ias2 | 5570 | 0.39 | 0.19 | 0.38 | 0.00 | 1.00 | 0.39 | 0.30 |
| ias4 | 5570 | 0.11 | 0.13 | 0.05 | 0.00 | 0.93 | 1.98 | 4.64 |
| ias6 | 5570 | 0.19 | 0.20 | 0.11 | 0.00 | 1.00 | 1.14 | 0.40 |
| ias7 | 5570 | 0.15 | 0.20 | 0.04 | 0.00 | 0.98 | 1.64 | 1.96 |
| ias8 | 5570 | 0.68 | 0.30 | 0.70 | 0.00 | 1.00 | -0.35 | -1.24 |
| ias9 | 5570 | 0.07 | 0.36 | 0.00 | 0.00 | 7.00 | 8.55 | 99.96 |
| ig4 | 5570 | 0.02 | 0.15 | 0.00 | 0.00 | 1.00 | 6.42 | 39.21 |
| ig8 | 5563 | 0.09 | 0.10 | 0.06 | 0.00 | 0.77 | 2.20 | 6.05 |
| ig51 | 5563 | 0.04 | 0.07 | 0.01 | 0.00 | 1.00 | 4.42 | 28.44 |
| ig52 | 5563 | 0.05 | 0.10 | 0.01 | 0.00 | 1.00 | 3.19 | 13.43 |
| ig53 | 5563 | 0.04 | 0.07 | 0.01 | 0.00 | 0.76 | 3.97 | 23.13 |
| ii1 | 5553 | 0.24 | 0.17 | 0.20 | 0.00 | 0.98 | 1.19 | 1.27 |
| ii3 | 5312 | 0.71 | 0.25 | 0.76 | 0.00 | 1.00 | -0.64 | -0.51 |
| ii5 | 5557 | 0.84 | 0.16 | 0.90 | 0.00 | 1.00 | -2.01 | 4.44 |
| ii6 | 5523 | 0.24 | 0.17 | 0.22 | 0.00 | 1.00 | 0.61 | -0.27 |
| ii10 | 5482 | 0.39 | 0.23 | 0.37 | 0.00 | 1.00 | 0.43 | -0.41 |
| ii11 | 5570 | 0.82 | 0.20 | 0.88 | 0.00 | 1.00 | -1.76 | 3.25 |
| im1 | 5570 | 0.33 | 0.20 | 0.29 | 0.00 | 1.00 | 0.45 | -0.69 |
| n1 | 5556 | 0.34 | 0.32 | 0.21 | 0.00 | 1.00 | 0.82 | -0.75 |
| n2 | 5556 | 0.43 | 0.30 | 0.43 | 0.00 | 1.00 | 0.10 | -1.30 |
| n3 | 5570 | 0.81 | 0.17 | 0.86 | 0.00 | 1.00 | -1.70 | 3.27 |
| o6 | 4900 | 0.53 | 0.50 | 1.00 | 0.00 | 1.00 | -0.10 | -1.99 |
| p1 | 5570 | 329749.70 | 1166229.46 | 92783.00 | -1264224.00 | 35247300.00 | 14.25 | 298.86 |
| pa1 | 5563 | 0.17 | 0.15 | 0.14 | 0.00 | 0.82 | 0.92 | 0.25 |
| pa3 | 5493 | 0.63 | 0.29 | 0.68 | 0.00 | 1.00 | -0.55 | -0.76 |
| q1 | 5570 | 0.05 | 0.21 | 0.00 | 0.00 | 1.00 | 4.34 | 16.80 |
| r2 | 5514 | 0.36 | 0.23 | 0.33 | 0.00 | 1.00 | 0.49 | -0.74 |
| s1 | 5570 | 0.71 | 0.45 | 1.00 | 0.00 | 1.00 | -0.93 | -1.13 |
| s2 | 5570 | 0.79 | 0.41 | 1.00 | 0.00 | 1.00 | -1.39 | -0.07 |
| v3 | 5563 | 0.36 | 0.27 | 0.30 | 0.00 | 1.00 | 0.56 | -0.81 |
| v4 | 5563 | 0.02 | 0.04 | 0.00 | 0.00 | 0.93 | 7.62 | 94.49 |
| v6 | 4999 | 0.09 | 0.15 | 0.03 | 0.00 | 1.00 | 2.85 | 9.23 |
| w1 | 5462 | 0.28 | 0.45 | 0.00 | 0.00 | 1.00 | 0.96 | -1.08 |
| w2 | 5462 | 0.07 | 0.26 | 0.00 | 0.00 | 1.00 | 3.33 | 9.08 |
| x1 | 5570 | 4.37 | 19.32 | 0.00 | 0.00 | 832.00 | 18.71 | 647.73 |
| z4 | 5570 | 0.12 | 1.83 | 0.00 | -1.59 | 124.93 | 59.62 | 3933.21 |
data0_long <- data0 %>%
dplyr::select(where(is.numeric)) %>%
tidyr::pivot_longer(
cols = dplyr::everything(),
names_to = "variable",
values_to = "value"
)
p_hist <- ggplot2::ggplot(data0_long, ggplot2::aes(x = value)) +
ggplot2::geom_histogram(bins = 30) +
ggplot2::facet_wrap(~ variable, scales = "free") +
ggplot2::theme_minimal() +
ggplot2::labs(
x = NULL, y = "Frequency",
title = "Distribution of indicators (histograms)"
)
p_hist
ggplot2::ggsave(
file.path(dir_desc, "histograms.png"),
plot = p_hist, width = 12, height = 10,
dpi = 600
)
p_box <- ggplot(data0_long, aes(x = value, y = variable)) +
geom_boxplot(outlier.alpha = 0.3) +
facet_wrap(~ variable, ncol = 1, scales = "free") +
theme_minimal() +
labs(
x = NULL,
y = NULL,
title = "Distribution of indicators (boxplot)"
) +
theme(
strip.text = element_text(size = 9),
plot.title = element_text(hjust = 0.5, face = "bold")
)
p_box
ggsave(
file.path(dir_desc, "boxplots.png"),
plot = p_box,
width = 12, height = 45,
dpi = 600
)
ggsave(
file.path(dir_desc, "missing_by_variable.png"),
plot = (naniar::gg_miss_var(data0) +
theme_minimal() +
labs(
x = "Indicator",
y = "# Missing",
title = "Missingness by indicator")),
width = 6, height = 6,
dpi = 600
)
ggsave(
file.path(dir_desc, "missing_pattern.png"),
plot = (
naniar::vis_miss(data0) +
theme_minimal() +
coord_flip() +
scale_y_discrete(position = "left") +
labs(
x = "Indicator",
y = "Observation",
title = "Missingness pattern"
) +
theme(
legend.position = "bottom",
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10)
)
),
width = 8, height = 10,
dpi = 600
)
knitr::include_graphics(c(
file.path(DIR_OUT, "01_descriptive", "missing_by_variable.png"),
file.path(DIR_OUT, "01_descriptive", "missing_pattern.png")
))
dir_corr <- file.path(DIR_OUT, "02-correlation")
dir.create(dir_corr, recursive = TRUE, showWarnings = FALSE)
data_num <- data0 %>%
as.data.frame() %>%
dplyr::select(dplyr::where(is.numeric))
if (ncol(data_num) < 2) {
stop("No numeric columns for correlation.")
}
base_use <- stats::na.omit(data_num)
if (nrow(base_use) < 2) stop("Too few complete cases for correlation.")
rc <- Hmisc::rcorr(as.matrix(base_use), type = "spearman")
R <- rc$r
P <- rc$P
N <- rc$n
R_df <- as.data.frame(R) %>% tibble::rownames_to_column("variable")
P_df <- as.data.frame(P) %>% tibble::rownames_to_column("variable")
N_df <- as.data.frame(N) %>% tibble::rownames_to_column("variable")
thr <- CFG$corr_thr
nm <- colnames(R)
sel <- which(abs(R) > thr & upper.tri(R), arr.ind = TRUE)
strong_pairs <- if (nrow(sel)) {
tibble::tibble(
v1 = nm[sel[, 1]],
v2 = nm[sel[, 2]],
r = as.numeric(R[sel]),
n = as.integer(N[cbind(sel[, 1], sel[, 2])]),
p = as.numeric(P[cbind(sel[, 1], sel[, 2])])
) %>%
dplyr::arrange(dplyr::desc(abs(r)))
} else {
tibble::tibble(
v1 = character(0),
v2 = character(0),
r = numeric(0),
n = integer(0),
p = numeric(0)
)
}
if (nrow(strong_pairs) > 0) {
strong_pairs_print <- strong_pairs %>%
dplyr::mutate(
r = round(r, 2),
p = signif(p, 3)
) %>%
dplyr::rename(
`Variable 1` = v1,
`Variable 2` = v2,
`Spearman r` = r,
`Complete cases (N)`= n,
`p-value` = p
)
kbl(
strong_pairs_print,
caption = "Pairs of indicators with strong Spearman correlations (listwise deletion)",
align = "lccccc",
escape = FALSE
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center",
font_size = 11
)
} else {
cat("*No pairs of indicators exceeded the predefined correlation threshold.*\n\n")
}
| Variable 1 | Variable 2 | Spearman r | Complete cases (N) | p-value |
|---|---|---|---|---|
| ig51 | ig53 | 0.87 | 4062 | 0 |
| c2 | c3 | 0.86 | 4062 | 0 |
| f2 | h2 | 0.83 | 4062 | 0 |
| ii11 | n3 | 0.82 | 4062 | 0 |
| n1 | n2 | -0.82 | 4062 | 0 |
| ig51 | ig52 | 0.81 | 4062 | 0 |
| ig52 | ig53 | 0.81 | 4062 | 0 |
| f1 | h1 | 0.79 | 4062 | 0 |
| b3 | im1 | -0.72 | 4062 | 0 |
R_long <- R_df %>%
tidyr::pivot_longer(-variable, names_to = "var2", values_to = "r") %>%
dplyr::rename(var1 = variable)
P_long <- P_df %>%
tidyr::pivot_longer(-variable, names_to = "var2", values_to = "p") %>%
dplyr::rename(var1 = variable)
var_levels <- R_df$variable
corr_long <- R_long %>%
dplyr::left_join(P_long, by = c("var1", "var2")) %>%
dplyr::mutate(
i = match(var1, var_levels),
j = match(var2, var_levels),
tri = dplyr::case_when(
i < j ~ "upper",
i > j ~ "lower",
TRUE ~ "diag"
),
sig_cat = dplyr::case_when(
p < 0.001 ~ "p < 0.001",
p < 0.01 ~ "p < 0.01",
p < 0.05 ~ "p < 0.05",
TRUE ~ "ns"
)
)
upper_df <- corr_long %>%
dplyr::filter(tri == "upper")
lower_df <- corr_long %>%
dplyr::filter(tri == "lower", sig_cat != "ns")
ggsave(
file.path(dir_corr, "correlation_heatmap.png"),
plot = (ggplot() +
geom_tile(
data = upper_df,
aes(x = var2, y = var1, fill = r)
) +
scale_fill_gradient2(
limits = c(-1, 1),
name = "Spearman r"
) +
geom_point(
data = lower_df,
aes(x = var2, y = var1,
shape = sig_cat
),
alpha = 0.9,
stroke = 0,
size = 3
) +
scale_shape_manual(
name = "Significance",
values = c(
"p < 0.05" = 16,
"p < 0.01" = 17,
"p < 0.001" = 15
)
) +
coord_fixed() +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.5,
size = 9),
axis.text.y = element_text(size = 9),
legend.box = "vertical",
legend.title = element_text(size = 9),
legend.text = element_text(size = 9),
plot.title = element_text(hjust = 0.5, face = "bold")
) +
labs(
title = "Spearman correlation: magnitude and significance",
x = NULL, y = NULL
)),
width = 10,
height = 10,
dpi = 600
)
knitr::include_graphics(
file.path(dir_corr, "correlation_heatmap.png")
)
write_xlsx_6(
file.path(dir_corr, "correlations_listwise.xlsx"),
list(
R = R_df,
P = P_df,
N = N_df,
strong_pairs = strong_pairs
)
)
# Drop c2 and n3, which are highly collinear with c3 and ii11, respectively
# (see the strong Spearman correlation pairs above)
data1 <- data0 %>%
  dplyr::select(-c(c2, n3))
dir_mcar <- file.path(DIR_OUT, "03_mcar_little")
dir.create(dir_mcar, recursive = TRUE, showWarnings = FALSE)
X <- data1 %>%
dplyr::select(dplyr::where(is.numeric)) %>%
as.data.frame()
thr_colinear <- 0.999
max_drops <- 30L
dropped <- character(0)
attempt <- 0L
repeat {
attempt <- attempt + 1L
res_try <- try(suppressWarnings(naniar::mcar_test(X)), silent = TRUE)
if (!inherits(res_try, "try-error")) {
break
}
msg <- paste(as.character(res_try), collapse = " ")
if (grepl("singular", msg, ignore.case = TRUE) ||
grepl("system is computationally singular", msg, ignore.case = TRUE)) {
cc <- stats::complete.cases(X)
if (sum(cc) < 3L) {
stop("MCAR test: too few complete cases after dropping variables: ",
paste(dropped, collapse = ", "))
}
R <- try(stats::cor(X[cc, , drop = FALSE]), silent = TRUE)
if (inherits(R, "try-error") || !is.matrix(R) || ncol(R) < 2L) {
stop("MCAR test: could not compute correlation matrix for pruning.")
}
idx <- which(abs(R) > thr_colinear & upper.tri(R), arr.ind = TRUE)
sds_all <- vapply(X, function(z) stats::sd(z, na.rm = TRUE), numeric(1))
sds_all[!is.finite(sds_all)] <- Inf
if (nrow(idx) > 0L) {
vnames <- colnames(R)
pairs <- apply(idx, 1, function(rc) c(vnames[rc[1]], vnames[rc[2]]))
if (is.matrix(pairs)) pairs <- split(pairs, col(pairs))
victim <- NULL
for (pair in pairs) {
a <- pair[1]
b <- pair[2]
sa <- sds_all[[a]]
sb <- sds_all[[b]]
if (!is.finite(sa)) sa <- Inf
if (!is.finite(sb)) sb <- Inf
victim <- if (sa <= sb) a else b
break
}
} else {
victim <- names(sds_all)[which.min(sds_all)]
}
dropped <- unique(c(dropped, victim))
X <- X[, setdiff(colnames(X), victim), drop = FALSE]
if (ncol(X) < 2L) {
stop("MCAR test: <2 variables remaining after pruning: ",
paste(dropped, collapse = ", "))
}
if (attempt >= max_drops) {
stop("MCAR test: reached max_drops (", max_drops,
"). Dropped: ", paste(dropped, collapse = ", "))
}
next
} else {
stop("MCAR test failed: ", msg)
}
}
res_mcar <- res_try
captura <- utils::capture.output(print(res_mcar))
writeLines(captura,
con = file.path(dir_mcar, "little_mcar_resultado.txt"))
stat <- NA_real_
dfe <- NA_real_
pval <- NA_real_
if (inherits(res_mcar, "htest")) {
stat <- tryCatch(unname(res_mcar$statistic[[1]]),
error = function(e) NA_real_)
dfe <- tryCatch({
if (!is.null(res_mcar$parameter)) {
as.numeric(res_mcar$parameter[[1]])
} else NA_real_
}, error = function(e) NA_real_)
pval <- tryCatch(unname(res_mcar$p.value),
error = function(e) NA_real_)
} else if (is.list(res_mcar)) {
stat <- res_mcar$statistic %||% res_mcar$chi.square %||% NA_real_
dfe <- res_mcar$df %||% res_mcar$parameter %||% NA_real_
pval <- res_mcar$p.value %||% NA_real_
}
resumo_mcar <- data.frame(
`Missing Values (%)` = 100 * mean(is.na(as.matrix(data0))),
`Number of Variables (Final)` = ncol(X),
`Number of Variables Removed` = length(dropped),
`Little MCAR Chi-square` = stat,
`Degrees of Freedom` = dfe,
`p-value` = pval,
check.names = FALSE
)
utils::write.csv(
resumo_mcar,
file = file.path(dir_mcar, "little_mcar_resumo.csv"),
row.names = FALSE
)
if (length(dropped)) {
utils::write.csv(
data.frame(Variable_Removed = dropped),
file = file.path(dir_mcar, "little_mcar_vars_removidas.csv"),
row.names = FALSE
)
}
resumo_mcar_print <- resumo_mcar %>%
t() %>%
as.data.frame() %>%
tibble::rownames_to_column("Statistic") %>%
dplyr::rename(Value = 2)
kbl(
resumo_mcar_print,
caption = "Little's MCAR test",
digits = 2,
col.names = c("Statistic", "Value"),
align = "lc"
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center",
font_size = 12
)
| Statistic | Value |
|---|---|
| Missing Values (%) | 0.84 |
| Number of Variables (Final) | 43.00 |
| Number of Variables Removed | 2.00 |
| Little MCAR Chi-square | 6086.73 |
| Degrees of Freedom | 3364.00 |
| p-value | 0.00 |
This section performs multiple imputation using MICE (m = 5, PMM) to address missing values prior to PCA. We document missingness before and after imputation, adjust the predictor matrix by removing highly collinear variables, generate the completed dataset, and finally standardize all indicators via z-scores for use in parallel analysis and PCA.
dir_mice <- file.path(DIR_OUT, "04_mice")
dir.create(dir_mice, recursive = TRUE, showWarnings = FALSE)
data2 <- as.data.frame(data1)
na_before <- vapply(data2, function(v) sum(is.na(v)), integer(1))
write.csv(
data.frame(variable = names(na_before), n_na_before = as.integer(na_before)),
file.path(dir_mice, "mice_na_before.csv"),
row.names = FALSE
)
pred <- mice::quickpred(
data2,
mincor = 0.1,
minpuc = 0.25
)
num_vars <- vapply(data2, is.numeric, logical(1))
if (sum(num_vars) > 1L) {
cor_num <- suppressWarnings(
stats::cor(
data2[, num_vars, drop = FALSE],
use = "pairwise.complete.obs",
method = "spearman"
)
)
cor_num[lower.tri(cor_num, diag = TRUE)] <- 0
high_pairs <- which(abs(cor_num) > 0.99, arr.ind = TRUE)
if (nrow(high_pairs) > 0L) {
high_corr_vars <- unique(colnames(cor_num)[high_pairs[, "col"]])
write.csv(
data.frame(variable = high_corr_vars),
file.path(dir_mice, "mice_removed_highcorr_as_predictors.csv"),
row.names = FALSE
)
cols_to_zero <- intersect(colnames(pred), high_corr_vars)
if (length(cols_to_zero) > 0L) {
pred[, cols_to_zero] <- 0L
}
}
}
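An optional inspection step (illustrative, not part of the audited outputs) can help verify the predictor structure: mice::quickpred() returns a 0/1 matrix in which pred[i, j] = 1 indicates that variable j is used as a predictor when imputing variable i.
# Optional inspection of the predictor matrix (illustrative)
dim(pred)               # variables x variables
summary(rowSums(pred))  # number of predictors available per target variable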
mids <- mice::mice(
data2,
m = CFG$mice_m,
maxit = CFG$mice_maxit,
method = CFG$mice_method,
seed = CFG$SEED,
predictorMatrix = pred,
ridge = 1e-03,
printFlag = FALSE
)
data2_imp <- mice::complete(mids)
na_after <- vapply(data2_imp, function(v) sum(is.na(v)), integer(1))
write.csv(
data.frame(variable = names(na_after), n_na_after = as.integer(na_after)),
file.path(dir_mice, "mice_na_after.csv"),
row.names = FALSE
)
data2_norm <- as.data.frame(scale(data2_imp))
write.csv(
data2_norm,
file.path(dir_mice, "mice_data2_norm.csv"),
row.names = FALSE
)
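An optional sanity check (illustrative) confirms that imputation left no missing values and that the z-scored data have near-zero means and unit standard deviations before entering the parallel analysis and PCA.
# Optional post-imputation / standardization checks (illustrative)
stopifnot(!anyNA(data2_norm))
summary(colMeans(data2_norm))      # all column means should be ~0
summary(apply(data2_norm, 2, sd))  # all column SDs should be ~1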
pred_summary <- data.frame(
variable = rownames(pred),
n_predictors = rowSums(pred),
has_na = vapply(data2, function(v) any(is.na(v)), logical(1))
) %>%
mutate(
missing_pattern = ifelse(has_na, "Yes", "No"),
imputation_status = dplyr::case_when(
has_na & n_predictors > 0 ~ "Imputed (at least one predictor)",
!has_na & n_predictors == 0 ~ "Fully observed (no imputation required)",
TRUE ~ "Inconsistent pattern (requires inspection)"
)
)
write.csv(
pred_summary,
file.path(dir_mice, "mice_predictor_summary.csv"),
row.names = FALSE
)
pred_summary %>%
dplyr::select(
`Number of predictors` = n_predictors,
`Any missing values` = missing_pattern,
`Imputation status` = imputation_status
) %>%
dplyr::arrange(desc(`Any missing values`), desc(`Number of predictors`)) %>%
kbl(
caption = "Predictor structure used in the MICE imputation",
booktabs = TRUE,
align = "ccc",
col.names = c("Number of predictors", "Any missing values", "Imputation status"),
escape = TRUE
) %>%
kable_styling(
bootstrap_options = c("striped", "condensed"),
full_width = FALSE,
font_size = 11
)
| Variable | Number of predictors | Any missing values | Imputation status |
|---|---|---|---|
| b3 | 32 | Yes | Imputed (at least one predictor) |
| ii10 | 31 | Yes | Imputed (at least one predictor) |
| pa3 | 30 | Yes | Imputed (at least one predictor) |
| n1 | 29 | Yes | Imputed (at least one predictor) |
| r2 | 28 | Yes | Imputed (at least one predictor) |
| ii6 | 24 | Yes | Imputed (at least one predictor) |
| n2 | 23 | Yes | Imputed (at least one predictor) |
| o6 | 23 | Yes | Imputed (at least one predictor) |
| ii3 | 22 | Yes | Imputed (at least one predictor) |
| v6 | 22 | Yes | Imputed (at least one predictor) |
| ii5 | 20 | Yes | Imputed (at least one predictor) |
| d1 | 17 | Yes | Imputed (at least one predictor) |
| v3 | 15 | Yes | Imputed (at least one predictor) |
| ig8 | 14 | Yes | Imputed (at least one predictor) |
| ig52 | 13 | Yes | Imputed (at least one predictor) |
| pa1 | 12 | Yes | Imputed (at least one predictor) |
| ig51 | 11 | Yes | Imputed (at least one predictor) |
| ii1 | 11 | Yes | Imputed (at least one predictor) |
| b4 | 10 | Yes | Imputed (at least one predictor) |
| ig53 | 9 | Yes | Imputed (at least one predictor) |
| v4 | 6 | Yes | Imputed (at least one predictor) |
| w1 | 5 | Yes | Imputed (at least one predictor) |
| w2 | 4 | Yes | Imputed (at least one predictor) |
| b5 | 3 | Yes | Imputed (at least one predictor) |
| c3 | 0 | No | Fully observed (no imputation required) |
| f1 | 0 | No | Fully observed (no imputation required) |
| f2 | 0 | No | Fully observed (no imputation required) |
| h1 | 0 | No | Fully observed (no imputation required) |
| h2 | 0 | No | Fully observed (no imputation required) |
| i1 | 0 | No | Fully observed (no imputation required) |
| ias2 | 0 | No | Fully observed (no imputation required) |
| ias4 | 0 | No | Fully observed (no imputation required) |
| ias6 | 0 | No | Fully observed (no imputation required) |
| ias7 | 0 | No | Fully observed (no imputation required) |
| ias8 | 0 | No | Fully observed (no imputation required) |
| ias9 | 0 | No | Fully observed (no imputation required) |
| ig4 | 0 | No | Fully observed (no imputation required) |
| ii11 | 0 | No | Fully observed (no imputation required) |
| im1 | 0 | No | Fully observed (no imputation required) |
| p1 | 0 | No | Fully observed (no imputation required) |
| q1 | 0 | No | Fully observed (no imputation required) |
| s1 | 0 | No | Fully observed (no imputation required) |
| s2 | 0 | No | Fully observed (no imputation required) |
| x1 | 0 | No | Fully observed (no imputation required) |
| z4 | 0 | No | Fully observed (no imputation required) |
This section determines the number of components to retain using parallel analysis as implemented in psych::fa.parallel, applied to the MICE-imputed, z-scored dataset. Spearman correlations with listwise deletion are used to obtain the empirical eigenvalues, which are compared with random-data eigenvalues over 6,000 Monte Carlo replications. The retained dimensionality is written to file and stored for use in the PCA cycles.
dir_ap <- file.path(DIR_OUT, "05_parallel")
dir.create(dir_ap, recursive = TRUE, showWarnings = FALSE)
data2_ap <- data2_norm %>%
dplyr::select(dplyr::where(is.numeric)) %>%
as.data.frame()
set.seed(CFG$SEED)
fp <- psych::fa.parallel(
data2_ap,
fa = "pc",
n.iter = CFG$ap_iter,
show.legend = FALSE,
error.bars = FALSE,
plot = FALSE
)
## Parallel analysis suggests that the number of factors = NA and the number of components = 11
if (!is.null(fp$ncomp) && is.finite(fp$ncomp)) {
ncomp <- as.integer(fp$ncomp)
} else {
stop("Parallel analysis (fa.parallel) could not determine number of components.")
}
write_xlsx_6(
file.path(dir_ap, "parallel_result.xlsx"),
list(
parallel = data.frame(
tool = "fa.parallel",
iterations = CFG$ap_iter,
n_components = ncomp
)
)
)
CFG$n_components <- ncomp
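As an optional, independent cross-check (not part of the audited pipeline), the paran package loaded in the setup section implements Horn's parallel analysis with bias-adjusted eigenvalues; a minimal sketch using the same tuning parameters might look as follows (note that 6,000 iterations on the full dataset can be slow).
# Optional cross-check with paran (illustrative sketch; results not reported here)
set.seed(CFG$SEED)
paran::paran(
  data2_ap,
  iterations = CFG$ap_iter,
  centile = CFG$ap_centile,
  graph = FALSE
)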
This section implements the full PCA workflow based on the number of components identified in the parallel analysis. We introduce helper functions for listwise PCA with ProMax rotation, apply explicit flagging rules for low loadings, cross-loadings, and weak components, and record each PCA cycle (including loadings, diagnostics, and items removed) until a stable solution is reached. The final rotated pattern matrix is then reported, followed by internal-consistency estimates for each retained component.
This subsection introduces the helper function that performs listwise PCA followed by ProMax oblique rotation. It removes constant-variance indicators, estimates the unrotated PCA solution, applies ProMax rotation, and returns key diagnostics including KMO, Bartlett’s test, pattern and structure loadings, component correlations, communalities, and the listwise sample size.
# Null-coalescing helper (also available in base R >= 4.4; defined here for portability)
`%||%` <- function(x, y) if (is.null(x)) y else x
compute_pca_listwise <- function(data_z, n_components) {
stopifnot(is.data.frame(data_z))
stopifnot(length(n_components) == 1L, is.finite(n_components), n_components >= 1L)
base_num <- data_z %>%
dplyr::select(dplyr::where(is.numeric)) %>%
as.data.frame()
sds <- apply(base_num, 2, sd, na.rm = TRUE)
const_vars <- names(sds)[!is.finite(sds) | sds == 0]
if (length(const_vars)) {
base_num <- base_num[, setdiff(names(base_num), const_vars), drop = FALSE]
}
if (ncol(base_num) < 2L) {
stop("compute_pca_listwise: fewer than 2 usable variables after removing constants.")
}
base_cc <- stats::na.omit(base_num)
if (nrow(base_cc) < 5L) {
stop("compute_pca_listwise: too few complete cases for PCA.")
}
pca_raw <- psych::principal(
r = base_cc,
nfactors = n_components,
rotate = "none",
covar = FALSE,
scores = FALSE,
missing = FALSE,
oblique.scores = TRUE,
n.obs = nrow(base_cc)
)
L_unrot <- as.matrix(unclass(pca_raw$loadings))
rot <- promax_rotate(L_unrot, m = CFG$PROMAX_M)
pca_raw$loadings <- rot$L
pca_raw$Phi <- rot$Phi
R <- stats::cor(base_cc, use = "everything")
KMO <- try(psych::KMO(R), silent = TRUE)
if (inherits(KMO, "try-error")) KMO <- list(MSA = NA_real_)
Bart <- try(psych::cortest.bartlett(R, n = nrow(base_cc)), silent = TRUE)
if (inherits(Bart, "try-error")) Bart <- list(p.value = NA_real_)
loadings_pattern <- as.matrix(unclass(pca_raw$loadings))
Phi <- if (!is.null(pca_raw$Phi)) as.matrix(pca_raw$Phi) else diag(ncol(loadings_pattern))
loadings_structure <- loadings_pattern %*% Phi
h2 <- as.numeric(pca_raw$communality)
names(h2) <- rownames(loadings_pattern)
list(
pca = pca_raw,
R = R,
KMO = KMO,
Bartlett = Bart,
n_listwise = nrow(base_cc),
loadings_pattern = loadings_pattern,
loadings_structure= loadings_structure,
phi = Phi,
h2 = h2,
const_vars = const_vars
)
}
This subsection defines the rules used to flag items during the PCA cycles. For each indicator, the function identifies the primary and secondary loadings, checks whether the primary loading meets the minimum threshold, detects cross-loadings based on the loading difference and relevance criteria, and identifies weak components with fewer than the required number of primary items. These flags guide item removal and component refinement in subsequent PCA cycles.
compute_flags <- function(loadings_mat,
crit_carga_min = CFG$carga_min,
crit_cross_delta = CFG$cross_delta,
min_items_comp = CFG$min_items_comp) {
abs_mat <- abs(loadings_mat)
top2 <- t(apply(
abs_mat, 1,
function(v) {
vv <- sort(v, decreasing = TRUE)
c(vv[1], ifelse(length(vv) >= 2L, vv[2], 0))
}
))
colnames(top2) <- c("load1", "load2")
primary <- apply(abs_mat, 1, which.max)
secondary <- apply(
abs_mat, 1,
function(v) {
o <- order(v, decreasing = TRUE)
if (length(o) >= 2L) o[2] else NA_integer_
}
)
load1 <- top2[, 1]
load2 <- top2[, 2]
delta <- load1 - load2
n_comp_relev <- apply(abs_mat, 1, function(v) sum(v >= crit_carga_min))
flag_relevance <- load1 < crit_carga_min
flag_cross <- (n_comp_relev >= 2L) &
(delta <= crit_cross_delta) &
(load1 >= crit_carga_min) &
(load2 >= crit_carga_min)
prim_ok <- which(load1 >= crit_carga_min)
contagem_prim <- table(factor(primary[prim_ok], levels = seq_len(ncol(abs_mat))))
comp_weak_idx <- as.integer(names(contagem_prim)[contagem_prim < min_items_comp])
if (!length(comp_weak_idx)) comp_weak_idx <- integer(0)
flags <- data.frame(
variable = rownames(loadings_mat),
comp_primary = primary,
loading_primary = round(load1, 4),
comp_secondary = secondary,
loading_secondary = round(load2, 4),
delta_top2 = round(delta, 4),
n_components_relev = n_comp_relev,
flag_relevance = flag_relevance,
flag_crossloading = flag_cross,
stringsAsFactors = FALSE
)
list(
flags = flags,
comp_weak_idx = comp_weak_idx,
count_primary = contagem_prim
)
}
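To make the flagging rules concrete, the following minimal sketch applies compute_flags() to a hypothetical 3-item, 2-component pattern matrix; the values are illustrative only.
# Toy pattern matrix (hypothetical values) to illustrate the flagging rules
L_demo <- matrix(
  c(0.55, 0.10,
    0.34, 0.31,
    0.15, 0.20),
  ncol = 2, byrow = TRUE,
  dimnames = list(c("item_a", "item_b", "item_c"), c("RC1", "RC2"))
)
demo <- compute_flags(L_demo,
                      crit_carga_min = 0.30,
                      crit_cross_delta = 0.10,
                      min_items_comp = 3)
demo$flags          # item_b is flagged as a cross-loading; item_c falls below the 0.30 threshold
demo$comp_weak_idx  # both components have fewer than 3 primary items in this toy example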
This subsection provides utilities to record each PCA cycle. The functions save diagnostics (correlations, KMO, Bartlett), rotated loadings, component correlations, explained variance, and all item-level flags, while also generating a summary row that tracks items removed, weak components, and cumulative variance across cycles.
write_pca_cycle <- function(res,
cycle_id,
dir_cycle,
crit_carga_min = CFG$carga_min,
crit_cross_delta = CFG$cross_delta,
min_items_comp = CFG$min_items_comp,
notes = NULL) {
dir.create(dir_cycle, recursive = TRUE, showWarnings = FALSE)
log_path <- file.path(dir_cycle, "pca_cycle_log.md")
writeLines(
c(
"# PCA cycle log",
paste0("- Cycle ID: ", cycle_id),
paste0("- Date: ", format(Sys.time(), "%Y-%m-%d %H:%M:%S")),
paste0("- Rotation: ProMax (m = ", CFG$PROMAX_M, ")"),
paste0("- n (listwise): ", res$n_listwise),
paste0("- Criteria: |loading| ≥ ", crit_carga_min,
"; Δ < ", crit_cross_delta,
"; min_items_comp = ", min_items_comp)
),
con = log_path
)
if (length(res$const_vars)) {
cat(
"\n- Variables with near-zero variance: ",
paste(res$const_vars, collapse = ", "),
file = log_path, append = TRUE
)
}
if (!is.null(notes) && length(notes)) {
for (ln in notes) {
cat(paste0("\n- ", ln), file = log_path, append = TRUE)
}
}
utils::write.csv(
res$R,
file = file.path(dir_cycle, "correlation_matrix_cycle.csv"),
row.names = TRUE
)
sink(file.path(dir_cycle, "kmo_cycle.txt")); print(res$KMO); sink()
sink(file.path(dir_cycle, "bartlett_cycle.txt")); print(res$Bartlett); sink()
utils::write.csv(
round(res$loadings_pattern, 6),
file = file.path(dir_cycle, "loadings_pattern.csv"),
row.names = TRUE
)
utils::write.csv(
round(res$loadings_structure, 6),
file = file.path(dir_cycle, "loadings_structure.csv"),
row.names = TRUE
)
utils::write.csv(
round(res$phi, 6),
file = file.path(dir_cycle, "phi_components_correlations.csv"),
row.names = TRUE
)
ev <- as.numeric(res$pca$values)
pv <- ev / sum(ev, na.rm = TRUE)
explained_var <- data.frame(
Component = seq_along(ev),
Eigenvalue = round(ev, 6),
Proportion_Var = round(pv, 6),
Cumulative_Var = round(cumsum(pv), 6)
)
fl <- compute_flags(
res$loadings_pattern,
crit_carga_min = crit_carga_min,
crit_cross_delta = crit_cross_delta,
min_items_comp = min_items_comp
)
flags <- fl$flags
comp_weak_idx <- fl$comp_weak_idx
count_primary <- fl$count_primary
comp_summary <- data.frame(
component = seq_len(ncol(res$loadings_pattern)),
n_primary_items = as.integer(count_primary[seq_len(ncol(res$loadings_pattern))]),
weak_component = seq_len(ncol(res$loadings_pattern)) %in% comp_weak_idx
)
write_xlsx_6(
file.path(dir_cycle, "pca_cycle_tables.xlsx"),
list(
correlation_matrix = as.data.frame(res$R) %>%
tibble::rownames_to_column("variable"),
loadings_pattern = as.data.frame(round(res$loadings_pattern, 6)),
loadings_structure = as.data.frame(round(res$loadings_structure, 6)),
phi = as.data.frame(round(res$phi, 6)),
explained_variance = explained_var,
review_panel = flags,
components_summary = comp_summary
)
)
if (any(flags$flag_relevance)) {
cat(
"\n- Items below loading threshold: ",
paste(flags$variable[flags$flag_relevance], collapse = ", "),
file = log_path, append = TRUE
)
}
if (any(flags$flag_crossloading)) {
cat(
"\n- Items with cross-loadings: ",
paste(flags$variable[flags$flag_crossloading], collapse = ", "),
file = log_path, append = TRUE
)
}
if (length(comp_weak_idx)) {
cat(
"\n- Weak components (fewer than ",
min_items_comp, " primary items): ",
paste(comp_weak_idx, collapse = ", "),
file = log_path, append = TRUE
)
}
list(
flags = flags,
comp_weak_idx = comp_weak_idx,
explained_var = explained_var
)
}
build_summary_row <- function(res, cycle_id, n_components, dropped_vars, weak_components) {
ev <- as.numeric(res$pca$values)
pv <- ev / sum(ev, na.rm = TRUE)
var_cum <- sum(pv[seq_len(min(n_components, length(ev)))], na.rm = TRUE) * 100
data.frame(
cycle_id = cycle_id,
n_components = n_components,
n_listwise = res$n_listwise,
kmo_overall = round(suppressWarnings(unname(res$KMO$MSA)), 3),
bartlett_p = signif(suppressWarnings(unname(res$Bartlett$p.value)), 3),
cum_variance_pct = round(var_cum, 2),
n_items_removed = length(dropped_vars),
items_removed = if (length(dropped_vars)) paste(dropped_vars, collapse = "; ") else "",
n_weak_components= length(weak_components),
weak_components = if (length(weak_components)) paste(weak_components, collapse = "; ") else "",
stringsAsFactors = FALSE
)
}
This subsection implements the iterative PCA procedure. Starting from the number of components suggested by the parallel analysis, the algorithm repeatedly runs listwise PCA, removes items flagged for low loadings or cross-loadings, adjusts for weak components, and updates the solution until no further items are dropped and the component structure stabilizes.
pca_cycles_until_stable <- function(data_z,
n_initial,
base_label = "MICE_z_listwise",
max_cycles = 10L,
dir_base = file.path(DIR_OUT, "06_pca_cycles"),
crit_carga_min = CFG$carga_min,
crit_cross_delta = CFG$cross_delta,
min_items_comp = CFG$min_items_comp,
excluded_initial = character(0)) {
dir_branch <- file.path(dir_base, base_label)
dir.create(dir_branch, recursive = TRUE, showWarnings = FALSE)
excluded <- unique(excluded_initial)
results <- list()
summary_rows <- list()
ncomp <- as.integer(n_initial)
if (!is.finite(ncomp) || ncomp < 1L) {
stop("pca_cycles_until_stable: invalid initial number of components.")
}
prev_ncomp <- NA_integer_
for (k in seq_len(max_cycles)) {
cycle_id <- sprintf("%s_c%02d_%dcomp", base_label, k, ncomp)
data_cycle <- data_z[, setdiff(colnames(data_z), excluded), drop = FALSE]
res_core <- compute_pca_listwise(
data_z = data_cycle,
n_components = ncomp
)
w <- write_pca_cycle(
res = res_core,
cycle_id = cycle_id,
dir_cycle = file.path(dir_branch, cycle_id),
crit_carga_min = crit_carga_min,
crit_cross_delta = crit_cross_delta,
min_items_comp = min_items_comp,
notes = "MICE + z-score"
)
res <- list(
pca = res_core$pca,
loadings_pattern = res_core$loadings_pattern,
loadings_structure= res_core$loadings_structure,
phi = res_core$phi,
communalities = res_core$h2,
n_listwise = res_core$n_listwise,
KMO = res_core$KMO,
Bartlett = res_core$Bartlett,
flags = w$flags,
weak_components = w$comp_weak_idx
)
results[[cycle_id]] <- res
flg <- res$flags
drop_rel <- flg$variable[flg$flag_relevance]
drop_cross <- flg$variable[flg$flag_crossloading]
drop_vars <- unique(c(drop_rel, drop_cross))
weak_comps <- res$weak_components
summary_rows[[cycle_id]] <- build_summary_row(
res = res_core,
cycle_id = cycle_id,
n_components = ncomp,
dropped_vars = drop_vars,
weak_components= weak_comps
)
if (length(drop_vars) == 0L &&
length(weak_comps) == 0L &&
(is.na(prev_ncomp) || ncomp == prev_ncomp)) {
summary_df <- dplyr::bind_rows(summary_rows)
writexl::write_xlsx(
list(final_summary = summary_df),
path = file.path(dir_branch, "pca_cycles_summary.xlsx")
)
return(list(
stable = TRUE,
cycles = results,
excluded = excluded,
solution = res,
summary = summary_df,
dir_branch = dir_branch
))
}
excluded <- unique(c(excluded, drop_vars))
prev_ncomp <- ncomp
ncomp <- max(1L, ncomp - length(weak_comps))
}
summary_df <- if (length(summary_rows)) dplyr::bind_rows(summary_rows) else data.frame()
list(
stable = FALSE,
cycles = results,
excluded = excluded,
solution = if (length(results)) tail(results, 1)[[1]] else NULL,
summary = summary_df,
dir_branch = dir_branch
)
}
This subsection extracts the retained items for each rotated component and computes their reliability. It identifies the items loading above the cutoff, groups them by primary component, and then estimates Cronbach’s alpha and McDonald’s omega (total and hierarchical), saving all item-level and component-level diagnostics for inspection.
items_per_component <- function(flags_df, cutoff = CFG$carga_min) {
stopifnot(all(c("variable", "comp_primary", "loading_primary") %in% names(flags_df)))
keep <- flags_df$loading_primary >= cutoff
if (!any(keep)) return(list())
tmp <- flags_df[keep, c("variable", "comp_primary", "loading_primary")]
df_list <- split(tmp, tmp$comp_primary)
lst <- lapply(df_list, function(d) {
d$variable[order(-abs(d$loading_primary), d$variable)]
})
idx <- as.integer(names(lst))
ord <- order(idx)
lst <- lst[ord]
names(lst) <- paste0("RC", idx[ord])
lst
}
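A minimal sketch (hypothetical flags table, for illustration only) shows how items_per_component() groups retained items by their primary rotated component and orders them by absolute primary loading.
# Hypothetical flags table to illustrate items_per_component()
flags_demo <- data.frame(
  variable = c("a1", "a2", "b1", "b2", "c1"),
  comp_primary = c(1, 1, 2, 2, 2),
  loading_primary = c(0.62, 0.45, 0.71, 0.38, 0.25)
)
items_per_component(flags_demo, cutoff = 0.30)
# Expected: RC1 = a1, a2; RC2 = b1, b2; c1 is excluded (loading below cutoff)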
component_reliability <- function(data_z,
groups,
name_solution = "MICE_z_listwise",
dir_base = file.path(DIR_OUT, "07_reliability"),
mode = c("listwise", "pairwise")) {
mode <- match.arg(mode)
stopifnot(is.list(groups), is.data.frame(data_z))
use_param <- if (mode == "pairwise") "pairwise.complete.obs" else "complete.obs"
dir_sol <- file.path(dir_base, name_solution)
dir.create(dir_sol, recursive = TRUE, showWarnings = FALSE)
summary_list <- list()
for (nm in names(groups)) {
items <- intersect(groups[[nm]], colnames(data_z))
if (length(items) < 2L) {
warning(sprintf("[%s] %s: fewer than 2 items; skipping.", name_solution, nm))
next
}
base_comp <- data_z[, items, drop = FALSE]
base_cc <- stats::na.omit(base_comp)
n_listwise <- nrow(base_cc)
if (n_listwise >= 2L) {
R <- stats::cor(base_cc, use = "everything")
utils::write.csv(
R,
file = file.path(dir_sol, paste0(nm, "_item_correlations_listwise.csv")),
row.names = TRUE
)
}
a <- try(
psych::alpha(
base_comp,
keys = NULL,
warnings = FALSE,
check.keys = TRUE,
na.rm = TRUE,
use = use_param
),
silent = TRUE
)
suppressMessages({
o <- try(
psych::omega(
base_cc,
nfactors = min(3L, length(items) - 1L),
plot = FALSE,
fm = "ml"
),
silent = TRUE
)
})
alpha_raw <- if (!inherits(a, "try-error")) unname(a$total$raw_alpha) else NA_real_
alpha_std <- if (!inherits(a, "try-error")) unname(a$total$std.alpha) else NA_real_
av_r <- if (!inherits(a, "try-error")) unname(a$total$average_r) else NA_real_
n_total <- nrow(base_comp)
omega_tot <- if (!inherits(o, "try-error") && !is.null(o$omega.tot)) unname(o$omega.tot) else NA_real_
omega_h <- if (!inherits(o, "try-error") && !is.null(o$omega_h)) unname(o$omega_h) else NA_real_
comp_dir <- file.path(dir_sol, nm)
dir.create(comp_dir, recursive = TRUE, showWarnings = FALSE)
sink(file.path(comp_dir, sprintf("%s_alpha_%s.txt", nm, mode)))
if (!inherits(a, "try-error")) print(a) else cat("alpha failed")
sink()
sink(file.path(comp_dir, sprintf("%s_omega_%s.txt", nm, mode)))
if (!inherits(o, "try-error")) print(o) else cat("omega failed")
sink()
if (!inherits(a, "try-error") && !is.null(a$item.stats)) {
it_tbl <- a$item.stats
it_tbl$item <- rownames(it_tbl)
rownames(it_tbl) <- NULL
utils::write.csv(
it_tbl,
file = file.path(comp_dir, sprintf("%s_alpha_item_stats_%s.csv", nm, mode)),
row.names = FALSE
)
}
if (!inherits(a, "try-error") && !is.null(a$alpha.drop)) {
ad <- a$alpha.drop
ad$item <- rownames(ad)
rownames(ad) <- NULL
utils::write.csv(
ad,
file = file.path(comp_dir, sprintf("%s_alpha_if_deleted_%s.csv", nm, mode)),
row.names = FALSE
)
}
summary_list[[nm]] <- data.frame(
solution_name = name_solution,
component = nm,
n_items = length(items),
n_total = n_total,
n_listwise = n_listwise,
alpha_raw = round(alpha_raw, 3),
alpha_std = round(alpha_std, 3),
mean_item_r = round(av_r, 3),
omega_total = round(omega_tot, 3),
omega_hier = round(omega_h, 3),
items = paste(items, collapse = ", "),
stringsAsFactors = FALSE
)
}
summary_df <- dplyr::bind_rows(summary_list)
if (nrow(summary_df)) {
utils::write.csv(
summary_df,
file = file.path(dir_sol, sprintf("reliability_summary_%s.csv", mode)),
row.names = FALSE
)
}
invisible(summary_df)
}
This subsection initiates the iterative PCA procedure by using the number of components identified in the parallel analysis as the starting point. The algorithm runs successive PCA cycles, removing flagged items and adjusting component structure, until a stable rotated solution is reached.
dir_pca <- file.path(DIR_OUT, "06_pca_cycles")
dir.create(dir_pca, recursive = TRUE, showWarnings = FALSE)
data_pca <- data2_norm %>%
dplyr::select(dplyr::where(is.numeric)) %>%
as.data.frame()
n_components_ini <- CFG$n_components
pca_auto <- pca_cycles_until_stable(
data_z = data_pca,
n_initial = n_components_ini,
base_label = "MICE_z_listwise",
max_cycles = 10L,
dir_base = dir_pca,
crit_carga_min = CFG$carga_min,
crit_cross_delta = CFG$cross_delta,
min_items_comp = CFG$min_items_comp,
excluded_initial = character(0)
)
pca_auto$stable
## [1] TRUE
This subsection summarizes all PCA cycles, documenting the results obtained under the predefined decision rules: the number of components from the parallel analysis, the ProMax-rotated listwise PCA, the loading cutoff (|loading| ≥ 0.30), the cross-loading criterion (top-two loading gap Δ ≤ 0.10 with both loadings above the cutoff), and the minimum requirement of three primary items per component. The table reports, for each cycle, the items removed, weak components, key diagnostics (KMO and Bartlett), and the cumulative variance explained, until the final stable solution is reached.
pca_summary <- pca_auto$summary
pca_summary_print <- pca_summary %>%
dplyr::rename(
`Cycle` = cycle_id,
`Components` = n_components,
`n (listwise)` = n_listwise,
`KMO` = kmo_overall,
`Bartlett p` = bartlett_p,
`Cumulative variance (%)` = cum_variance_pct,
`Items removed (n)` = n_items_removed,
`Items removed` = items_removed,
`Weak components (n)` = n_weak_components,
`Weak components` = weak_components
)
kbl(
pca_summary_print,
caption = "Summary of PCA cycles",
booktabs = TRUE,
align = "lccccccccc",
escape = TRUE
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
font_size = 11,
position = "center"
)
| Cycle | Components | n (listwise) | KMO | Bartlett p | Cumulative variance (%) | Items removed (n) | Items removed | Weak components (n) | Weak components |
|---|---|---|---|---|---|---|---|---|---|
| MICE_z_listwise_c01_11comp | 11 | 5570 | 0.838 | 0 | 54.30 | 6 | q1; d1; ias2; ias8; r2; s2 | 3 | 7; 9; 11 |
| MICE_z_listwise_c02_8comp | 8 | 5570 | 0.821 | 0 | 49.34 | 1 | ig4 | 1 | 7 |
| MICE_z_listwise_c03_7comp | 7 | 5570 | 0.822 | 0 | 47.34 | 2 | z4; v6 | 1 | 7 |
| MICE_z_listwise_c04_6comp | 6 | 5570 | 0.818 | 0 | 45.89 | 2 | ias9; ii11 | 0 | |
| MICE_z_listwise_c05_6comp | 6 | 5570 | 0.819 | 0 | 48.16 | 1 | x1 | 0 | |
| MICE_z_listwise_c06_6comp | 6 | 5570 | 0.820 | 0 | 49.47 | 0 | | 0 | |
This subsection presents the final rotated solution obtained after the iterative PCA cycles. The table reports the ProMax-rotated pattern matrix, showing the retained indicators and their primary loadings in the stable component structure.
pca_solution <- pca_auto$solution
loadings_final <- as.data.frame(round(pca_solution$loadings_pattern, 3)) %>%
tibble::rownames_to_column("Indicator")
kbl(
loadings_final,
caption = "Final component loadings (pattern matrix)",
booktabs = TRUE,
align = "lcccccccc",
escape = TRUE
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
font_size = 10,
position = "center"
)
| Indicator | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 |
|---|---|---|---|---|---|---|
| b3 | -0.001 | -0.048 | -0.003 | 0.023 | 0.003 | 0.703 |
| b4 | -0.098 | -0.075 | -0.198 | -0.008 | 0.059 | -0.342 |
| b5 | -0.033 | 0.020 | 0.052 | -0.020 | 0.419 | 0.120 |
| c3 | 0.240 | 0.150 | 0.175 | -0.094 | -0.005 | 0.412 |
| f1 | 0.070 | 0.743 | 0.044 | -0.004 | -0.123 | -0.091 |
| f2 | -0.068 | 0.898 | -0.099 | 0.021 | 0.073 | 0.117 |
| h1 | 0.028 | 0.772 | -0.011 | 0.008 | -0.075 | -0.223 |
| h2 | -0.044 | 0.908 | -0.031 | 0.020 | 0.035 | -0.066 |
| i1 | -0.074 | -0.018 | -0.262 | 0.076 | 0.066 | 0.455 |
| ias4 | -0.142 | 0.103 | -0.626 | 0.042 | -0.008 | -0.043 |
| ias6 | 0.572 | -0.047 | 0.321 | -0.032 | -0.046 | -0.332 |
| ias7 | 0.563 | -0.003 | -0.226 | 0.113 | -0.085 | 0.252 |
| ig8 | 0.775 | 0.040 | -0.074 | 0.011 | -0.089 | 0.028 |
| ig51 | -0.010 | 0.002 | 0.040 | 0.880 | 0.010 | 0.054 |
| ig52 | 0.195 | 0.012 | -0.019 | 0.783 | 0.041 | -0.002 |
| ig53 | -0.018 | 0.036 | 0.045 | 0.883 | 0.012 | 0.055 |
| ii1 | 0.132 | 0.134 | 0.145 | -0.050 | 0.362 | -0.063 |
| ii3 | -0.207 | -0.035 | -0.008 | 0.036 | -0.060 | 0.608 |
| ii5 | 0.213 | -0.002 | 0.062 | -0.138 | -0.098 | 0.414 |
| ii6 | 0.577 | 0.009 | -0.286 | -0.016 | 0.134 | 0.013 |
| ii10 | 0.091 | 0.014 | 0.125 | 0.055 | -0.049 | 0.693 |
| im1 | -0.197 | 0.120 | 0.210 | -0.008 | -0.042 | -0.664 |
| n1 | -0.271 | -0.030 | 0.789 | 0.000 | -0.048 | -0.117 |
| n2 | 0.221 | 0.017 | -0.886 | -0.071 | 0.046 | -0.143 |
| o6 | 0.086 | 0.004 | -0.089 | -0.119 | 0.016 | 0.388 |
| p1 | -0.030 | -0.012 | -0.199 | -0.042 | 0.621 | -0.092 |
| pa1 | 0.524 | -0.063 | 0.186 | 0.204 | 0.014 | 0.064 |
| pa3 | 0.067 | 0.022 | -0.535 | 0.004 | -0.136 | 0.215 |
| s1 | 0.322 | 0.044 | 0.060 | -0.141 | 0.126 | 0.211 |
| v3 | 0.748 | -0.062 | -0.300 | -0.016 | -0.031 | 0.050 |
| v4 | -0.295 | 0.057 | 0.280 | 0.016 | 0.055 | 0.493 |
| w1 | -0.085 | -0.033 | 0.038 | 0.043 | 0.652 | 0.034 |
| w2 | -0.072 | 0.005 | -0.002 | 0.059 | 0.631 | -0.020 |
This subsection reports the internal consistency of the final rotated components. For each retained component, we compute Cronbach’s alpha, mean inter-item correlation, and McDonald’s omega (total and hierarchical) using listwise data, providing a reliability profile for the stable PCA solution.
groups_rc <- items_per_component(
flags_df = pca_solution$flags,
cutoff = CFG$carga_min
)
rel_df <- component_reliability(
data_z = data_pca,
groups = groups_rc,
name_solution = "MICE_z_listwise",
dir_base = file.path(DIR_OUT, "07_reliability"),
mode = "listwise"
)
rel_print <- rel_df %>%
dplyr::select(
Component = component,
`Items (n)` = n_items,
`n (listwise)` = n_listwise,
`Cronbach's alpha` = alpha_std,
`Mean item r` = mean_item_r,
`Omega total` = omega_total,
`Omega hierarchical` = omega_hier,
Items = items
)
kbl(
rel_print,
caption = "Reliability of PCA components",
booktabs = TRUE,
align = "lccccccc",
escape = TRUE
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
font_size = 11,
position = "center"
)
| Component | Items (n) | n (listwise) | Cronbach’s alpha | Mean item r | Omega total | Omega hierarchical | Items |
|---|---|---|---|---|---|---|---|
| RC1 | 7 | 5570 | 0.734 | 0.283 | 0.829 | 0.720 | ig8, v3, ii6, ias6, ias7, pa1, s1 |
| RC2 | 4 | 5570 | 0.873 | 0.633 | 0.953 | 0.757 | h2, f2, h1, f1 |
| RC3 | 4 | 5570 | 0.751 | 0.429 | 0.847 | 0.743 | n2, n1, ias4, pa3 |
| RC4 | 3 | 5570 | 0.851 | 0.656 | 0.913 | 0.785 | ig53, ig51, ig52 |
| RC5 | 5 | 5570 | 0.432 | 0.132 | 0.548 | 0.254 | w1, w2, p1, b5, ii1 |
| RC6 | 10 | 5570 | 0.706 | 0.194 | 0.750 | 0.625 | b3, ii10, im1, ii3, v4, i1, ii5, c3, o6, b4 |
All analyses were performed in R, using the packages listed in the setup section of this document. The session information below allows full reproducibility of the computational environment.
sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=Portuguese_Brazil.utf8 LC_CTYPE=Portuguese_Brazil.utf8
## [3] LC_MONETARY=Portuguese_Brazil.utf8 LC_NUMERIC=C
## [5] LC_TIME=Portuguese_Brazil.utf8
##
## time zone: America/Sao_Paulo
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.4.0 mice_3.18.0 writexl_1.5.4 R.utils_2.13.0
## [5] R.oo_1.27.1 R.methodsS3_1.8.2 paran_1.5.4 MASS_7.3-65
## [9] Hmisc_5.2-3 skimr_2.2.1 janitor_2.2.1 readxl_1.4.5
## [13] naniar_1.1.0 psych_2.5.6 lubridate_1.9.4 forcats_1.0.0
## [17] stringr_1.5.1 dplyr_1.1.4 purrr_1.1.0 readr_2.1.5
## [21] tidyr_1.3.1 tibble_3.3.0 ggplot2_4.0.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] Rdpack_2.6.4 mnormt_2.1.1 gridExtra_2.3
## [4] rlang_1.1.6 magrittr_2.0.3 snakecase_0.11.1
## [7] compiler_4.5.1 png_0.1-8 systemfonts_1.2.3
## [10] vctrs_0.6.5 pkgconfig_2.0.3 shape_1.4.6.1
## [13] fastmap_1.2.0 backports_1.5.0 labeling_0.4.3
## [16] rmarkdown_2.30 tzdb_0.5.0 nloptr_2.2.1
## [19] ragg_1.4.0 visdat_0.6.0 xfun_0.52
## [22] glmnet_4.1-10 jomo_2.7-6 cachem_1.1.0
## [25] jsonlite_2.0.0 pan_1.9 broom_1.0.9
## [28] parallel_4.5.1 cluster_2.1.8.1 R6_2.6.1
## [31] bslib_0.9.0 stringi_1.8.7 RColorBrewer_1.1-3
## [34] boot_1.3-31 rpart_4.1.24 jquerylib_0.1.4
## [37] cellranger_1.1.0 Rcpp_1.1.0 iterators_1.0.14
## [40] knitr_1.50 base64enc_0.1-3 Matrix_1.7-3
## [43] splines_4.5.1 nnet_7.3-20 timechange_0.3.0
## [46] tidyselect_1.2.1 rstudioapi_0.17.1 dichromat_2.0-0.1
## [49] yaml_2.3.10 codetools_0.2-20 lattice_0.22-7
## [52] withr_3.0.2 S7_0.2.0 evaluate_1.0.5
## [55] foreign_0.8-90 survival_3.8-3 norm_1.0-11.1
## [58] xml2_1.4.0 pillar_1.11.0 checkmate_2.3.3
## [61] foreach_1.5.2 reformulas_0.4.1 generics_0.1.4
## [64] hms_1.1.3 scales_1.4.0 minqa_1.2.8
## [67] glue_1.8.0 tools_4.5.1 data.table_1.17.8
## [70] lme4_1.1-37 grid_4.5.1 rbibutils_2.3
## [73] colorspace_2.1-1 nlme_3.1-168 repr_1.1.7
## [76] htmlTable_2.4.3 Formula_1.2-5 cli_3.6.5
## [79] textshaping_1.0.1 viridisLite_0.4.2 svglite_2.2.1
## [82] gtable_0.3.6 sass_0.4.10 digest_0.6.37
## [85] GPArotation_2025.3-1 htmlwidgets_1.6.4 farver_2.1.2
## [88] htmltools_0.5.8.1 lifecycle_1.0.4 mitml_0.4-5