library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lavaan)
## This is lavaan 0.6-15
## lavaan is FREE software! Please report any bugs.
library(semPlot)
## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
data_sem <- read.csv('./data/EEG_RS_lanExp_income_imputated_all_v5.csv') %>%
  rename(Age = Age_EEG_yr)
model_2 <- '
  # Level 1: Direct paths
  CDRAN ~ a * PW + SES + Age
  CWR ~ b1 * CDRAN + c1 * PW + SES + Age
  CDICT ~ b2 * CDRAN + c2 * PW + SES + Age
  EWR ~ b3 * CDRAN + c3 * PW + SES + Age
  EDICT ~ b4 * CDRAN + c4 * PW + SES + Age

  # Residual covariances to control for shared variance
  CWR ~~ CDICT
  CWR ~~ EDICT
  CWR ~~ EWR
  CDICT ~~ EDICT
  CDICT ~~ EWR
  EDICT ~~ EWR

  # Level 2: Between-group model
  PW ~ 1  # Random intercept for PW
  CDRAN ~ 1
  CWR ~ 1
  CDICT ~ 1
  EDICT ~ 1
  EWR ~ 1
  
  # DEFINE INDIRECT AND TOTAL EFFECTS: PW -> CDRAN -> Academic Skills
  # Indirect effects (PW -> CDRAN -> Skill)
  Indirect_PW_CWR := a * b1
  Indirect_PW_CDICT := a * b2
  Indirect_PW_EDICT := a * b3
  Indirect_PW_EWR := a * b4

  # Total effects (Indirect + Direct)
  Total_PW_CWR := (a * b1) + c1
  Total_PW_CDICT := (a * b2) + c2
  Total_PW_EDICT := (a * b3) + c3
  Total_PW_EWR := (a * b4) + c4
'

# Fit the model
fit_2 <- sem(model_2, data = data_sem, cluster = 'FID')
## Warning in lav_options_set(opt): observed.information for ALL test statistics
## is set to h1.
## Warning in lav_data_full(data = data, group = group, cluster = cluster, : lavaan WARNING: cluster variable 'FID' contains missing values
## Warning in lav_data_full(data = data, group = group, cluster = cluster, :
## lavaan WARNING: some observed variances are (at least) a factor 1000 times
## larger than others; use varTable(fit) to investigate
## Warning in lav_partable_vnames(FLAT, "ov.x", warn = TRUE): lavaan WARNING:
##     model syntax contains variance/covariance/intercept formulas
##     involving (an) exogenous variable(s): [PW]; These variables will
##     now be treated as random introducing additional free parameters.
##     If you wish to treat those variables as fixed, remove these
##     formulas from the model syntax. Otherwise, consider adding the
##     fixed.x = FALSE option.
# For bootstrap CIs with clustered data, we need to use a different approach
# First, let's get the regular results
summary(fit_2, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6.15 ended normally after 202 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        37
## 
##                                                   Used       Total
##   Number of observations                           135         202
##   Number of clusters [FID]                          74            
## 
## Model Test User Model:
##                                               Standard      Scaled
##   Test Statistic                                 0.945       0.651
##   Degrees of freedom                                 2           2
##   P-value (Chi-square)                           0.623       0.722
##   Scaling correction factor                                  1.452
##     Yuan-Bentler correction (Mplus variant)                       
##   Observed information based on                     H1            
## 
## Model Test Baseline Model:
## 
##   Test statistic                               600.169     458.233
##   Degrees of freedom                                27          27
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.310
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    1.000       1.000
##   Tucker-Lewis Index (TLI)                       1.025       1.042
##                                                                   
##   Robust Comparative Fit Index (CFI)                         1.000
##   Robust Tucker-Lewis Index (TLI)                            1.047
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -1734.630   -1734.630
##   Scaling correction factor                                  1.326
##       for the MLR correction                                      
##   Loglikelihood unrestricted model (H1)      -1734.157   -1734.157
##   Scaling correction factor                                  1.332
##       for the MLR correction                                      
##                                                                   
##   Akaike (AIC)                                3543.260    3543.260
##   Bayesian (BIC)                              3650.755    3650.755
##   Sample-size adjusted Bayesian (SABIC)       3533.712    3533.712
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.000       0.000
##   90 Percent confidence interval - lower         0.000       0.000
##   90 Percent confidence interval - upper         0.137       0.093
##   P-value H_0: RMSEA <= 0.050                    0.711       0.866
##   P-value H_0: RMSEA >= 0.080                    0.191       0.071
##                                                                   
##   Robust RMSEA                                               0.000
##   90 Percent confidence interval - lower                     0.000
##   90 Percent confidence interval - upper                     0.147
##   P-value H_0: Robust RMSEA <= 0.050                         0.772
##   P-value H_0: Robust RMSEA >= 0.080                         0.168
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.023       0.023
## 
## Parameter Estimates:
## 
##   Standard errors                        Robust.cluster
##   Information                                  Observed
##   Observed information based on                 Hessian
## 
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   CDRAN ~                                                               
##     PW         (a)    7.100    2.481    2.862    0.004    7.100    0.214
##     SES               0.131    0.051    2.568    0.010    0.131    0.217
##     Age               0.460    0.083    5.543    0.000    0.460    0.402
##   CWR ~                                                                 
##     CDRAN     (b1)   15.480    2.905    5.329    0.000   15.480    0.441
##     PW        (c1)  129.800   71.187    1.823    0.068  129.800    0.111
##     SES              -0.921    1.916   -0.481    0.631   -0.921   -0.043
##     Age              14.364    3.416    4.204    0.000   14.364    0.358
##   CDICT ~                                                               
##     CDRAN     (b2)    2.941    0.924    3.183    0.001    2.941    0.352
##     PW        (c2)   41.652   19.218    2.167    0.030   41.652    0.150
##     SES               0.236    0.506    0.467    0.641    0.236    0.047
##     Age               1.568    0.953    1.645    0.100    1.568    0.164
##   EWR ~                                                                 
##     CDRAN     (b3)    5.047    1.295    3.899    0.000    5.047    0.293
##     PW        (c3)   55.338   32.448    1.705    0.088   55.338    0.097
##     SES               4.739    0.810    5.853    0.000    4.739    0.457
##     Age               5.250    1.565    3.354    0.001    5.250    0.267
##   EDICT ~                                                               
##     CDRAN     (b4)    2.497    0.655    3.811    0.000    2.497    0.310
##     PW        (c4)   34.730   15.568    2.231    0.026   34.730    0.130
##     SES               2.082    0.370    5.627    0.000    2.082    0.428
##     Age               2.312    0.708    3.266    0.001    2.312    0.251
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##  .CWR ~~                                                                
##    .CDICT            98.305   17.975    5.469    0.000   98.305    0.640
##    .EDICT            33.049   13.502    2.448    0.014   33.049    0.281
##    .EWR              77.710   25.961    2.993    0.003   77.710    0.315
##  .CDICT ~~                                                              
##    .EDICT             6.412    3.078    2.083    0.037    6.412    0.193
##    .EWR              11.254    7.278    1.546    0.122   11.254    0.161
##  .EWR ~~                                                                
##    .EDICT            42.993    6.086    7.064    0.000   42.993    0.805
## 
## Intercepts:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##     PW                0.090    0.003   32.649    0.000    0.090    3.237
##    .CDRAN            -1.178    0.729   -1.616    0.106   -1.178   -1.283
##    .CWR             -97.076   25.753   -3.770    0.000  -97.076   -3.011
##    .CDICT             1.112    7.280    0.153    0.879    1.112    0.145
##    .EDICT           -21.817    5.352   -4.077    0.000  -21.817   -2.948
##    .EWR             -43.917   13.122   -3.347    0.001  -43.917   -2.781
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .CDRAN             0.612    0.079    7.745    0.000    0.612    0.726
##    .CWR             542.135   75.674    7.164    0.000  542.135    0.522
##    .CDICT            43.549    5.649    7.710    0.000   43.549    0.742
##    .EWR             112.104   13.882    8.076    0.000  112.104    0.449
##    .EDICT            25.468    3.391    7.511    0.000   25.468    0.465
##     PW                0.001    0.000    7.228    0.000    0.001    1.000
## 
## Defined Parameters:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##     Indirct_PW_CWR  109.913   42.051    2.614    0.009  109.913    0.094
##     Indrc_PW_CDICT   20.882    8.690    2.403    0.016   20.882    0.075
##     Indrc_PW_EDICT   35.838   15.433    2.322    0.020   35.838    0.063
##     Indirct_PW_EWR   17.728    8.029    2.208    0.027   17.728    0.066
##     Total_PW_CWR    239.714   77.410    3.097    0.002  239.714    0.206
##     Total_PW_CDICT   62.534   19.906    3.141    0.002   62.534    0.226
##     Total_PW_EDICT   91.176   35.047    2.602    0.009   91.176    0.160
##     Total_PW_EWR     52.458   16.515    3.176    0.001   52.458    0.196
# ============================================================================
# Bootstrap confidence intervals for indirect and total effects
# ============================================================================

# For bootstrap confidence intervals with clustered data, we'll use bootstrapLavaan
set.seed(123)  # For reproducibility
bootstrap_results <- bootstrapLavaan(fit_2, R = 5000, type = "ordinary")

# The bootstrap object does not include defined parameters (Indirect_/Total_).
# We reconstruct indirect and total effects from labeled path coefficients.
needed_coeffs <- c("a","b1","b2","b3","b4","c1","c2","c3","c4")
has_all <- all(needed_coeffs %in% colnames(bootstrap_results))

bootstrap_ci_effects <- data.frame()
if (has_all) {
  # Compute effects for each bootstrap sample
  indirect_total_matrix <- data.frame(
    Indirect_PW_CWR  = bootstrap_results[,"a"]  * bootstrap_results[,"b1"],
    Indirect_PW_CDICT= bootstrap_results[,"a"]  * bootstrap_results[,"b2"],
    Indirect_PW_EDICT= bootstrap_results[,"a"]  * bootstrap_results[,"b3"],
    Indirect_PW_EWR  = bootstrap_results[,"a"]  * bootstrap_results[,"b4"],
    Total_PW_CWR     = bootstrap_results[,"a"]  * bootstrap_results[,"b1"] + bootstrap_results[,"c1"],
    Total_PW_CDICT   = bootstrap_results[,"a"]  * bootstrap_results[,"b2"] + bootstrap_results[,"c2"],
    Total_PW_EDICT   = bootstrap_results[,"a"]  * bootstrap_results[,"b3"] + bootstrap_results[,"c3"],
    Total_PW_EWR     = bootstrap_results[,"a"]  * bootstrap_results[,"b4"] + bootstrap_results[,"c4"]
  )
  # Derive percentile CIs
  for (effect in colnames(indirect_total_matrix)) {
    boot_values <- indirect_total_matrix[[effect]]
    ci_lower <- quantile(boot_values, 0.025, na.rm = TRUE)
    ci_upper <- quantile(boot_values, 0.975, na.rm = TRUE)
    bootstrap_ci_effects <- rbind(
      bootstrap_ci_effects,
      data.frame(label = effect,
                 ci.lower.boot = ci_lower,
                 ci.upper.boot = ci_upper,
                 stringsAsFactors = FALSE)
    )
  }
} else {
  message("Bootstrap coefficients for indirect/total effects could not be reconstructed: missing labels ",
          paste(setdiff(needed_coeffs, colnames(bootstrap_results)), collapse = ", "))
}

# ============================================================================
# Extract results for Word document
# ============================================================================

# Extract parameter estimates
param_estimates <- parameterEstimates(fit_2, standardized = TRUE) %>%
  mutate(
    # Add significance stars
    significance = case_when(
      pvalue < 0.001 ~ "***",
      pvalue < 0.01 ~ "**",
      pvalue < 0.05 ~ "*",
      TRUE ~ ""
    ),
    # Create significance indicator
    significant = pvalue < 0.05,
    # Format path description
    path = paste(lhs, op, rhs),
    # Round values
    across(c(est, se, z, pvalue, ci.lower, ci.upper, std.all), ~round(., 3))
  )

# Merge bootstrap CIs for indirect and total effects
param_estimates <- param_estimates %>%
  left_join(bootstrap_ci_effects, by = "label") %>%
  mutate(
    # Use bootstrap CIs for indirect/total effects, otherwise use regular CIs
    ci.lower.final = ifelse(!is.na(ci.lower.boot), round(ci.lower.boot, 3), ci.lower),
    ci.upper.final = ifelse(!is.na(ci.upper.boot), round(ci.upper.boot, 3), ci.upper),
    # Create CI string
    ci_95 = paste0("[", ci.lower.final, ", ", ci.upper.final, "]"),
    # Mark bootstrap CIs
    ci_type = ifelse(!is.na(ci.lower.boot), "Bootstrap", "Regular")
  )

# Separate regression paths and covariances
regression_paths <- param_estimates %>%
  filter(op == "~")

# Rename and select columns using base R
regression_paths <- regression_paths[, c("lhs", "rhs", "est", "std.all", "se", "z", "pvalue", 
                                        "ci.lower.final", "ci.upper.final", "ci_95", "ci_type", 
                                        "significance", "significant")]
names(regression_paths) <- c("outcome", "predictor", "est", "std.all", "se", "z", "pvalue", 
                            "ci.lower", "ci.upper", "ci_95", "ci_type", "significance", "significant")

covariances <- param_estimates %>%
  filter(op == "~~", lhs != rhs)

# Rename and select columns using base R
covariances <- covariances[, c("lhs", "rhs", "est", "std.all", "se", "z", "pvalue", 
                              "ci.lower.final", "ci.upper.final", "ci_95", "ci_type", 
                              "significance", "significant")]
names(covariances) <- c("var1", "var2", "est", "std.all", "se", "z", "pvalue", 
                       "ci.lower", "ci.upper", "ci_95", "ci_type", "significance", "significant")

# Extract defined parameters (indirect and total effects)
defined_effects <- param_estimates %>%
  filter(op == ":=") %>%
  select(label, est, std.all, se, z, pvalue, ci.lower.final, ci.upper.final, 
         ci_95, ci_type, significance, significant)

# Extract fit measures
fit_measures <- fitMeasures(fit_2)
fit_df <- data.frame(
  Measure = c("Chi-Square", "df", "p-value", "CFI", "TLI", "RMSEA", "RMSEA 90% CI Lower", 
              "RMSEA 90% CI Upper", "SRMR", "AIC", "BIC"),
  Value = c(
    round(fit_measures["chisq"], 3),
    fit_measures["df"],
    round(fit_measures["pvalue"], 3),
    round(fit_measures["cfi"], 3),
    round(fit_measures["tli"], 3),
    round(fit_measures["rmsea"], 3),
    round(fit_measures["rmsea.ci.lower"], 3),
    round(fit_measures["rmsea.ci.upper"], 3),
    round(fit_measures["srmr"], 3),
    round(fit_measures["aic"], 2),
    round(fit_measures["bic"], 2)
  )
)

# Display key results with bootstrap CIs
cat("\n=== INDIRECT EFFECTS (with Bootstrap 95% CIs) ===\n")
## 
## === INDIRECT EFFECTS (with Bootstrap 95% CIs) ===
indirect_effects <- defined_effects %>% filter(grepl("Indirect_", label))
for(i in 1:nrow(indirect_effects)) {
  cat(sprintf("%s: β = %.3f, %s CI %s, p = %.3f%s\n", 
              indirect_effects$label[i], 
              indirect_effects$est[i],
              indirect_effects$ci_type[i],
              indirect_effects$ci_95[i], 
              indirect_effects$pvalue[i],
              indirect_effects$significance[i]))
}
## Indirect_PW_CWR: β = 109.913, Bootstrap CI [30.882, 192.513], p = 0.009**
## Indirect_PW_CDICT: β = 20.882, Bootstrap CI [5.755, 37.545], p = 0.016*
## Indirect_PW_EDICT: β = 35.838, Bootstrap CI [9.135, 68.273], p = 0.020*
## Indirect_PW_EWR: β = 17.728, Bootstrap CI [4.473, 34.76], p = 0.027*
cat("\n=== TOTAL EFFECTS (with Bootstrap 95% CIs) ===\n")
## 
## === TOTAL EFFECTS (with Bootstrap 95% CIs) ===
total_effects <- defined_effects %>% filter(grepl("Total_", label))
for(i in 1:nrow(total_effects)) {
  cat(sprintf("%s: β = %.3f, %s CI %s, p = %.3f%s\n", 
              total_effects$label[i], 
              total_effects$est[i],
              total_effects$ci_type[i],
              total_effects$ci_95[i], 
              total_effects$pvalue[i],
              total_effects$significance[i]))
}
## Total_PW_CWR: β = 239.714, Bootstrap CI [75.671, 392.338], p = 0.002**
## Total_PW_CDICT: β = 62.534, Bootstrap CI [20.758, 106.939], p = 0.002**
## Total_PW_EDICT: β = 91.176, Bootstrap CI [23.941, 149.343], p = 0.009**
## Total_PW_EWR: β = 52.458, Bootstrap CI [21.478, 79.777], p = 0.001**
# Calculate R-squared values for each outcome variable
# Extract parameter estimates
params <- parameterEstimates(fit_2, standardized = TRUE)

# Function to calculate R-squared from lavaan model
calculate_r_squared <- function(fit, outcome_vars) {
  # Get the inspect function results
  r_squared_values <- inspect(fit, "r2")
  
  # Create a data frame with R-squared values
  if(is.null(r_squared_values)) {
    # If inspect doesn't work, calculate manually from residual variances
    # Get standardized solution
    std_solution <- standardizedSolution(fit)
    
    # Extract residual variances (these are 1 - R²)
    residual_vars <- std_solution %>%
      filter(op == "~~", lhs == rhs, lhs %in% outcome_vars) %>%
      select(lhs, est.std) %>%
      rename(variable = lhs, residual_var = est.std) %>%
      mutate(r_squared = 1 - residual_var)
    
    return(residual_vars)
  } else {
    # If inspect works, use those values
    r2_df <- data.frame(
      variable = names(r_squared_values),
      r_squared = as.numeric(r_squared_values)
    )
    return(r2_df)
  }
}

# Define outcome variables
outcome_vars <- c("CDRAN", "CWR", "CDICT", "EWR", "EDICT")

# Calculate R-squared values
r_squared_results <- calculate_r_squared(fit_2, outcome_vars)

# Display results
cat("\n=== R-SQUARED VALUES (Proportion of Variance Explained) ===\n")
## 
## === R-SQUARED VALUES (Proportion of Variance Explained) ===
print(r_squared_results)
##   variable r_squared
## 1    CDRAN 0.2736476
## 2      CWR 0.4783052
## 3    CDICT 0.2581940
## 4      EWR 0.5505111
## 5    EDICT 0.5349744