library(bruceR)

## Warning: package 'bruceR' was built under R version 4.3.3

## 
## bruceR (v2024.6)
## Broadly Useful Convenient and Efficient R functions
## 
## Packages also loaded:
## ✔ data.table ✔ emmeans
## ✔ dplyr      ✔ lmerTest
## ✔ tidyr      ✔ effectsize
## ✔ stringr    ✔ performance
## ✔ ggplot2    ✔ interactions
## 
## Main functions of `bruceR`:
## cc()             Describe()  TTEST()
## add()            Freq()      MANOVA()
## .mean()          Corr()      EMMEANS()
## set.wd()         Alpha()     PROCESS()
## import()         EFA()       model_summary()
## print_table()    CFA()       lavaan_summary()
## 
## For full functionality, please install all dependencies:
## install.packages("bruceR", dep=TRUE)
## 
## Online documentation:
## https://psychbruce.github.io/bruceR
## 
## To use this package in publications, please cite:
## Bao, H.-W.-S. (2024). bruceR: Broadly useful convenient and efficient R functions (Version 2024.6) [Computer software]. https://CRAN.R-project.org/package=bruceR

set.wd()

## ✔ Set working directory to "C:/Users/psyuser/Desktop/ARWA article"

set.seed(6)
# rm(list = ls())
library(tidyverse)

## Warning: package 'readr' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ Matrix::expand()      masks tidyr::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ data.table::first()   masks dplyr::first()
## ✖ lubridate::hour()     masks data.table::hour()
## ✖ lubridate::isoweek()  masks data.table::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ data.table::last()    masks dplyr::last()
## ✖ lubridate::mday()     masks data.table::mday()
## ✖ lubridate::minute()   masks data.table::minute()
## ✖ lubridate::month()    masks data.table::month()
## ✖ Matrix::pack()        masks tidyr::pack()
## ✖ lubridate::quarter()  masks data.table::quarter()
## ✖ lubridate::second()   masks data.table::second()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ Matrix::unpack()      masks tidyr::unpack()
## ✖ lubridate::wday()     masks data.table::wday()
## ✖ lubridate::week()     masks data.table::week()
## ✖ lubridate::yday()     masks data.table::yday()
## ✖ lubridate::year()     masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl)
library(tidyverse)
library(lmerTest)
library(conflicted)

data_income <- read.csv("./data/EEG_RS_lanExp_income_imputated_all_v5.csv")

# Center the variables
data_income <- data_income %>%
  dplyr::mutate(
    PW_c = as.numeric(scale(PW, center = TRUE, scale = FALSE)),
    Age_EEG_yr_c = as.numeric(scale(Age_EEG_yr, center = TRUE, scale = FALSE)),
    SES_c = as.numeric(scale(SES, center = TRUE, scale = FALSE)),
    CDRAN_c = as.numeric(scale(CDRAN, center = TRUE, scale = FALSE))
    # ARITH_c = scale(ARITH, center = TRUE, scale = FALSE)
  )

# keep only the columns data for the analysis
data_income <- data_income %>% 
  select(CWR, CDICT, ARITH, EWR, EDICT, CDRAN, PW_c, Age_EEG_yr_c, SES_c, CDRAN_c, FID, EEGID, Age_EEG_yr, PW, CF, exponent, offset)
  # %>% 
  # # exclude NaN value
  # drop_na() 

write.csv(data_income, file = "./data/model_data_180_v5.csv", row.names = F)
          
demo_info <- read.csv("./data/assessment-resting-state-v8-200-drop-duplicated.csv")

# left join data_income with demo_info by selected column age gender
demo_data_summary <- data_income %>%
  left_join(demo_info %>% select(EEGID, Gender), by = "EEGID",  suffix = c("", "")) %>% 
  summarise(count = n(), 
  mean_age = mean(Age_EEG_yr, na.rm = TRUE),
  gender_count = list(table(Gender))
  )
  
knitr::kable(demo_data_summary)

count	mean_age	gender_count
202	8.086683	83, 116

# Fit separate models for each response variable
model_CWR <- lmerTest::lmer(CWR ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID), data = data_income)
model_CDICT <- lmerTest::lmer(CDICT ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID), data = data_income)
model_ARITH <- lmerTest::lmer(ARITH ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID), data = data_income)
model_EWR <- lmerTest::lmer(EWR ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID), data = data_income)
model_EDICT <- lmerTest::lmer(EDICT ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID), data = data_income)

# Print the model summaries
summary(model_CWR)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: CWR ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID)
##    Data: data_income
## 
## REML criterion at convergence: 1625.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.67123 -0.45267  0.00596  0.39871  2.42828 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  FID      (Intercept) 577.4    24.03   
##  Residual             192.8    13.89   
## Number of obs: 181, groups:  FID, 101
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)   75.3981     2.6220  97.4668  28.756  < 2e-16 ***
## PW_c         201.1653    65.0612 149.0165   3.092  0.00237 ** 
## Age_EEG_yr_c  21.3751     2.6198 103.9251   8.159  8.3e-13 ***
## SES_c          0.6042     1.7509  97.6286   0.345  0.73077    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) PW_c   A_EEG_
## PW_c         0.011              
## Ag_EEG_yr_c -0.005  0.035       
## SES_c        0.042  0.056  0.051

summary(model_CDICT)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: CDICT ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID)
##    Data: data_income
## 
## REML criterion at convergence: 1196.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.3214 -0.4872  0.1039  0.5035  2.0731 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  FID      (Intercept) 36.91    6.075   
##  Residual             19.37    4.401   
## Number of obs: 183, groups:  FID, 102
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)   25.3377     0.6895  95.8007  36.747  < 2e-16 ***
## PW_c          50.1023    19.1489 167.1021   2.616   0.0097 ** 
## Age_EEG_yr_c   3.8014     0.6954 100.6957   5.466 3.35e-07 ***
## SES_c          0.5019     0.4618  94.8883   1.087   0.2799    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) PW_c   A_EEG_
## PW_c         0.020              
## Ag_EEG_yr_c -0.005  0.033       
## SES_c        0.035  0.068  0.052

summary(model_ARITH)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: ARITH ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID)
##    Data: data_income
## 
## REML criterion at convergence: 944.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7908 -0.4376 -0.0549  0.3863  3.8665 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  FID      (Intercept) 9.003    3.000   
##  Residual             4.727    2.174   
## Number of obs: 183, groups:  FID, 102
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)    7.8875     0.3406  99.7933  23.160  < 2e-16 ***
## PW_c          24.0772     9.4588 168.0732   2.545  0.01181 *  
## Age_EEG_yr_c   3.9716     0.3435 104.6348  11.563  < 2e-16 ***
## SES_c          0.7089     0.2281  98.8872   3.108  0.00246 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) PW_c   A_EEG_
## PW_c         0.020              
## Ag_EEG_yr_c -0.005  0.033       
## SES_c        0.035  0.068  0.052

summary(model_EWR)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: EWR ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID)
##    Data: data_income
## 
## REML criterion at convergence: 1333.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.3093 -0.4255 -0.0133  0.3923  3.0278 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  FID      (Intercept) 91.11    9.545   
##  Residual             37.49    6.123   
## Number of obs: 183, groups:  FID, 102
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)   19.2808     1.0556  97.9781  18.264  < 2e-16 ***
## PW_c          60.6910    27.6733 159.8471   2.193   0.0297 *  
## Age_EEG_yr_c   7.3629     1.0620 103.6222   6.933 3.63e-10 ***
## SES_c          5.0626     0.7073  97.6992   7.158 1.53e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) PW_c   A_EEG_
## PW_c         0.021              
## Ag_EEG_yr_c -0.002  0.032       
## SES_c        0.036  0.065  0.053

summary(model_EDICT)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: EDICT ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID)
##    Data: data_income
## 
## REML criterion at convergence: 1065.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.3924 -0.4309 -0.1053  0.5054  2.7162 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  FID      (Intercept) 17.834   4.223   
##  Residual              9.802   3.131   
## Number of obs: 182, groups:  FID, 102
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)    7.5983     0.4824  98.9083  15.751  < 2e-16 ***
## PW_c          31.2985    13.6090 168.5731   2.300   0.0227 *  
## Age_EEG_yr_c   3.5538     0.4866 103.5935   7.304 5.96e-11 ***
## SES_c          2.0869     0.3231  97.9724   6.459 4.07e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) PW_c   A_EEG_
## PW_c         0.025              
## Ag_EEG_yr_c -0.007  0.028       
## SES_c        0.033  0.063  0.054

# Load lavaan package
library(lavaan)

## Warning: package 'lavaan' was built under R version 4.3.3

## This is lavaan 0.6-18
## lavaan is FREE software! Please report any bugs.

# Define the mediation model
mediation_CWR <- '
  # Mediator model
  CDRAN_c ~ a * PW_c + Age_EEG_yr_c + SES_c

  # Outcome model
  CWR ~ b * CDRAN_c + c * PW_c + Age_EEG_yr_c + SES_c

  # Indirect effect (a * b)
  indirect := a * b

  # Total effect (c + a * b)
  total := c + (a * b)
'

# Fit the model
fit <- sem(mediation_CWR, data = data_income)

## Warning: lavaan->lav_data_full():  
##    some observed variances are (at least) a factor 1000 times larger than 
##    others; use varTable(fit) to investigate

# Summarize the results
summary(fit, standardized = TRUE)

## lavaan 0.6-18 ended normally after 1 iteration
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         9
## 
##                                                   Used       Total
##   Number of observations                           136         202
## 
## Model Test User Model:
##                                                       
##   Test statistic                                 0.000
##   Degrees of freedom                                 0
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   CDRAN_c ~                                                             
##     PW_c       (a)    7.109    2.424    2.932    0.003    7.109    0.217
##     Ag_EEG_yr_        0.459    0.084    5.476    0.000    0.459    0.407
##     SES_c             0.131    0.044    2.957    0.003    0.131    0.220
##   CWR ~                                                                 
##     CDRAN_c    (b)   15.493    2.571    6.027    0.000   15.493    0.441
##     PW_c       (c)  138.376   74.941    1.846    0.065  138.376    0.120
##     Ag_EEG_yr_       14.041    2.778    5.054    0.000   14.041    0.354
##     SES_c            -1.076    1.369   -0.786    0.432   -1.076   -0.051
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .CDRAN_c           0.607    0.074    8.246    0.000    0.607    0.738
##    .CWR             545.915   66.202    8.246    0.000  545.915    0.537
## 
## Defined Parameters:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##     indirect        110.136   41.769    2.637    0.008  110.136    0.096
##     total           248.512   81.810    3.038    0.002  248.512    0.216

# Define the mediation model
mediation_CDICT <- '
  # Mediator model
  CDRAN_c ~ a * PW_c + Age_EEG_yr_c + SES_c

  # Outcome model
  CDICT ~ b * CDRAN_c + c * PW_c + Age_EEG_yr_c + SES_c

  # Indirect effect (a * b)
  indirect := a * b

  # Total effect (c + a * b)
  total := c + (a * b)
'

# Fit the model
fit <- sem(mediation_CDICT, data = data_income)

# Summarize the results
summary(fit, standardized = TRUE)

## lavaan 0.6-18 ended normally after 1 iteration
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         9
## 
##                                                   Used       Total
##   Number of observations                           138         202
## 
## Model Test User Model:
##                                                       
##   Test statistic                                 0.000
##   Degrees of freedom                                 0
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   CDRAN_c ~                                                             
##     PW_c       (a)    6.984    2.369    2.949    0.003    6.984    0.216
##     Ag_EEG_yr_        0.457    0.083    5.513    0.000    0.457    0.406
##     SES_c             0.132    0.044    3.012    0.003    0.132    0.222
##   CDICT ~                                                               
##     CDRAN_c    (b)    2.896    0.735    3.942    0.000    2.896    0.338
##     PW_c       (c)   51.709   21.074    2.454    0.014   51.709    0.187
##     Ag_EEG_yr_        1.710    0.790    2.166    0.030    1.710    0.177
##     SES_c             0.145    0.390    0.373    0.709    0.145    0.029
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .CDRAN_c           0.599    0.072    8.307    0.000    0.599    0.737
##    .CDICT            44.607    5.370    8.307    0.000   44.607    0.746
## 
## Defined Parameters:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##     indirect         20.226    8.566    2.361    0.018   20.226    0.073
##     total            71.935   21.560    3.336    0.001   71.935    0.260

# Define the mediation model
mediation_EWR <- '
  # Mediator model
  CDRAN_c ~ a * PW_c + Age_EEG_yr_c + SES_c

  # Outcome model
  EWR ~ b * CDRAN_c + c * PW_c + Age_EEG_yr_c + SES_c

  # Indirect effect (a * b)
  indirect := a * b

  # Total effect (c + a * b)
  total := c + (a * b)
'

# Fit the model
fit <- sem(mediation_EWR, data = data_income)

# Summarize the results
summary(fit, standardized = TRUE)

## lavaan 0.6-18 ended normally after 1 iteration
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         9
## 
##                                                   Used       Total
##   Number of observations                           138         202
## 
## Model Test User Model:
##                                                       
##   Test statistic                                 0.000
##   Degrees of freedom                                 0
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   CDRAN_c ~                                                             
##     PW_c       (a)    6.984    2.369    2.949    0.003    6.984    0.216
##     Ag_EEG_yr_        0.457    0.083    5.513    0.000    0.457    0.406
##     SES_c             0.132    0.044    3.012    0.003    0.132    0.222
##   EWR ~                                                                 
##     CDRAN_c    (b)    5.139    1.191    4.316    0.000    5.139    0.295
##     PW_c       (c)   38.193   34.159    1.118    0.264   38.193    0.068
##     Ag_EEG_yr_        4.796    1.280    3.748    0.000    4.796    0.245
##     SES_c             4.853    0.632    7.672    0.000    4.853    0.469
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .CDRAN_c           0.599    0.072    8.307    0.000    0.599    0.737
##    .EWR             117.194   14.108    8.307    0.000  117.194    0.475
## 
## Defined Parameters:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##     indirect         35.890   14.741    2.435    0.015   35.890    0.064
##     total            74.083   35.296    2.099    0.036   74.083    0.132

# Define the mediation model
mediation_EDICT <- '
  # Mediator model
  CDRAN_c ~ a * PW_c + Age_EEG_yr_c + SES_c

  # Outcome model
  EDICT ~ b * CDRAN_c + c * PW_c + Age_EEG_yr_c + SES_c

  # Indirect effect (a * b)
  indirect := a * b

  # Total effect (c + a * b)
  total := c + (a * b)
'

# Fit the model
fit <- sem(mediation_EDICT, data = data_income)

# Summarize the results
summary(fit, standardized = TRUE)

## lavaan 0.6-18 ended normally after 1 iteration
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         9
## 
##                                                   Used       Total
##   Number of observations                           137         202
## 
## Model Test User Model:
##                                                       
##   Test statistic                                 0.000
##   Degrees of freedom                                 0
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   CDRAN_c ~                                                             
##     PW_c       (a)    6.976    2.386    2.923    0.003    6.976    0.215
##     Ag_EEG_yr_        0.457    0.083    5.474    0.000    0.457    0.404
##     SES_c             0.132    0.044    2.995    0.003    0.132    0.221
##   EDICT ~                                                               
##     CDRAN_c    (b)    2.503    0.552    4.539    0.000    2.503    0.311
##     PW_c       (c)   34.122   15.877    2.149    0.032   34.122    0.131
##     Ag_EEG_yr_        2.295    0.595    3.858    0.000    2.295    0.252
##     SES_c             2.087    0.294    7.101    0.000    2.087    0.435
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .CDRAN_c           0.603    0.073    8.276    0.000    0.603    0.738
##    .EDICT            25.146    3.038    8.276    0.000   25.146    0.475
## 
## Defined Parameters:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##     indirect         17.461    7.105    2.458    0.014   17.461    0.067
##     total            51.583   16.521    3.122    0.002   51.583    0.198

# # Mediator model: CDRAN_c as a function of PW_c
# mediator_model <- lm(CDRAN_c ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID), data = data_income)
# # Outcome model: XXX a function of PW_c and SES_c
# outcome_CWR <- lmerTest::lmer(CWR ~ PW_c + CDRAN_c + SES_c + Age_EEG_yr_c + (1 | FID), data = data_income)
# outcome_CDICT <- lmerTest::lmer(CDICT ~ PW_c + CDRAN_c + SES_c + Age_EEG_yr_c + (1 | FID), data = data_income)
# outcome_ARITH <- lmerTest::lmer(ARITH ~ PW_c + CDRAN_c + SES_c + Age_EEG_yr_c + (1 | FID), data = data_income)
# outcome_EWR <- lmerTest::lmer(EWR ~ PW_c + CDRAN_c + SES_c + Age_EEG_yr_c + (1 | FID), data = data_income)
# outcome_EDICT <- lmerTest::lmer(EDICT ~ PW_c + CDRAN_c + SES_c + Age_EEG_yr_c + (1 | FID), data = data_income)
# 
# # Perform mediation analysis
# mediate_CWR <- mediate(
#   model.m = mediator_model,  # Mediator model
#   model.y = outcome_CWR,  # Outcome model
#   treat = "PW_c",           # Predictor variable
#   mediator = "CDRAN_c",       # Mediator variable
#   boot = TRUE,              # Use bootstrapping for confidence intervals
#   sims = 1000               # Number of bootstrap simulations
# )
# 
# # Summarize the results
# summary(mediate_CWR)
# 
# # Perform mediation analysis
# mediate_CDICT <- mediate(
#   model.m = mediator_model,  # Mediator model
#   model.y = outcome_CDICT,  # Outcome model
#   treat = "PW_c",           # Predictor variable
#   mediator = "CDRAN_c",       # Mediator variable
#   boot = F,              # Use bootstrapping for confidence intervals
#   sims = 1000               # Number of bootstrap simulations
# )
# 
# # Summarize the results
# summary(mediate_CDICT)
# 
# # Perform mediation analysis
# mediate_ARITH <- mediate(
#   model.m = mediator_model,  # Mediator model
#   model.y = outcome_ARITH,  # Outcome model
#   treat = "PW_c",           # Predictor variable
#   mediator = "CDRAN_c",       # Mediator variable
#   boot = F,              # Use bootstrapping for confidence intervals
#   sims = 1000               # Number of bootstrap simulations
# )
# 
# # Summarize the results
# summary(mediate_ARITH)
# 
# # Perform mediation analysis
# mediate_EWR <- mediate(
#   model.m = mediator_model,  # Mediator model
#   model.y = outcome_EWR,  # Outcome model
#   treat = "PW_c",           # Predictor variable
#   mediator = "CDRAN_c",       # Mediator variable
#   boot = F,              # Use bootstrapping for confidence intervals
#   sims = 1000               # Number of bootstrap simulations
# )
# 
# # Summarize the results
# summary(mediate_EWR)
# 
# # Perform mediation analysis
# mediate_EDICT <- mediate(
#   model.m = mediator_model,  # Mediator model
#   model.y = outcome_EDICT,  # Outcome model
#   treat = "PW_c",           # Predictor variable
#   mediator = "CDRAN_c",       # Mediator variable
#   boot = F,              # Use bootstrapping for confidence intervals
#   sims = 1000               # Number of bootstrap simulations
# )
# 
# # Summarize the results
# summary(mediate_EDICT)

# Fit the model with PW as predictor, SES as moderator, and Age_EEG_yr as covariate
moderate_CWR_SES <- lmerTest::lmer(CWR ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID), data = data_income)
moderate_CDICT_SES <- lmerTest::lmer(CDICT ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID), data = data_income)
moderate_ARITH_SES <- lmerTest::lmer(ARITH ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID), data = data_income)
moderate_EWR_SES <- lmerTest::lmer(EWR ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID), data = data_income)
moderate_EDICT_SES <- lmerTest::lmer(EDICT ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID), data = data_income)

# Print the model summary
summary(moderate_CWR_SES)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: CWR ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID)
##    Data: data_income
## 
## REML criterion at convergence: 1615.7
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.63654 -0.45293  0.02704  0.38155  2.41868 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  FID      (Intercept) 587.4    24.24   
##  Residual             191.3    13.83   
## Number of obs: 181, groups:  FID, 101
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)   75.3091     2.6427  95.9331  28.497  < 2e-16 ***
## PW_c         192.4478    66.4999 154.2573   2.894  0.00436 ** 
## SES_c          0.7626     1.7826  99.2972   0.428  0.66971    
## Age_EEG_yr_c  21.4366     2.6377 102.4821   8.127 1.05e-12 ***
## PW_c:SES_c   -27.4350    45.5669 136.6754  -0.602  0.54812    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) PW_c   SES_c  A_EEG_
## PW_c         0.021                     
## SES_c        0.034  0.023              
## Ag_EEG_yr_c -0.006  0.028  0.055       
## PW_c:SES_c   0.051  0.209 -0.150 -0.033

summary(moderate_CDICT_SES)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: CDICT ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID)
##    Data: data_income
## 
## REML criterion at convergence: 1189
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.27929 -0.47065  0.05754  0.50528  2.00731 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  FID      (Intercept) 37.26    6.104   
##  Residual             19.31    4.395   
## Number of obs: 183, groups:  FID, 102
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)   25.3009     0.6933  95.4681  36.491  < 2e-16 ***
## PW_c          47.4916    19.4171 171.0016   2.446   0.0155 *  
## SES_c          0.5622     0.4698  98.1405   1.197   0.2343    
## Age_EEG_yr_c   3.8247     0.6983 100.2972   5.477 3.21e-07 ***
## PW_c:SES_c   -10.5681    13.5500 155.3846  -0.780   0.4366    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) PW_c   SES_c  A_EEG_
## PW_c         0.031                     
## SES_c        0.024  0.039              
## Ag_EEG_yr_c -0.008  0.025  0.059       
## PW_c:SES_c   0.067  0.163 -0.165 -0.044

summary(moderate_ARITH_SES)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: ARITH ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID)
##    Data: data_income
## 
## REML criterion at convergence: 938.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7646 -0.4276 -0.0359  0.3866  3.8318 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  FID      (Intercept) 8.999    3.000   
##  Residual             4.770    2.184   
## Number of obs: 183, groups:  FID, 102
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)    7.8946     0.3416  99.9052  23.108  < 2e-16 ***
## PW_c          24.5736     9.6128 172.0946   2.556  0.01144 *  
## SES_c          0.6976     0.2315 102.5250   3.013  0.00326 ** 
## Age_EEG_yr_c   3.9667     0.3442 104.6136  11.526  < 2e-16 ***
## PW_c:SES_c     2.0137     6.7132 158.0059   0.300  0.76460    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) PW_c   SES_c  A_EEG_
## PW_c         0.031                     
## SES_c        0.024  0.040              
## Ag_EEG_yr_c -0.008  0.025  0.059       
## PW_c:SES_c   0.067  0.161 -0.166 -0.044

summary(moderate_EWR_SES)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: EWR ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID)
##    Data: data_income
## 
## REML criterion at convergence: 1325.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2809 -0.4226 -0.0159  0.3915  3.0008 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  FID      (Intercept) 92.84    9.635   
##  Residual             37.11    6.092   
## Number of obs: 183, groups:  FID, 102
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)   19.2320     1.0648  96.7680  18.062  < 2e-16 ***
## PW_c          56.6085    28.1326 164.3695   2.012   0.0458 *  
## SES_c          5.1420     0.7205  99.7030   7.137 1.57e-10 ***
## Age_EEG_yr_c   7.3888     1.0696 102.4256   6.908 4.26e-10 ***
## PW_c:SES_c   -14.2028    19.4599 146.8086  -0.730   0.4666    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) PW_c   SES_c  A_EEG_
## PW_c         0.032                     
## SES_c        0.026  0.035              
## Ag_EEG_yr_c -0.005  0.024  0.059       
## PW_c:SES_c   0.063  0.184 -0.154 -0.040

summary(moderate_EDICT_SES)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: EDICT ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID)
##    Data: data_income
## 
## REML criterion at convergence: 1059.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.3922 -0.4334 -0.1127  0.4956  2.7039 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  FID      (Intercept) 17.801   4.219   
##  Residual              9.907   3.148   
## Number of obs: 182, groups:  FID, 102
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)    7.6036     0.4836  98.0362  15.722  < 2e-16 ***
## PW_c          31.7582    13.7976 171.6855   2.302   0.0226 *  
## SES_c          2.0779     0.3277 100.5440   6.340 6.58e-09 ***
## Age_EEG_yr_c   3.5507     0.4873 102.6078   7.286 6.76e-11 ***
## PW_c:SES_c     1.6687     9.6785 158.2115   0.172   0.8633    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) PW_c   SES_c  A_EEG_
## PW_c         0.034                     
## SES_c        0.022  0.038              
## Ag_EEG_yr_c -0.010  0.022  0.060       
## PW_c:SES_c   0.063  0.144 -0.164 -0.041

# Perform the likelihood ratio test
comparison_CWR_SES <- anova(model_CWR, moderate_CWR_SES)

## refitting model(s) with ML (instead of REML)

# Print the comparison results
print(comparison_CWR_SES)

## Data: data_income
## Models:
## model_CWR: CWR ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID)
## moderate_CWR_SES: CWR ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID)
##                  npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## model_CWR           6 1658.1 1677.3 -823.06   1646.1                     
## moderate_CWR_SES    7 1659.8 1682.2 -822.89   1645.8 0.3336  1     0.5636

# Perform the likelihood ratio test
comparison_CDICT_SES <- anova(model_CDICT, moderate_CDICT_SES)

## refitting model(s) with ML (instead of REML)

# Print the comparison results
print(comparison_CDICT_SES)

## Data: data_income
## Models:
## model_CDICT: CDICT ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID)
## moderate_CDICT_SES: CDICT ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID)
##                    npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## model_CDICT           6 1218.8 1238.1 -603.43   1206.8                     
## moderate_CDICT_SES    7 1220.2 1242.7 -603.12   1206.2 0.6058  1     0.4364

# Perform the likelihood ratio test
comparison_ARITH_SES <- anova(model_ARITH, moderate_ARITH_SES)

## refitting model(s) with ML (instead of REML)

# Print the comparison results
print(comparison_ARITH_SES)

## Data: data_income
## Models:
## model_ARITH: ARITH ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID)
## moderate_ARITH_SES: ARITH ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID)
##                    npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## model_ARITH           6 960.71 979.96 -474.35   948.71                     
## moderate_ARITH_SES    7 962.61 985.08 -474.31   948.61 0.0957  1     0.7571

# Perform the likelihood ratio test
comparison_EWR_SES <- anova(model_EWR, moderate_EWR_SES)

## refitting model(s) with ML (instead of REML)

# Print the comparison results
print(comparison_EWR_SES)

## Data: data_income
## Models:
## model_EWR: EWR ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID)
## moderate_EWR_SES: EWR ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID)
##                  npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## model_EWR           6 1359.1 1378.4 -673.57   1347.1                     
## moderate_EWR_SES    7 1360.6 1383.1 -673.32   1346.6 0.5022  1     0.4785

# Perform the likelihood ratio test
comparison_EDICT_SES <- anova(model_EDICT, moderate_EDICT_SES)

## refitting model(s) with ML (instead of REML)

# Print the comparison results
print(comparison_EDICT_SES)

## Data: data_income
## Models:
## model_EDICT: EDICT ~ PW_c + Age_EEG_yr_c + SES_c + (1 | FID)
## moderate_EDICT_SES: EDICT ~ PW_c * SES_c + Age_EEG_yr_c + (1 | FID)
##                    npar  AIC    BIC  logLik deviance Chisq Df Pr(>Chisq)
## model_EDICT           6 1085 1104.2 -536.50     1073                    
## moderate_EDICT_SES    7 1087 1109.4 -536.48     1073 0.036  1     0.8494

{r get CF, exponent, offset data} # data <- read.csv("./data/EEG_RS_lanExp.csv") # # # Select the relevant columns and drop rows with NA values # data_selected <- data %>% # mutate(Age_EEG_yr = Age_EEG / 12) %>% # convert Age_EEG from months to years # # keep unique EEGID values # distinct(EEGID, .keep_all = TRUE) %>% # dplyr::select(CWR, CDICT, ARITH, EWR, EDICT, CF, PW, offset, exponent, CMOI, EMOI, FatherEducation, MotherEducation, Income, Age_EEG_yr, FID, EEGID) # # # Replace the original columns with the imputed data # data_selected$CMOI <- completed_data$CMOI # data_selected$EMOI <- completed_data$EMOI # data_selected$FatherEducation <- completed_data$FatherEducation # data_selected$MotherEducation <- completed_data$MotherEducation # data_selected$Income <- completed_data$Income # # # Select the relevant columns and drop rows with NA values # education_income_data <- data_selected %>% # select(EEGID, FatherEducation, MotherEducation, Income) %>% # drop_na() # # # Merge the PCA scores back into the original data # data_income <- data_selected %>% # left_join(pca_scores, by = "EEGID") # # # output the data_income to csv # write.csv(data_income, file = "./data/EEG_RS_lanExp_income_imputated_all_v3.csv", row.names = FALSE) #

# read CRF dataset
data_parent_raw <- read_excel("./data/CRF_ADATA_20230620.xlsx")

## Warning: Expecting logical in BQM1490 / R1490C1807: got '曾有懶眼需遮蓋眼睛'

## Warning: Expecting logical in BQM1491 / R1491C1807: got '曾有懶眼需遮蓋眼睛'

## Warning: Expecting logical in BQM1492 / R1492C1807: got '曾有懶眼需遮蓋眼睛'

## Warning: Expecting logical in BQM1493 / R1493C1807: got '曾有懶眼需遮蓋眼睛'

## Warning: Expecting logical in BQM1498 / R1498C1807: got
## '兩眼有白內障已動手術換晶片'

## Warning: Expecting logical in BQM1499 / R1499C1807: got
## '兩眼有白內障已動手術換晶片'

## Warning: Expecting logical in BQM1500 / R1500C1807: got
## '兩眼有白內障已動手術換晶片'

## Warning: Expecting logical in BQM1501 / R1501C1807: got
## '兩眼有白內障已動手術換晶片'

## Warning: Expecting logical in BQM1733 / R1733C1807: got
## '右眼有外斜視問題但幸運地未影響他的立體感建立閱讀距離視力正常'

## Warning: Expecting logical in BQM1734 / R1734C1807: got
## '右眼有外斜視問題但幸運地未影響他的立體感建立閱讀距離視力正常'

## Warning: Expecting logical in BQM1735 / R1735C1807: got
## '右眼有外斜視問題但幸運地未影響他的立體感建立閱讀距離視力正常'

data_parent_s <- data_parent_raw %>%
  dplyr::select(SubjectID, FID, EEGID,
                MSWR, MCDICT, MARITH,
                FSWR, FCDICT, FARITH, LearningProblem, LP1, LP2, LP3, LP4, LP5) %>%
  dplyr::filter(!(rowSums(is.na(select(., MSWR, MCDICT, MARITH, FSWR, FCDICT, FARITH))) == 6)) %>%
  dplyr::mutate(
    PSWR = case_when(
      is.na(MSWR) & is.na(FSWR) ~ NA_real_,  # If both are NA, PSWR is NA
      is.na(MSWR) ~ FSWR,                   # If MSWR is NA, PSWR is FSWR
      is.na(FSWR) ~ MSWR,                   # If FSWR is NA, PSWR is MSWR
      TRUE ~ (MSWR + FSWR) / 2              # If both have values, PSWR is the average
    ),
    PCDICT = case_when(
      is.na(MCDICT) & is.na(FCDICT) ~ NA_real_,
      is.na(MCDICT) ~ FCDICT,
      is.na(FCDICT) ~ MCDICT,
      TRUE ~ (MCDICT + FCDICT) / 2
    ),
    PARITH = case_when(
      is.na(MARITH) & is.na(FARITH) ~ NA_real_,
      is.na(MARITH) ~ FARITH,
      is.na(FARITH) ~ MARITH,
      TRUE ~ (MARITH + FARITH) / 2
    )
  ) %>%
  dplyr::mutate(
    PSWR_z = as.numeric(scale(PSWR)),  # Calculate z-score for PSWR
    PCDICT_z = as.numeric(scale(PCDICT)),  # Calculate z-score for PCDICT
    PARITH_z = as.numeric(scale(PARITH))  # Calculate z-score for PARITH
  )

# Summarize the mean and SD for PSWR, PCDICT, and PARITH
summary_stats <- data_parent_s %>%
  dplyr::summarize(
    PSWR_mean = mean(PSWR, na.rm = TRUE),
    PSWR_sd = sd(PSWR, na.rm = TRUE),
    PCDICT_mean = mean(PCDICT, na.rm = TRUE),
    PCDICT_sd = sd(PCDICT, na.rm = TRUE),
    PARITH_mean = mean(PARITH, na.rm = TRUE),
    PARITH_sd = sd(PARITH, na.rm = TRUE)
  )

# Create a subset where at least one of the z-scores is lower than -1
risk_data <- data_parent_s %>%
  dplyr::filter(
    PSWR_z < -1 | PCDICT_z < -1 | PARITH_z < -1
  ) %>%
  dplyr::select(FID, EEGID, PSWR_z, PCDICT_z, PARITH_z)

{r select risk child and do N-P test} # # read data from the tidy data # data_child <- read.csv("./data/model_data_180_v3.csv") # # data_child_c <- data_child %>% # dplyr::mutate( # CF_z = as.numeric(scale(CF)), # PW_z = as.numeric(scale(PW)), # exponent_z = as.numeric(scale(exponent)), # offset_z = as.numeric(scale(offset)) # ) # # # Function to perform regression and extract residuals # regress_and_add_residuals <- function(data, dependent_var, independent_var) { # # Perform the regression # model <- lm(reformulate(independent_var, response = dependent_var), data = data) # # # Extract the residuals and add them as a new column # residuals_name <- paste0(dependent_var, "_r") # # data <- data %>% # dplyr::mutate(!!residuals_name := residuals(model)) # # return(data) # } # # # List of dependent variables # dependent_vars <- c("CF_z", "PW_z") # # # Apply the function to each dependent variable # for (var in dependent_vars) { # data_child_c <- regress_and_add_residuals(data_child_c, var, "Age_EEG_yr") # } # # # Select risk child from child_data where EEGID and FID match with risk_data # risk_child <- data_child_c %>% # dplyr::inner_join(risk_data, by = c("EEGID", "FID")) # # # List of variables to plot # variables <- c("CF_z_r", "PW_z_r") # , "exponent_z_r", "offset_z_r" # # # exclude risk_child data from original dataset # whitney_data <- data_child_c %>% # dplyr::anti_join(risk_child, by = c("EEGID", "FID")) # # # Prepare combined data # combined_data <- bind_rows( # whitney_data %>% # select(EEGID, FID, CF_z_r, PW_z_r) %>% # pivot_longer(cols = c(CF_z_r, PW_z_r), # names_to = "Variable", # values_to = "Value") %>% # mutate(Group = "Normal"), # # risk_child %>% # select(EEGID, FID, CF_z_r, PW_z_r) %>% # pivot_longer(cols = c(CF_z_r, PW_z_r), # names_to = "Variable", # values_to = "Value") %>% # mutate(Group = "Risk") # ) # # # Create the combined plot # p1 <- ggplot(combined_data, aes(x = Value)) + # # Density plots # geom_density(aes(fill = Group), alpha = 0.2, adjust = 1.5) + # # # Faceting # facet_grid(rows = vars(Group), # cols = vars(Variable), # scales = "free_y", # switch = "y") + # # # Color scale # scale_fill_manual(values = c("Data" = "#ee9d5c", "Risk" = "#ff3500")) + # # # Theme adjustments # theme_bw() + # theme( # strip.placement = "outside", # strip.background = element_blank(), # panel.spacing = unit(0.5, "lines"), # axis.text.x = element_text(size = 12), # axis.title = element_text(size = 14), # plot.title = element_text(size = 16, face = "bold"), # legend.position = "none" # ) + # # # Labels # labs(title = "Combined Distribution of Variables", # x = "Z-score Values", # y = "Density") # # p1 # # # Function to perform the Mann-Whitney U test and return the results # perform_mann_whitney_test <- function(var, data1, data2) { # test_result <- wilcox.test(data1[[var]], data2[[var]], alternative = "two.sided", exact = FALSE) # return(test_result) # } # # # Apply the function to each variable in the list # test_results <- lapply(variables, function(var) { # perform_mann_whitney_test(var, whitney_data, risk_child) # }) # # # Print the test results for each variable # for (i in seq_along(variables)) { # cat("Mann-Whitney U test results for", variables[i], ":\n") # print(test_results[[i]]) # cat("\n") # } #

# read data from the tidy data
data_child <- read.csv("./data/model_data_180_v5.csv") %>% 
  drop_na()

data_child_c <- data_child %>%
  dplyr::mutate(
    CF_z = as.numeric(scale(CF)),
    PW_z = as.numeric(scale(PW)),
    exponent_z = as.numeric(scale(exponent)),
    CDRAN_z = as.numeric(scale(CDRAN))
  )

# Function to perform regression and extract residuals
regress_and_add_residuals <- function(data, dependent_var, independent_var) {
  # Perform the regression
  model <- lm(reformulate(independent_var, response = dependent_var), data = data)
  
  # Extract the residuals and add them as a new column
  residuals_name <- paste0(dependent_var, "_r")
  
  data <- data %>%
    dplyr::mutate(!!residuals_name := residuals(model))
  
  return(data)
}

# List of dependent variables
dependent_vars <- c("CDRAN_z", "CF_z", "PW_z")

# Apply the function to each dependent variable
for (var in dependent_vars) {
  data_child_c <- regress_and_add_residuals(data_child_c, var, "Age_EEG_yr")
}

# Select risk child from child_data where EEGID and FID match with risk_data
risk_child <- data_child_c %>%
  dplyr::inner_join(risk_data, by = c("EEGID", "FID"))

# List of variables to plot
variables <- c("CDRAN_z_r", "CF_z_r", "PW_z_r") # , "exponent_z_r", "offset_z_r"

# exclude risk_child data from original dataset
whitney_data <- data_child_c %>%
  dplyr::anti_join(risk_child, by = c("EEGID", "FID"))

# Prepare combined data
combined_data <- bind_rows(
  whitney_data %>% 
    select(EEGID, FID, CF_z_r, PW_z_r, CDRAN_z_r) %>% 
    pivot_longer(cols = c(CF_z_r, PW_z_r, CDRAN_z_r), 
                 names_to = "Variable", 
                 values_to = "Value") %>% 
    mutate(Group = "Normal"),
  
  risk_child %>% 
    select(EEGID, FID, CF_z_r, PW_z_r, CDRAN_z_r) %>% 
    pivot_longer(cols = c(CF_z_r, PW_z_r, CDRAN_z_r), 
                 names_to = "Variable", 
                 values_to = "Value") %>% 
    mutate(Group = "Risk")
)

# Create the combined plot
p1 <- ggplot(combined_data, aes(x = Value)) +
  # Density plots
  geom_density(aes(fill = Group), alpha = 0.2, adjust = 1.5) +
  
  # Faceting
  facet_grid(rows = vars(Group),
             cols = vars(Variable),
             scales = "free_y",
             switch = "y") +
  
  # Color scale
  scale_fill_manual(values = c("Data" = "#ee9d5c", "Risk" = "#ff3500")) +
  
  # Theme adjustments
  theme_bw() +
  theme(
    strip.placement = "outside",
    strip.background = element_blank(),
    panel.spacing = unit(0.5, "lines"),
    axis.text.x = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold"),
    legend.position = "none"
  ) +
  
  # Labels
  labs(title = "Combined Distribution of Variables",
       x = "Z-score Values",
       y = "Density")

p1

# Function to perform the Mann-Whitney U test and return the results
perform_mann_whitney_test <- function(var, data1, data2) {
  test_result <- wilcox.test(data1[[var]], data2[[var]], alternative = "two.sided", exact = FALSE)
  return(test_result)
}

# Apply the function to each variable in the list
test_results <- lapply(variables, function(var) {
  perform_mann_whitney_test(var, whitney_data, risk_child)
})

# Print the test results for each variable
for (i in seq_along(variables)) {
  cat("Mann-Whitney U test results for", variables[i], ":\n")
  print(test_results[[i]])
  cat("\n")
}

## Mann-Whitney U test results for CDRAN_z_r :
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  data1[[var]] and data2[[var]]
## W = 647, p-value = 0.8225
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## Mann-Whitney U test results for CF_z_r :
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  data1[[var]] and data2[[var]]
## W = 416, p-value = 0.08489
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## Mann-Whitney U test results for PW_z_r :
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  data1[[var]] and data2[[var]]
## W = 841, p-value = 0.06192
## alternative hypothesis: true location shift is not equal to 0

ARWA analysis

Quentin Lam

2024-12-11