GRF results

R Markdown

##############
# Q1: Use parents to predict child group
###############

## Results: CPA_z        -0.3701     0.1145  -3.233  0.00122 ** 
# Group == 1 (Children with dyslexia) has lower CPA_z than Group == 0 (TD Children)
library(tidyverse)

## Warning: package 'tidyr' was built under R version 4.3.3

## Warning: package 'readr' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(bruceR)

## Warning: package 'bruceR' was built under R version 4.3.3

## 
## bruceR (v2024.6)
## Broadly Useful Convenient and Efficient R functions
## 
## Packages also loaded:
## ✔ data.table ✔ emmeans
## ✔ dplyr      ✔ lmerTest
## ✔ tidyr      ✔ effectsize
## ✔ stringr    ✔ performance
## ✔ ggplot2    ✔ interactions
## 
## Main functions of `bruceR`:
## cc()             Describe()  TTEST()
## add()            Freq()      MANOVA()
## .mean()          Corr()      EMMEANS()
## set.wd()         Alpha()     PROCESS()
## import()         EFA()       model_summary()
## print_table()    CFA()       lavaan_summary()
## 
## For full functionality, please install all dependencies:
## install.packages("bruceR", dep=TRUE)
## 
## Online documentation:
## https://psychbruce.github.io/bruceR
## 
## To use this package in publications, please cite:
## Bao, H.-W.-S. (2024). bruceR: Broadly useful convenient and efficient R functions (Version 2024.6) [Computer software]. https://CRAN.R-project.org/package=bruceR

set.wd()

## ✔ Set working directory to "C:/Users/psyuser/Desktop/GRF Analysis"

Q1_Parent_data <- read_csv("Parent_Dataset_full.csv")

## Rows: 327 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): PID, CID
## dbl (22): Group, LD, RDN, RPN, CPM, CPA, CMA, CRF, LD_z, CRF_z, RDN_z, RPN_z...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Ensure the Group variable is a factor
Q1_Parent_data$Group <- as.factor(Q1_Parent_data$Group)

# Print the count of each label in the Group variable
group_counts <- table(Q1_Parent_data$Group)

print(group_counts)

## 
##   0   1 
## 115 212

# 0   1
# 115 212

# Fit the logistic regression model
Q1_m1_full <- glm(Group ~ CPA_m + CPM_m + CMA_m + LD_m + CRF_m + RDN_m + RPN_m, data = Q1_Parent_data, family = binomial)

# Summarize the model
summary(Q1_m1_full)

## 
## Call:
## glm(formula = Group ~ CPA_m + CPM_m + CMA_m + LD_m + CRF_m + 
##     RDN_m + RPN_m, family = binomial, data = Q1_Parent_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.139062   2.000969  -1.069 0.285064    
## CPA_m       -0.048570   0.013149  -3.694 0.000221 ***
## CPM_m        0.166397   0.129173   1.288 0.197684    
## CMA_m        0.016473   0.047122   0.350 0.726646    
## LD_m         0.008510   0.040736   0.209 0.834529    
## CRF_m        0.004150   0.004172   0.995 0.319798    
## RDN_m        0.084974   0.057322   1.482 0.138235    
## RPN_m       -0.003009   0.028766  -0.105 0.916689    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 409.69  on 318  degrees of freedom
## Residual deviance: 392.34  on 311  degrees of freedom
##   (8 observations deleted due to missingness)
## AIC: 408.34
## 
## Number of Fisher Scoring iterations: 4

Q1_m1_final <- glm(Group ~ CPA_m, data = Q1_Parent_data, family = binomial)

# Summarize the model
summary(Q1_m1_final)

## 
## Call:
## glm(formula = Group ~ CPA_m, family = binomial, data = Q1_Parent_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.93411    0.15125   6.176 6.58e-10 ***
## CPA_m       -0.04255    0.01229  -3.463 0.000535 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 422.01  on 325  degrees of freedom
## Residual deviance: 409.80  on 324  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 413.8
## 
## Number of Fisher Scoring iterations: 4

##########################
# Q2: Parents severity predict children severity
##########################
Q2_parent_data <- read_csv("Dys_parent_data.csv")

## Rows: 98 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): PID, CID
## dbl (28): LD, RDN, RPN, CPM, CPA, CMA, CRF, LD_z, CRF_z, RDN_z, RPN_z, CPM_z...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Q2_child_data <- read_csv("Dys_child_data.csv")

## Rows: 49 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): CID
## dbl (22): Grade, LD, CRF, RDN, RPN, CPM, CPA, CMA, LD_z, CRF_z, RDN_z, RPN_z...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Select the relevant columns from the parent data
parent_groups <- Q2_parent_data %>%
  select(CID, LD_group_c, CRF_group_c, RDN_group_c, RPN_group_c, 
         CPM_group_c, CPA_group_c, CMA_group_c)

# Merge the parent group variables with the child data by CID
Q2_child_data <- Q2_child_data %>%
  left_join(parent_groups, by = "CID")

# Print the count number of each label in LD_group_c, CRF_group_c, RDN_group_c, RPN_group_c, CPM_group_c, CPA_group_c, CMA_group_c
group_counts <- Q2_child_data %>%
  select(LD_group_c, CRF_group_c, RDN_group_c, RPN_group_c, CPM_group_c, CPA_group_c, CMA_group_c) %>%
  map(table)

print(group_counts)

## $LD_group_c
## 
##  0  1  2 
## 24 48 16 
## 
## $CRF_group_c
## 
##  0  1  2 
## 38 48 12 
## 
## $RDN_group_c
## 
##  0  1  2 
## 48 42  8 
## 
## $RPN_group_c
## 
##  0  1  2 
## 46 46  6 
## 
## $CPM_group_c
## 
##  0  1  2 
## 70 22  6 
## 
## $CPA_group_c
## 
##  0  1  2 
## 58 30 10 
## 
## $CMA_group_c
## 
##  0  1  2 
## 26 40 14

library(cluster)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

cluster_data <- Q2_child_data %>% select(RDN_z, CPA_z, CPM_z, CMA_z, CRF_z, RPN_z, LD_z)

# Determine the optimal number of clusters using the elbow method
wss_plot <- fviz_nbclust(cluster_data, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2) +
  labs(subtitle = "Elbow method")

## Registered S3 methods overwritten by 'broom':
##   method            from  
##   tidy.glht         jtools
##   tidy.summary.glht jtools

# Determine the optimal number of clusters using the silhouette method
shil_plot <- fviz_nbclust(cluster_data, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

print(wss_plot)

print(shil_plot)

# Perform K-means clustering with 2 clusters for cluster_data
set.seed(123)  # Set seed for reproducibility

kmeans_clusters <- kmeans(cluster_data, centers = 2, nstart = 25)

# Swap the cluster assignments
kmeans_clusters$cluster <- ifelse(kmeans_clusters$cluster == 1, 2, 1)

# Add the cluster assignments to the original data
cluster_data <- cluster_data %>%
  mutate(cluster = kmeans_clusters$cluster)

# Create a new variable for shifted x-axis values
cluster_data <- cluster_data %>%
  mutate(cluster_shifted = ifelse(cluster == 1, 1.3, 1.7))

# Create a list to store the plots
plots <- list()

# Loop through each column in cluster_data (excluding the cluster column)
for (col in names(cluster_data)[-c(ncol(cluster_data), ncol(cluster_data) - 1)]) {
   p <- ggplot(cluster_data) +
    geom_jitter(aes_string(x = "cluster", y = col, color = "as.factor(cluster)"), width = 0.1, height = 0, show.legend = FALSE) +
    stat_summary(aes_string(x = "cluster_shifted", y = col), fun = mean, geom = "point", shape = 18, size = 3, color = "black") +
    stat_summary(aes_string(x = "cluster_shifted", y = col), fun.data = mean_se, geom = "errorbar", width = 0.2, color = "black")+
    labs(x = col) +
    theme_minimal() +
    theme(
      axis.text.y = element_blank(),
      axis.title.y = element_blank(),
      axis.text.x = element_blank(),
      panel.grid.major = element_line(color = NA),
      panel.grid.minor = element_line(color = NA)
    )
  plots[[col]] <- p
}

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Create the title
title <- grid::textGrob("Children Clustering Profiles", gp = grid::gpar(fontsize = 20, fontface = "bold"))

# Arrange the title and the plots in a single row
pb <- grid.arrange(title, do.call(arrangeGrob, c(plots, ncol = length(plots))), ncol = 1, heights = c(0.1, 1))

ggsave("child_cluster_profiles.png", plot = pb,  width = 15, height = 10)

library(cluster)
library(factoextra)
library(gridExtra)

cluster_data_par <- Q2_parent_data %>% select(RDN_z, CPA_z, CPM_z, CMA_z, CRF_z, RPN_z, LD_z) %>% 
  na.omit()

# Determine the optimal number of clusters using the elbow method
wss_plot <- fviz_nbclust(cluster_data_par, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2) +
  labs(subtitle = "Elbow method")

# Determine the optimal number of clusters using the silhouette method
shil_plot <- fviz_nbclust(cluster_data_par, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

print(wss_plot)

print(shil_plot)

# Perform K-means clustering with 2 clusters for cluster_data
set.seed(123)  # Set seed for reproducibility

kmeans_clusters <- kmeans(cluster_data_par, centers = 2, nstart = 25)

# Add the cluster assignments to the original data
cluster_data_par <- cluster_data_par %>%
  mutate(cluster = kmeans_clusters$cluster)

# Create a new variable for shifted x-axis values
cluster_data_par <- cluster_data_par %>%
  mutate(cluster_shifted = ifelse(cluster == 1, 1.3, 1.7))

# Create a list to store the plots
plots <- list()

# Loop through each column in cluster_data (excluding the cluster column)
for (col in names(cluster_data_par)[-c(ncol(cluster_data_par), ncol(cluster_data_par) - 1)]) {
   p <- ggplot(cluster_data_par) +
    geom_jitter(aes_string(x = "cluster", y = col, color = "as.factor(cluster)"), width = 0.1, height = 0, show.legend = FALSE) +
    stat_summary(aes_string(x = "cluster_shifted", y = col), fun = mean, geom = "point", shape = 18, size = 3, color = "black") +
    stat_summary(aes_string(x = "cluster_shifted", y = col), fun.data = mean_se, geom = "errorbar", width = 0.2, color = "black")+
    labs(x = col) +
    theme_minimal() +
    theme(
      axis.text.y = element_blank(),
      axis.title.y = element_blank(),
      axis.text.x = element_blank(),
      panel.grid.major = element_line(color = NA),
      panel.grid.minor = element_line(color = NA)
    )
  plots[[col]] <- p
}

# Create the title
title <- grid::textGrob("Parent Clustering Profiles", gp = grid::gpar(fontsize = 20, fontface = "bold"))

# Arrange the title and the plots in a single row
pa <- grid.arrange(title, do.call(arrangeGrob, c(plots, ncol = length(plots))), ncol = 1, heights = c(0.1, 1))

ggsave("parent_cluster_profiles.png", plot = pa, width = 15, height = 10)

# Check clustering accuracy

# Add CID column to cluster_data_par
cluster_data_par <- Q2_parent_data %>%
  select(CID, RDN_z, CPA_z, CPM_z, CMA_z, CRF_z, RPN_z, LD_z) %>%
  na.omit() %>%
  bind_cols(cluster_data_par)

## New names:
## • `RDN_z` -> `RDN_z...2`
## • `CPA_z` -> `CPA_z...3`
## • `CPM_z` -> `CPM_z...4`
## • `CMA_z` -> `CMA_z...5`
## • `CRF_z` -> `CRF_z...6`
## • `RPN_z` -> `RPN_z...7`
## • `LD_z` -> `LD_z...8`
## • `RDN_z` -> `RDN_z...9`
## • `CPA_z` -> `CPA_z...10`
## • `CPM_z` -> `CPM_z...11`
## • `CMA_z` -> `CMA_z...12`
## • `CRF_z` -> `CRF_z...13`
## • `RPN_z` -> `RPN_z...14`
## • `LD_z` -> `LD_z...15`

# Add CID column to cluster_data
cluster_data <- Q2_child_data %>%
  select(CID, RDN_z, CPA_z, CPM_z, CMA_z, CRF_z, RPN_z, LD_z) %>%
  na.omit() %>%
  bind_cols(cluster_data)

## New names:
## • `RDN_z` -> `RDN_z...2`
## • `CPA_z` -> `CPA_z...3`
## • `CPM_z` -> `CPM_z...4`
## • `CMA_z` -> `CMA_z...5`
## • `CRF_z` -> `CRF_z...6`
## • `RPN_z` -> `RPN_z...7`
## • `LD_z` -> `LD_z...8`
## • `RDN_z` -> `RDN_z...9`
## • `CPA_z` -> `CPA_z...10`
## • `CPM_z` -> `CPM_z...11`
## • `CMA_z` -> `CMA_z...12`
## • `CRF_z` -> `CRF_z...13`
## • `RPN_z` -> `RPN_z...14`
## • `LD_z` -> `LD_z...15`

merged_data <- cluster_data %>%
  select(CID, child_cluster = cluster) %>%
  inner_join(cluster_data_par %>% select(CID, parent_cluster = cluster), by = "CID")

## Warning in inner_join(., cluster_data_par %>% select(CID, parent_cluster = cluster), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 5 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

accuracy <- mean(merged_data$child_cluster == merged_data$parent_cluster)
print(paste("Clustering accuracy:", accuracy))

## [1] "Clustering accuracy: 0.584269662921348"

GRF results

Quentin Lam

2024-11-18

R Markdown