R Markdown
##############
# Q1: Use parents to predict child group
###############
## Results: CPA_z -0.3701 0.1145 -3.233 0.00122 **
# Group == 1 (Children with dyslexia) has lower CPA_z than Group == 0 (TD Children)
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(bruceR)
## Warning: package 'bruceR' was built under R version 4.3.3
##
## bruceR (v2024.6)
## Broadly Useful Convenient and Efficient R functions
##
## Packages also loaded:
## ✔ data.table ✔ emmeans
## ✔ dplyr ✔ lmerTest
## ✔ tidyr ✔ effectsize
## ✔ stringr ✔ performance
## ✔ ggplot2 ✔ interactions
##
## Main functions of `bruceR`:
## cc() Describe() TTEST()
## add() Freq() MANOVA()
## .mean() Corr() EMMEANS()
## set.wd() Alpha() PROCESS()
## import() EFA() model_summary()
## print_table() CFA() lavaan_summary()
##
## For full functionality, please install all dependencies:
## install.packages("bruceR", dep=TRUE)
##
## Online documentation:
## https://psychbruce.github.io/bruceR
##
## To use this package in publications, please cite:
## Bao, H.-W.-S. (2024). bruceR: Broadly useful convenient and efficient R functions (Version 2024.6) [Computer software]. https://CRAN.R-project.org/package=bruceR
set.wd()
## ✔ Set working directory to "C:/Users/psyuser/Desktop/GRF Analysis"
Q1_Parent_data <- read_csv("Parent_Dataset_full.csv")
## Rows: 327 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): PID, CID
## dbl (22): Group, LD, RDN, RPN, CPM, CPA, CMA, CRF, LD_z, CRF_z, RDN_z, RPN_z...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Ensure the Group variable is a factor
Q1_Parent_data$Group <- as.factor(Q1_Parent_data$Group)
# Print the count of each label in the Group variable
group_counts <- table(Q1_Parent_data$Group)
print(group_counts)
##
## 0 1
## 115 212
# 0 1
# 115 212
# Fit the logistic regression model
Q1_m1_full <- glm(Group ~ CPA_m + CPM_m + CMA_m + LD_m + CRF_m + RDN_m + RPN_m, data = Q1_Parent_data, family = binomial)
# Summarize the model
summary(Q1_m1_full)
##
## Call:
## glm(formula = Group ~ CPA_m + CPM_m + CMA_m + LD_m + CRF_m +
## RDN_m + RPN_m, family = binomial, data = Q1_Parent_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.139062 2.000969 -1.069 0.285064
## CPA_m -0.048570 0.013149 -3.694 0.000221 ***
## CPM_m 0.166397 0.129173 1.288 0.197684
## CMA_m 0.016473 0.047122 0.350 0.726646
## LD_m 0.008510 0.040736 0.209 0.834529
## CRF_m 0.004150 0.004172 0.995 0.319798
## RDN_m 0.084974 0.057322 1.482 0.138235
## RPN_m -0.003009 0.028766 -0.105 0.916689
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 409.69 on 318 degrees of freedom
## Residual deviance: 392.34 on 311 degrees of freedom
## (8 observations deleted due to missingness)
## AIC: 408.34
##
## Number of Fisher Scoring iterations: 4
Q1_m1_final <- glm(Group ~ CPA_m, data = Q1_Parent_data, family = binomial)
# Summarize the model
summary(Q1_m1_final)
##
## Call:
## glm(formula = Group ~ CPA_m, family = binomial, data = Q1_Parent_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.93411 0.15125 6.176 6.58e-10 ***
## CPA_m -0.04255 0.01229 -3.463 0.000535 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 422.01 on 325 degrees of freedom
## Residual deviance: 409.80 on 324 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 413.8
##
## Number of Fisher Scoring iterations: 4
##########################
# Q2: Parents severity predict children severity
##########################
Q2_parent_data <- read_csv("Dys_parent_data.csv")
## Rows: 98 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): PID, CID
## dbl (28): LD, RDN, RPN, CPM, CPA, CMA, CRF, LD_z, CRF_z, RDN_z, RPN_z, CPM_z...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Q2_child_data <- read_csv("Dys_child_data.csv")
## Rows: 49 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): CID
## dbl (22): Grade, LD, CRF, RDN, RPN, CPM, CPA, CMA, LD_z, CRF_z, RDN_z, RPN_z...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Select the relevant columns from the parent data
parent_groups <- Q2_parent_data %>%
select(CID, LD_group_c, CRF_group_c, RDN_group_c, RPN_group_c,
CPM_group_c, CPA_group_c, CMA_group_c)
# Merge the parent group variables with the child data by CID
Q2_child_data <- Q2_child_data %>%
left_join(parent_groups, by = "CID")
# Print the count number of each label in LD_group_c, CRF_group_c, RDN_group_c, RPN_group_c, CPM_group_c, CPA_group_c, CMA_group_c
group_counts <- Q2_child_data %>%
select(LD_group_c, CRF_group_c, RDN_group_c, RPN_group_c, CPM_group_c, CPA_group_c, CMA_group_c) %>%
map(table)
print(group_counts)
## $LD_group_c
##
## 0 1 2
## 24 48 16
##
## $CRF_group_c
##
## 0 1 2
## 38 48 12
##
## $RDN_group_c
##
## 0 1 2
## 48 42 8
##
## $RPN_group_c
##
## 0 1 2
## 46 46 6
##
## $CPM_group_c
##
## 0 1 2
## 70 22 6
##
## $CPA_group_c
##
## 0 1 2
## 58 30 10
##
## $CMA_group_c
##
## 0 1 2
## 26 40 14
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
cluster_data <- Q2_child_data %>% select(RDN_z, CPA_z, CPM_z, CMA_z, CRF_z, RPN_z, LD_z)
# Determine the optimal number of clusters using the elbow method
wss_plot <- fviz_nbclust(cluster_data, kmeans, method = "wss") +
geom_vline(xintercept = 3, linetype = 2) +
labs(subtitle = "Elbow method")
## Registered S3 methods overwritten by 'broom':
## method from
## tidy.glht jtools
## tidy.summary.glht jtools
# Determine the optimal number of clusters using the silhouette method
shil_plot <- fviz_nbclust(cluster_data, kmeans, method = "silhouette") +
labs(subtitle = "Silhouette method")
print(wss_plot)

print(shil_plot)

# Perform K-means clustering with 2 clusters for cluster_data
set.seed(123) # Set seed for reproducibility
kmeans_clusters <- kmeans(cluster_data, centers = 2, nstart = 25)
# Swap the cluster assignments
kmeans_clusters$cluster <- ifelse(kmeans_clusters$cluster == 1, 2, 1)
# Add the cluster assignments to the original data
cluster_data <- cluster_data %>%
mutate(cluster = kmeans_clusters$cluster)
# Create a new variable for shifted x-axis values
cluster_data <- cluster_data %>%
mutate(cluster_shifted = ifelse(cluster == 1, 1.3, 1.7))
# Create a list to store the plots
plots <- list()
# Loop through each column in cluster_data (excluding the cluster column)
for (col in names(cluster_data)[-c(ncol(cluster_data), ncol(cluster_data) - 1)]) {
p <- ggplot(cluster_data) +
geom_jitter(aes_string(x = "cluster", y = col, color = "as.factor(cluster)"), width = 0.1, height = 0, show.legend = FALSE) +
stat_summary(aes_string(x = "cluster_shifted", y = col), fun = mean, geom = "point", shape = 18, size = 3, color = "black") +
stat_summary(aes_string(x = "cluster_shifted", y = col), fun.data = mean_se, geom = "errorbar", width = 0.2, color = "black")+
labs(x = col) +
theme_minimal() +
theme(
axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_blank(),
panel.grid.major = element_line(color = NA),
panel.grid.minor = element_line(color = NA)
)
plots[[col]] <- p
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Create the title
title <- grid::textGrob("Children Clustering Profiles", gp = grid::gpar(fontsize = 20, fontface = "bold"))
# Arrange the title and the plots in a single row
pb <- grid.arrange(title, do.call(arrangeGrob, c(plots, ncol = length(plots))), ncol = 1, heights = c(0.1, 1))

ggsave("child_cluster_profiles.png", plot = pb, width = 15, height = 10)
library(cluster)
library(factoextra)
library(gridExtra)
cluster_data_par <- Q2_parent_data %>% select(RDN_z, CPA_z, CPM_z, CMA_z, CRF_z, RPN_z, LD_z) %>%
na.omit()
# Determine the optimal number of clusters using the elbow method
wss_plot <- fviz_nbclust(cluster_data_par, kmeans, method = "wss") +
geom_vline(xintercept = 3, linetype = 2) +
labs(subtitle = "Elbow method")
# Determine the optimal number of clusters using the silhouette method
shil_plot <- fviz_nbclust(cluster_data_par, kmeans, method = "silhouette") +
labs(subtitle = "Silhouette method")
print(wss_plot)

print(shil_plot)

# Perform K-means clustering with 2 clusters for cluster_data
set.seed(123) # Set seed for reproducibility
kmeans_clusters <- kmeans(cluster_data_par, centers = 2, nstart = 25)
# Add the cluster assignments to the original data
cluster_data_par <- cluster_data_par %>%
mutate(cluster = kmeans_clusters$cluster)
# Create a new variable for shifted x-axis values
cluster_data_par <- cluster_data_par %>%
mutate(cluster_shifted = ifelse(cluster == 1, 1.3, 1.7))
# Create a list to store the plots
plots <- list()
# Loop through each column in cluster_data (excluding the cluster column)
for (col in names(cluster_data_par)[-c(ncol(cluster_data_par), ncol(cluster_data_par) - 1)]) {
p <- ggplot(cluster_data_par) +
geom_jitter(aes_string(x = "cluster", y = col, color = "as.factor(cluster)"), width = 0.1, height = 0, show.legend = FALSE) +
stat_summary(aes_string(x = "cluster_shifted", y = col), fun = mean, geom = "point", shape = 18, size = 3, color = "black") +
stat_summary(aes_string(x = "cluster_shifted", y = col), fun.data = mean_se, geom = "errorbar", width = 0.2, color = "black")+
labs(x = col) +
theme_minimal() +
theme(
axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_blank(),
panel.grid.major = element_line(color = NA),
panel.grid.minor = element_line(color = NA)
)
plots[[col]] <- p
}
# Create the title
title <- grid::textGrob("Parent Clustering Profiles", gp = grid::gpar(fontsize = 20, fontface = "bold"))
# Arrange the title and the plots in a single row
pa <- grid.arrange(title, do.call(arrangeGrob, c(plots, ncol = length(plots))), ncol = 1, heights = c(0.1, 1))

ggsave("parent_cluster_profiles.png", plot = pa, width = 15, height = 10)
# Check clustering accuracy
# Add CID column to cluster_data_par
cluster_data_par <- Q2_parent_data %>%
select(CID, RDN_z, CPA_z, CPM_z, CMA_z, CRF_z, RPN_z, LD_z) %>%
na.omit() %>%
bind_cols(cluster_data_par)
## New names:
## • `RDN_z` -> `RDN_z...2`
## • `CPA_z` -> `CPA_z...3`
## • `CPM_z` -> `CPM_z...4`
## • `CMA_z` -> `CMA_z...5`
## • `CRF_z` -> `CRF_z...6`
## • `RPN_z` -> `RPN_z...7`
## • `LD_z` -> `LD_z...8`
## • `RDN_z` -> `RDN_z...9`
## • `CPA_z` -> `CPA_z...10`
## • `CPM_z` -> `CPM_z...11`
## • `CMA_z` -> `CMA_z...12`
## • `CRF_z` -> `CRF_z...13`
## • `RPN_z` -> `RPN_z...14`
## • `LD_z` -> `LD_z...15`
# Add CID column to cluster_data
cluster_data <- Q2_child_data %>%
select(CID, RDN_z, CPA_z, CPM_z, CMA_z, CRF_z, RPN_z, LD_z) %>%
na.omit() %>%
bind_cols(cluster_data)
## New names:
## • `RDN_z` -> `RDN_z...2`
## • `CPA_z` -> `CPA_z...3`
## • `CPM_z` -> `CPM_z...4`
## • `CMA_z` -> `CMA_z...5`
## • `CRF_z` -> `CRF_z...6`
## • `RPN_z` -> `RPN_z...7`
## • `LD_z` -> `LD_z...8`
## • `RDN_z` -> `RDN_z...9`
## • `CPA_z` -> `CPA_z...10`
## • `CPM_z` -> `CPM_z...11`
## • `CMA_z` -> `CMA_z...12`
## • `CRF_z` -> `CRF_z...13`
## • `RPN_z` -> `RPN_z...14`
## • `LD_z` -> `LD_z...15`
merged_data <- cluster_data %>%
select(CID, child_cluster = cluster) %>%
inner_join(cluster_data_par %>% select(CID, parent_cluster = cluster), by = "CID")
## Warning in inner_join(., cluster_data_par %>% select(CID, parent_cluster = cluster), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 5 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
accuracy <- mean(merged_data$child_cluster == merged_data$parent_cluster)
print(paste("Clustering accuracy:", accuracy))
## [1] "Clustering accuracy: 0.584269662921348"