Note: I will need to make this into a function that takes the number of subjects and the number of b-values (and their names); a sketch of such a helper follows the outlier block below.
# First, read in the JSON file containing the QC metrics
library(rjson)
library(reshape2)
library(plyr)
library(tidyverse) # data manipulation
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ purrr::compact() masks plyr::compact()
## ✖ dplyr::count() masks plyr::count()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks plyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
library(cluster) # clustering algorithms
library(factoextra) # clustering algorithms & visualization
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(ggplot2)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(readbulk)
library(ggthemr)
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
##
## Attaching package: 'ggpubr'
## The following objects are masked from 'package:ggthemr':
##
## rotate_x_text, rotate_y_text
## The following object is masked from 'package:plyr':
##
## mutate
library(doBy)
library(Rmisc)
## Loading required package: lattice
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.12.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:ggpubr':
##
## rotate
## The following object is masked from 'package:stats':
##
## cutree
library(dendextendRcpp)
## Loading required package: Rcpp
##
## Attaching package: 'dendextendRcpp'
## The following object is masked from 'package:purrr':
##
## is_list
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(readr)
library("NbClust")
ggthemr("chalk")
result <- fromJSON(file = "/projects/janderson/PACTMD/pipelines/dwi/squad/group_db.json")
# Convert the parsed JSON to a flat structure, replacing NULL entries with NA
json_data_frame <- lapply(result, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})
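Before reshaping, it is worth confirming that the flattened QC vectors really have the lengths the hard-coded blocks below assume (285 subjects, with 2 motion, 3 noise, and 4 outlier values per subject). A minimal sanity check under that assumption:
# Assumed lengths: 285 subjects x (2 motion, 3 noise, 4 outlier) values
n_subs <- 285
stopifnot(length(json_data_frame$qc_motion) == n_subs * 2,
          length(json_data_frame$qc_cnr) == n_subs * 3,
          length(json_data_frame$qc_outliers) == n_subs * 4)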
# Also read in a list of the subject IDs, in the same order as the JSON file
sub_ids <- read_csv("/projects/janderson/PACTMD/pipelines/dwi/squad/subs.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_character()
## )
colnames(sub_ids) <- "ID"
# Now to extract the relevant information into separate objects
# Now to look at motion
motion <- as.data.frame(json_data_frame$qc_motion)
colnames(motion) <- "motion"
mot_type <- as.data.frame(rep(1:2, 285))
colnames(mot_type) <- "motion_type"
mot_type$motion_type <- as.factor(mot_type$motion_type)
mot_type$motion_type <- revalue(mot_type$motion_type, c("1" = "abs", "2" = "rel"))
temp_id_mot <- as.data.frame(rep(1:285, each = 2))
colnames(temp_id_mot) <- "temp_id"
motion <- cbind(temp_id_mot, mot_type, motion)
motion_wide <- dcast(motion, temp_id ~ motion_type)
## Using motion as value column: use value.var to override.
# Now to look at noise
snr_cnr <- as.data.frame(json_data_frame$qc_cnr)
colnames(snr_cnr) <- "noise"
noise_type <- as.data.frame(rep(1:3, 285))
colnames(noise_type) <- "noise_type"
noise_type$noise_type <- as.factor(noise_type$noise_type)
noise_type$noise_type <- revalue(noise_type$noise_type, c("1" = "snr", "2" = "cnr1000", "3" = "cnr3000"))
temp_id_noise <- as.data.frame(rep(1:285, each = 3))
colnames(temp_id_noise) <- "temp_id"
noise <- cbind(temp_id_noise, noise_type, snr_cnr)
noise_wide <- dcast(noise, temp_id ~ noise_type)
## Using noise as value column: use value.var to override.
# Now to look at outliers
outliers <- as.data.frame(json_data_frame$qc_outliers)
colnames(outliers) <- "outliers"
outlier_type <- as.data.frame(rep(1:4, 285))
colnames(outlier_type) <- "outlier_type"
outlier_type$outlier_type <- as.factor(outlier_type$outlier_type)
outlier_type$outlier_type <- revalue(outlier_type$outlier_type, c("1" = "total%", "2" = "b1000", "3" = "b3000", "4" = "PE_dir"))
temp_id_outlier <- as.data.frame(rep(1:285, each = 4))
colnames(temp_id_outlier) <- "temp_id"
outlier <- cbind(temp_id_outlier, outlier_type, outliers)
outlier_wide <- dcast(outlier, temp_id ~ outlier_type)
## Using outliers as value column: use value.var to override.
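The motion, noise, and outlier blocks above repeat the same reshape with different labels; per the note at the top, this is the natural candidate for a function. A minimal sketch under that assumption (reshape_qc is a hypothetical name, not from any package, and it presumes each SQUAD vector interleaves its metrics per subject exactly as above):
# Hypothetical helper: reshape one flattened SQUAD QC vector into wide format,
# generalizing the three blocks above.
reshape_qc <- function(values, metric_names, n_subs) {
  n_metrics <- length(metric_names)
  long <- data.frame(
    temp_id = rep(seq_len(n_subs), each = n_metrics),
    type = factor(rep(metric_names, n_subs), levels = metric_names),
    value = as.numeric(values)
  )
  dcast(long, temp_id ~ type, value.var = "value")
}
# The blocks above would then collapse to calls such as:
# motion_wide  <- reshape_qc(json_data_frame$qc_motion, c("abs", "rel"), 285)
# noise_wide   <- reshape_qc(json_data_frame$qc_cnr, c("snr", "cnr1000", "cnr3000"), 285)
# outlier_wide <- reshape_qc(json_data_frame$qc_outliers, c("total%", "b1000", "b3000", "PE_dir"), 285)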
# Now to merge everything
All_data <- Reduce(function(x, y) merge(x, y, all = TRUE), list(outlier_wide, noise_wide, motion_wide))
# Now create logicals (if using absolute thresholds)
# All_data$total_fact <- as.numeric(All_data$`total%` > .2)
# All_data$snr_fact <- as.numeric(All_data$snr < 20)
# All_data$cnr_fact <- as.numeric(All_data$cnr < 1.4)
# All_data$abs_mot_fact <- as.numeric(All_data$abs > 1)
# All_data$rel_mot_fact <- as.numeric(All_data$rel > .4)
# All_data$Weighted_Score_Numeric <- apply(All_data[9:13], 1, sum)
# All_data$Weighted_Score <- cut(x=All_data$Weighted_Score_Numeric, breaks = c(0,2,3,5),include.lowest =TRUE)
# levels(All_data$Weighted_Score) <- c('Pass', 'Caution','Fail')
# Bind the subject IDs to the metrics
All_data <- cbind(sub_ids, All_data)
#write.csv(All_data,"/scratch/janderson/DWI_QC/squad/cutoff_scores_pond.csv")
head(All_data)
### Load in the residuals from dwidenoise and merge them into the data frame
Noise <- read_csv("/projects/janderson/PACTMD/pipelines/dwi/squad/PACtMD_Noise.csv", col_names = TRUE)
## Parsed with column specification:
## cols(
## ID = col_character(),
## Noise = col_double()
## )
All_data <- merge(All_data, Noise, by = "ID")
### Calculate the principal component and clustering scores
data <- All_data[complete.cases(All_data), ] # drop subjects with any missing metric
ID <- data$ID # keep the IDs aside for later
data <- subset(data, select = c("total%", "snr", "cnr1000", "rel", "Noise"))
res.pca <- prcomp(data, scale = TRUE)
fviz_eig(res.pca)
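For the write-up, the scree plot can be complemented with the numeric proportion of variance explained, which base prcomp output already carries:
# Proportion (and cumulative proportion) of variance explained per component
pve <- res.pca$sdev^2 / sum(res.pca$sdev^2)
round(rbind(proportion = pve, cumulative = cumsum(pve)), 3)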
fviz_pca_var(res.pca,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE # Avoid text overlapping
)
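The same contributions can be read off numerically via factoextra's get_pca_var(), which is sometimes easier than judging arrow colours:
# Contribution (%) of each QC metric to the first two components
pca_vars <- get_pca_var(res.pca)
round(pca_vars$contrib[, 1:2], 1)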
# k-means solutions using the component scores from above
# (kmeans() expects raw observations rather than a distance matrix,
# so cluster the PC scores directly)
k2 <- kmeans(res.pca$x, centers = 2, nstart = 25)
k3 <- kmeans(res.pca$x, centers = 3, nstart = 25)
k4 <- kmeans(res.pca$x, centers = 4, nstart = 25)
k5 <- kmeans(res.pca$x, centers = 5, nstart = 25)
k10 <- kmeans(res.pca$x, centers = 10, nstart = 25)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = res.pca$x) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = res.pca$x) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = res.pca$x) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = res.pca$x) + ggtitle("k = 5")
#p10 <- fviz_cluster(k10, geom = "point", data = res.pca$x) + ggtitle("k = 10")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1, p2, p3, p4, nrow = 2)
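Rather than eyeballing the four solutions, the elbow and average-silhouette heuristics give a quick first pass at k; these factoextra helpers rerun kmeans internally on the PC scores:
# Total within-cluster sum of squares ("elbow") and average silhouette width by k
fviz_nbclust(res.pca$x, kmeans, method = "wss")
fviz_nbclust(res.pca$x, kmeans, method = "silhouette")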
### Distance Matrix
res.dist <- get_dist(res.pca$x, stand = FALSE, method = "euclidean")
fviz_dist(res.dist,
          gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
res.hc <- res.pca$x %>%
  # scale() %>% # Scale the data
  dist(method = "euclidean") %>% # Compute dissimilarity matrix
  hclust(method = "ward.D2") # Compute hierarchical clustering
groups <- cutree(res.hc, k = 3) # cut tree into 3 clusters
table(groups)
## groups
## 1 2 3
## 136 122 27
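Because ID was set aside before subsetting, the cluster labels can be tied back to the subjects, e.g. to pull the 27 scans in cluster 3 for manual review:
# Map each subject to its hierarchical cluster assignment
cluster_assignments <- data.frame(ID = ID, cluster = groups)
head(cluster_assignments)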
# Visualize using factoextra
# Cut into 3 groups and color by group
fviz_dend(res.hc, k = 3, # Cut into three groups
          cex = 0.5, # label size
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"), #, "#FC4E07"
          color_labels_by_k = TRUE, # color labels by groups
          rect = TRUE # Add rectangle around groups
)
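As a consistency check on the k = 3 story, the hierarchical groups can be cross-tabulated against the k-means solution with the same number of centres:
# Agreement between hierarchical (rows) and k-means (columns) assignments
table(hclust = groups, kmeans = k3$cluster)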
# Determine the optimal number of clusters (NbClust compares many indices)
res.nbclust <- res.pca$x %>%
  # scale() %>%
  NbClust(distance = "euclidean",
          min.nc = 2, max.nc = 10,
          method = "complete", index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 1 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 8 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
fviz_nbclust(res.nbclust, ggtheme = theme_minimal())
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 8 proposed 2 as the best number of clusters
## * 1 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 8 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .
# Enhanced hierarchical clustering, cut in 2 groups
res.hc <- data %>%
  scale() %>%
  eclust("hclust", k = 2, graph = FALSE)
# Visualize with factoextra
fviz_dend(res.hc, palette = "jco",
rect = TRUE, show_labels = FALSE)
# Inspect the silhouette plot
fviz_silhouette(res.hc)
## cluster size ave.sil.width
## 1 1 258 0.52
## 2 2 27 0.33
# Silhouette width of observations
sil <- res.hc$silinfo$widths[, 1:3] # columns: cluster, neighbor, sil_width
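Observations with negative silhouette width sit closer to the neighbouring cluster than their own; listing them is a natural next step before deciding which scans to flag:
# Flag observations with negative silhouette width for manual inspection
neg_sil <- which(sil[, "sil_width"] < 0)
sil[neg_sil, , drop = FALSE]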