Note: this should eventually be wrapped into a function that takes the number of subjects and the number of b-values (and their names); a sketch of that generalization follows the outlier block below.

# Load the required packages, then read in the JSON file containing the QC metrics
suppressPackageStartupMessages({
  library(rjson)      # parse the SQUAD JSON output
  library(reshape2)   # dcast for long-to-wide reshaping
  library(plyr)       # revalue; load before dplyr so dplyr's verbs win the masking
  library(tidyverse)  # data manipulation (ggplot2, dplyr, purrr, readr, ...)
  library(cluster)    # clustering algorithms
  library(factoextra) # clustering algorithms & visualization
  library(gplots)
  library(readbulk)
  library(ggthemr)
  library(ggpubr)
  library(doBy)
  library(Rmisc)
  library(dendextend)
  library(dendextendRcpp)
  library(kableExtra)
  library(NbClust)
})
ggthemr("chalk")
result <- fromJSON(file = "/projects/janderson/PACTMD/pipelines/dwi/squad/group_db.json")

# Flatten the parsed JSON into named vectors, converting NULL entries to NA.
# (Kept as a list named json_data_frame; each element is indexed below.)
json_data_frame <- lapply(result, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})
# Also read in the subject IDs, in the same order as the JSON entries
sub_ids <- read_csv("/projects/janderson/PACTMD/pipelines/dwi/squad/subs.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_character()
## )
colnames(sub_ids) <- "ID"
# Now extract the relevant metrics into separate objects.

# Motion: absolute and relative RMS displacement, interleaved per subject
motion <- as.data.frame(json_data_frame$qc_motion)
colnames(motion) <- "motion"
mot_type <- as.data.frame(rep(1:2,285))
colnames(mot_type) <- "motion_type"
mot_type$motion_type <- as.factor(mot_type$motion_type)
mot_type$motion_type <- revalue(mot_type$motion_type, c("1"="abs", "2"="rel"))

temp_id_mot <- as.data.frame(rep(1:285,each=2))
colnames(temp_id_mot)<- "temp_id"

motion <- cbind(temp_id_mot, mot_type, motion)
motion_wide <- dcast(motion, temp_id ~ motion_type, value.var = "motion")
# Noise: SNR, plus CNR for the b = 1000 and b = 3000 shells
snr_cnr <- as.data.frame(json_data_frame$qc_cnr)
colnames(snr_cnr) <- "noise"
noise_type <- as.data.frame(rep(1:3,285))
colnames(noise_type) <- "noise_type"
noise_type$noise_type <- as.factor(noise_type$noise_type)
noise_type$noise_type <- revalue(noise_type$noise_type, c("1"="snr", "2"="cnr1000","3"= "cnr3000"))

temp_id_noise <- as.data.frame(rep(1:285,each=3))
colnames(temp_id_noise)<- "temp_id"

noise <- cbind(temp_id_noise, noise_type, snr_cnr)
noise_wide <- dcast(noise, temp_id ~ noise_type, value.var = "noise")
# Outliers: total percentage, per shell (b1000, b3000), and per PE direction
outliers <- as.data.frame(json_data_frame$qc_outliers)
colnames(outliers) <- "outliers"
outlier_type <- as.data.frame(rep(1:4,285))
colnames(outlier_type) <- "outlier_type"
outlier_type$outlier_type <- as.factor(outlier_type$outlier_type)
outlier_type$outlier_type <- revalue(outlier_type$outlier_type, c("1"="total%", "2"="b1000","3"="b3000","4"="PE_dir"))

temp_id_outlier <- as.data.frame(rep(1:285,each=4))
colnames(temp_id_outlier)<- "temp_id"

outlier <- cbind(temp_id_outlier, outlier_type, outliers)
outlier_wide <- dcast(outlier, temp_id ~ outlier_type, value.var = "outliers")
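Per the note at the top, the motion, noise, and outlier blocks above are one pattern that differs only in the JSON field, the level names, and the subject count. A minimal sketch of that generalization, assuming the flattened list built above; extract_metric and its argument names are illustrative, not from any package:

extract_metric <- function(json_list, field, level_names, n_subjects) {
  # Pull one QC field and label it with a subject index and a type factor
  values <- as.data.frame(json_list[[field]])
  colnames(values) <- "value"
  long <- cbind(
    data.frame(temp_id = rep(seq_len(n_subjects), each = length(level_names))),
    data.frame(type = factor(rep(level_names, times = n_subjects),
                             levels = level_names)),
    values
  )
  dcast(long, temp_id ~ type, value.var = "value")
}
# e.g. motion_wide <- extract_metric(json_data_frame, "qc_motion", c("abs", "rel"), 285)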
# Now merge the three wide tables into a single data frame

All_data <- Reduce(function(x, y) merge(x, y, all=TRUE), list(outlier_wide, noise_wide, motion_wide))
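Since the tidyverse is already attached, the same three-way merge can be phrased with purrr and dplyr; this is an equivalent expression, not a change to the pipeline:

# Equivalent tidyverse phrasing of the merge above (full joins on temp_id)
All_data_tv <- purrr::reduce(list(outlier_wide, noise_wide, motion_wide),
                             dplyr::full_join, by = "temp_id")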

# Create pass/fail logicals if using absolute thresholds (kept commented out)
# All_data$total_fact <- as.numeric(All_data$`total%` > .2)
# All_data$snr_fact <- as.numeric(All_data$snr < 20)
# All_data$cnr_fact <- as.numeric(All_data$cnr < 1.4)
# All_data$abs_mot_fact <- as.numeric(All_data$abs > 1)
# All_data$rel_mot_fact <- as.numeric(All_data$rel > .4)
# All_data$Weighted_Score_Numeric <-  apply(All_data[9:13], 1, sum)
# All_data$Weighted_Score <- cut(x=All_data$Weighted_Score_Numeric, breaks = c(0,2,3,5),include.lowest =TRUE)
# levels(All_data$Weighted_Score) <- c('Pass', 'Caution','Fail')

#bind the subject IDs to the metrics
All_data <- cbind(sub_ids, All_data)
#write.csv(All_data,"/scratch/janderson/DWI_QC/squad/cutoff_scores_pond.csv")
head(All_data)

Load the noise residuals from dwidenoise and merge them into the data frame.

Noise <- read_csv("/projects/janderson/PACTMD/pipelines/dwi/squad/PACtMD_Noise.csv", col_names = TRUE)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   Noise = col_double()
## )
All_data <- merge(All_data, Noise, by = "ID")

Now compute the principal component scores and the clustering solutions.

data <- All_data[complete.cases(All_data), ]  # drop subjects with any missing metric
ID <- data$ID                                 # keep IDs aligned with the rows used below
data <- subset(data, select = c("total%", "snr", "cnr1000", "rel", "Noise"))

res.pca <- prcomp(data, scale. = TRUE)  # PCA on the standardized QC metrics


fviz_eig(res.pca)

fviz_pca_var(res.pca,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
             )
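The two plots above can be backed by numbers; both calls are standard (stats and factoextra respectively):

summary(res.pca)                    # variance explained per component
head(get_pca_var(res.pca)$contrib)  # variable contributions (%) to each PC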

# k-means solutions on the component scores from above
# (kmeans expects the observations themselves, not a distance matrix)
k2  <- kmeans(res.pca$x, centers = 2,  nstart = 25)
k3  <- kmeans(res.pca$x, centers = 3,  nstart = 25)
k4  <- kmeans(res.pca$x, centers = 4,  nstart = 25)
k5  <- kmeans(res.pca$x, centers = 5,  nstart = 25)
k10 <- kmeans(res.pca$x, centers = 10, nstart = 25)

# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = res.pca$x) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = res.pca$x) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = res.pca$x) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = res.pca$x) + ggtitle("k = 5")
#p10 <- fviz_cluster(k10, geom = "point", data = res.pca$x) + ggtitle("k = 10")

library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)
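Rather than picking k from the four panels by eye, factoextra's elbow and silhouette diagnostics can be run directly on the component scores; a quick check using only packages already loaded:

fviz_nbclust(res.pca$x, kmeans, method = "wss")        # elbow: total within-cluster SS
fviz_nbclust(res.pca$x, kmeans, method = "silhouette") # average silhouette width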

### Distance Matrix
res.dist <- get_dist(res.pca$x, stand = FALSE, method = "euclidean")

fviz_dist(res.dist, 
   gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

res.hc <- res.pca$x %>%
  #scale() %>%                    # Scale the data
  dist(method = "euclidean") %>% # Compute dissimilarity matrix
  hclust(method = "ward.D2")     # Compute hierarchical clustering
groups <- cutree(res.hc, k = 3)  # cut tree into 3 clusters
table(groups)
## groups
##   1   2   3 
## 136 122  27
# Visualize using factoextra
# Cut into 3 groups and color by group
fviz_dend(res.hc, k = 3, # Cut into three groups
          cex = 0.5, # label size
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
          color_labels_by_k = TRUE, # color labels by groups
          rect = TRUE # Add rectangle around groups
          )
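For QC purposes the dendrogram is only useful once the assignments are tied back to subjects. A minimal sketch using the ID vector and cutree groups from above (the hc_group column name is illustrative):

cluster_assignments <- data.frame(ID = ID, hc_group = groups)
subset(cluster_assignments, hc_group == 3)  # e.g. list the small 27-subject group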

# Clustering tendency and optimal number of clusters
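Before asking how many clusters there are, it is worth checking whether the data cluster at all. A hedged sketch using factoextra's Hopkins statistic; n = 50 is an arbitrary subsample size, and the orientation of the statistic (whether clusterable data approach 0 or 1) has varied across factoextra versions, so check the documentation of the installed version:

tend <- get_clust_tendency(scale(data), n = 50, graph = FALSE)
tend$hopkins_stat  # far from 0.5 suggests non-random, clusterable structure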


res.nbclust <- res.pca$x %>%
 # scale() %>%
  NbClust(distance = "euclidean",
          min.nc = 2, max.nc = 10, 
          method = "complete", index ="all") 

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 1 proposed 3 as the best number of clusters 
## * 3 proposed 4 as the best number of clusters 
## * 8 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
fviz_nbclust(res.nbclust, ggtheme = theme_minimal())
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 1 proposed  1 as the best number of clusters
## * 8 proposed  2 as the best number of clusters
## * 1 proposed  3 as the best number of clusters
## * 3 proposed  4 as the best number of clusters
## * 8 proposed  5 as the best number of clusters
## * 1 proposed  6 as the best number of clusters
## * 1 proposed  9 as the best number of clusters
## * 1 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

# Enhanced hierarchical clustering, cut in 2 groups
res.hc <- data %>%
  scale() %>%
  eclust("hclust", k = 2, graph = FALSE)

# Visualize with factoextra
fviz_dend(res.hc, palette = "jco",
          rect = TRUE, show_labels = FALSE)

# Inspect the silhouette plot
fviz_silhouette(res.hc)
##   cluster size ave.sil.width
## 1       1  258          0.52
## 2       2   27          0.33

# Silhouette width of observations
sil <- res.hc$silinfo$widths[, 1:3]
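A common follow-up, assuming the standard column names from cluster::silhouette: observations with negative silhouette widths sit closer to the neighboring cluster than their own and deserve a manual QC look.

neg_sil <- which(sil[, "sil_width"] < 0)  # potentially misassigned subjects
sil[neg_sil, , drop = FALSE]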