This is a notebook that will attempt to find a cluster solution using QC variables & build a Shiny App
library(tidyverse) # data manipulation
## ── Attaching packages ─────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.2
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'tidyr' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(cluster) # clustering algorithms
## Warning: package 'cluster' was built under R version 3.5.2
library(factoextra) # clustering algorithms & visualization
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggplot2)
library(gplots)
## Warning: package 'gplots' was built under R version 3.5.2
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(readbulk)
## Warning: package 'readbulk' was built under R version 3.5.2
library(ggthemr)
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
##
## Attaching package: 'ggpubr'
## The following objects are masked from 'package:ggthemr':
##
## rotate_x_text, rotate_y_text
library(doBy)
library(Rmisc)
## Loading required package: lattice
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
##
## mutate
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library(dendextend)
## Warning: package 'dendextend' was built under R version 3.5.2
##
## ---------------------
## Welcome to dendextend version 1.12.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:ggpubr':
##
## rotate
## The following object is masked from 'package:stats':
##
## cutree
library(dendextendRcpp)
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.5.2
##
## Attaching package: 'dendextendRcpp'
## The following object is masked from 'package:purrr':
##
## is_list
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.2
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
ggthemr("chalk")
## Warning: New theme missing the following elements: panel.grid, plot.tag,
## plot.tag.position
#all_thresholded<- read_csv("/Users/John/Dropbox/qc_solution/QC_DWI - Thresholds_Larger_DS.csv")
all_thresholded <- read_csv("/Users/John/Dropbox/qc_solution/QC_BrainHack/PACTMD_dataset.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## ID = col_character(),
## Weighted_Score_Numeric = col_character(),
## Weighted_Score = col_character(),
## PCI_Score = col_character(),
## Penultimate = col_character(),
## Age = col_character(),
## reserve = col_character(),
## group = col_character()
## )
## See spec(...) for full column specifications.
data <- subset(all_thresholded, select = c("total_percentage", "snr","cnr1000", "rel","Noise"))
#data <- subset(all_thresholded, select = c("snr","cnr1000", "rel","Noise"))
data <- data[complete.cases(data), ]
res.pca <- prcomp(data, scale = TRUE)
fviz_eig(res.pca)
fviz_pca_ind(res.pca,
geom.ind = "point", # Color by the quality of representation
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
fviz_pca_var(res.pca,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
#Kmeans solution here
d <- dist(data, method = "euclidean") # distance matrix
#Now to try a K-means solution
k2 <- kmeans(d, centers = 2, nstart = 25)
k3 <- kmeans(d, centers = 3, nstart = 25)
k4 <- kmeans(d, centers = 4, nstart = 25)
k5 <- kmeans(d, centers = 5, nstart = 25)
k10 <- kmeans(d, centers = 10, nstart = 25)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = d) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = d) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = d) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = d) + ggtitle("k = 5")
#p10 <- fviz_cluster(k10, geom = "point", data = d) + ggtitle("k = 10")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1, p2, p3, p4, nrow = 2)
#try density based clustering first on the raw data:
# Compute DBSCAN using fpc package
#library("fpc")
set.seed(123)
db <- fpc::dbscan(data, eps = 0.15, MinPts = 5)
# Plot DBSCAN results
library("factoextra")
fviz_cluster(db, data = data, stand = FALSE,
ellipse = FALSE, show.clust.cent = FALSE,
geom = "point",palette = "jco", ggtheme = theme_classic())
#try density based clustering now on the PC scores:
PC_Scores <- res.pca$x
PC_Scores <- subset(PC_Scores, select=c("PC1","PC2"))
set.seed(123)
db <- fpc::dbscan(PC_Scores, eps = 0.15, MinPts = 5)
# Plot DBSCAN results
library("factoextra")
fviz_cluster(db, data = PC_Scores, stand = FALSE,
ellipse = FALSE, show.clust.cent = FALSE,
geom = "point",palette = "jco", ggtheme = theme_classic())
res.dist <- get_dist(data, stand = TRUE, method = "pearson")
fviz_dist(res.dist,
gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
# Compute hierarchical clustering
res.hc <- data %>%
scale() %>% # Scale the data
dist(method = "euclidean") %>% # Compute dissimilarity matrix
hclust(method = "ward.D2") # Compute hierachical clustering
# Visualize using factoextra
# Cut in 4 groups and color by groups
fviz_dend(res.hc, k = 3, # Cut in four groups
cex = 0.5, # label size
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),#, "#FC4E07"),
color_labels_by_k = TRUE, # color labels by groups
rect = TRUE # Add rectangle around groups
)
#get clustering tendency
gradient.color <- list(low = "steelblue", high = "white")
data %>% # Remove column 5 (Species)
scale() %>% # Scale variables
get_clust_tendency(n = 50, gradient = gradient.color)
## $hopkins_stat
## [1] 0.2293349
##
## $plot
#optimal number of scans
set.seed(123)
# Compute
library("NbClust")
res.nbclust <- data %>%
scale() %>%
NbClust(distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "complete", index ="all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 9 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 5 proposed 8 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
library(factoextra)
fviz_nbclust(res.nbclust, ggtheme = theme_minimal())
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 9 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 5 proposed 8 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .
#enhanced clustering...
set.seed(123)
# Enhanced hierarchical clustering, cut in 3 groups
res.hc <- data %>%
scale() %>%
eclust("hclust", k = 2, graph = FALSE)
# Visualize with factoextra
fviz_dend(res.hc, palette = "jco",
rect = TRUE, show_labels = FALSE)
#inspect silhoutette plot
fviz_silhouette(res.hc)
## cluster size ave.sil.width
## 1 1 25 0.28
## 2 2 258 0.55
# Silhouette width of observations
sil <- res.hc$silinfo$widths[, 1:3]
# Objects with negative silhouette
neg_sil_index <- which(sil[, 'sil_width'] < 0)
sil[neg_sil_index, , drop = FALSE]
#alternative plotting
library(ggthemr)
ggthemr("dust")
## Warning: New theme missing the following elements: panel.grid, plot.tag,
## plot.tag.position
clust <- cutree(res.hc, k = 3)
fviz_cluster(list(data = data, cluster = clust)) ## from ‘factoextra’ package