memory.limit(50000) # raise the R memory limit (Windows only)
## [1] 50000
set.seed(123456)
library(ggplot2)
library(tsne)
library("cluster")
library("factoextra")
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library("magrittr")
library(factoextra)
library(NbClust)
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.6.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(factoextra)
library(cluster)
library("ggpubr")
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:dendextend':
##
## rotate
library("mclust")
## Package 'mclust' version 5.4
## Type 'citation("mclust")' for citing this R package in publications.
library("fpc")
# Raw data
rdata = read.delim(
  file = 'https://www.ebi.ac.uk/~hamedhm/E-ERAD-401-query-results.tpms.tsv',
  skip = 4,
  check.names = TRUE
)
r2 = na.omit(rdata)
r3 = r2[sample(seq(length(r2$Gene.Name)),
               size = round(length(r2$Gene.Name) * .05), # keep 5% of the genes
               replace = FALSE), ]
r3 = na.omit(r3)
# Stage-to-stage expression differences, then scale each column
r4 = t(apply(r3[, -c(1:2)], 1, diff))
r4 = apply(r4, 2, function(x) {
  x = na.omit(x)
  x = scale(x)
  #(x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
})
r4 = as.data.frame(r4)
gnames = r3[, 1]
rownames(r4) = gnames
my_data <- r4
summary(my_data)
## X5.somite.stage X6.somite.stage X7.somite.stage
## Min. :-1.00829 Min. :-29.50470 Min. :-4.28542
## 1st Qu.:-0.03986 1st Qu.: 0.02573 1st Qu.:-0.11256
## Median :-0.03378 Median : 0.03099 Median :-0.03528
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:-0.03075 3rd Qu.: 0.03361 3rd Qu.: 0.04200
## Max. :29.52936 Max. : 1.21599 Max. :24.77011
## X8.somite.stage X9.somite.stage X10.somite.stage
## Min. :-15.826894 Min. :-22.18664 Min. :-12.241858
## 1st Qu.: -0.028767 1st Qu.: -0.04756 1st Qu.: -0.025779
## Median : -0.010225 Median : -0.01097 Median : -0.009491
## Mean : 0.000000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.008318 3rd Qu.: 0.02562 3rd Qu.: 0.006797
## Max. : 21.443335 Max. : 12.54061 Max. : 25.237071
## X11.somite.stage X12.somite.stage X13.somite.stage
## Min. :-29.42513 Min. :-3.40204 Min. :-27.73480
## 1st Qu.: 0.01465 1st Qu.:-0.04845 1st Qu.: 0.03281
## Median : 0.02647 Median :-0.03579 Median : 0.05439
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.03829 3rd Qu.:-0.02314 3rd Qu.: 0.07597
## Max. : 1.49196 Max. :28.31153 Max. : 3.33386
## X14.somite.stage X15.somite.stage X16.somite.stage
## Min. :-1.71733 Min. :-28.82780 Min. :-17.98471
## 1st Qu.:-0.02644 1st Qu.: 0.00643 1st Qu.: -0.01938
## Median :-0.01854 Median : 0.02580 Median : 0.01484
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:-0.01064 3rd Qu.: 0.02926 3rd Qu.: 0.08328
## Max. :29.42984 Max. : 2.40006 Max. : 10.45183
## X17.somite.stage X18.somite.stage X19.somite.stage
## Min. :-0.27269 Min. :-0.39904 Min. :-29.61108
## 1st Qu.:-0.03293 1st Qu.:-0.03368 1st Qu.: 0.03073
## Median :-0.03155 Median :-0.03183 Median : 0.03197
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:-0.03114 3rd Qu.:-0.03146 3rd Qu.: 0.03445
## Max. :29.61223 Max. :29.60908 Max. : 0.32726
## X20.somite.stage X21.somite.stage X22.somite.stage
## Min. :-29.60551 Min. :-29.61236 Min. :-5.92649
## 1st Qu.: 0.03019 1st Qu.: 0.03135 1st Qu.:-0.04353
## Median : 0.03220 Median : 0.03165 Median :-0.02672
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.03587 3rd Qu.: 0.03240 3rd Qu.: 0.04051
## Max. : 0.52720 Max. : 0.23259 Max. :19.59434
## X23.somite.stage X24.somite.stage X25.somite.stage
## Min. :-0.45994 Min. :-29.58460 Min. :-23.31516
## 1st Qu.:-0.03350 1st Qu.: 0.03034 1st Qu.: 0.03282
## Median :-0.03050 Median : 0.03333 Median : 0.03545
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:-0.03019 3rd Qu.: 0.03632 3rd Qu.: 0.06052
## Max. :29.60146 Max. : 0.85954 Max. : 3.92568
## X26.somite.stage X27.somite.stage X28.somite.stage
## Min. :-8.62408 Min. :-0.93022 Min. :-29.58763
## 1st Qu.:-0.06197 1st Qu.:-0.03107 1st Qu.: 0.03130
## Median :-0.02467 Median :-0.02951 Median : 0.03190
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:-0.02280 3rd Qu.:-0.02920 3rd Qu.: 0.03581
## Max. :26.59436 Max. :29.58166 Max. : 0.59537
## X34.somite.stage X35.somite.stage X36.somite.stage
## Min. :-8.937189 Min. :-29.55033 Min. :-2.59727
## 1st Qu.:-0.021046 1st Qu.: 0.02575 1st Qu.:-0.03433
## Median :-0.013206 Median : 0.03136 Median :-0.02974
## Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.000336 3rd Qu.: 0.03697 3rd Qu.:-0.02516
## Max. :26.057424 Max. : 1.35608 Max. :29.42804
# PCA
pc1 = prcomp(x = my_data)
plot(pc1)

summary(pc1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 4.8237 1.28899 1.0405 0.63766 0.47922 0.40333
## Proportion of Variance 0.8618 0.06154 0.0401 0.01506 0.00851 0.00603
## Cumulative Proportion 0.8618 0.92331 0.9634 0.97848 0.98698 0.99301
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.26813 0.20968 0.15442 0.14006 0.11404 0.07181
## Proportion of Variance 0.00266 0.00163 0.00088 0.00073 0.00048 0.00019
## Cumulative Proportion 0.99567 0.99730 0.99818 0.99891 0.99939 0.99958
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.06383 0.05072 0.04011 0.03356 0.02760 2e-02
## Proportion of Variance 0.00015 0.00010 0.00006 0.00004 0.00003 1e-05
## Cumulative Proportion 0.99973 0.99983 0.99989 0.99993 0.99996 1e+00
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.01723 0.01563 0.01179 0.008018 0.005331 0.004428
## Proportion of Variance 0.00001 0.00001 0.00001 0.000000 0.000000 0.000000
## Cumulative Proportion 0.99998 0.99999 1.00000 1.000000 1.000000 1.000000
## PC25 PC26 PC27
## Standard deviation 0.003313 0.002636 0.0004946
## Proportion of Variance 0.000000 0.000000 0.0000000
## Cumulative Proportion 1.000000 1.000000 1.0000000
pc1$x = pc1$x[, 1:5] ### keep only the first five principal components
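# The five-component cut-off above is ad hoc; an alternative is to keep the
# smallest number of components reaching a target share of variance. A minimal
# sketch (the 0.95 threshold is an assumption, not part of the original run):
cum_var = summary(pc1)$importance["Cumulative Proportion", ]
which(cum_var >= 0.95)[1] # smallest number of PCs explaining >= 95% of variance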
library(ggplot2)
ggplot(data = as.data.frame(pc1$x), aes(x = PC1, y = PC2, color = gnames)) +
  geom_point(show.legend = FALSE)

# t-SNE
library(tsne)
colors1 = rainbow(length(unique(rownames(r4))))
names(colors1) = unique(rownames(r4))
ecb = function(x, y) {
  plot(x, t = 'p', pch = '.', col = colors1[rownames(r4)])
  #text(x, labels = r3$Gene.Name, col = colors1[r3$Gene.Name])
}
tsne1 = tsne(
  my_data,
  epoch_callback = ecb,
  max_iter = 500,
  epoch = 100,
  k = 5
) # five output dimensions
## sigma summary: Min. : 0.120373928043866 |1st Qu. : 0.193039226659211 |Median : 0.253734620737499 |Mean : 0.280924845311122 |3rd Qu. : 0.329001567772986 |Max. : 0.971707581501539 |
## Epoch: Iteration #100 error is: 17.3388624231855
## Epoch: Iteration #200 error is: 1.12232892551054

## Epoch: Iteration #300 error is: 1.09224299258535

## Epoch: Iteration #400 error is: 1.08307831877043

## Epoch: Iteration #500 error is: 1.08005979439315
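# tsne::tsne() is exact and O(n^2), so it slows down quickly as more genes are
# sampled. A possible faster substitute (an assumption, not used in this run)
# is the Barnes-Hut implementation in the Rtsne package; a minimal sketch:
# install.packages("Rtsne") # if needed
library(Rtsne)
tsne_bh = Rtsne(as.matrix(my_data), dims = 2, perplexity = 30,
                check_duplicates = FALSE)$Y
plot(tsne_bh, pch = '.', col = colors1[rownames(r4)])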


gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1168257 62.4 2164898 115.7 1770749 94.6
## Vcells 3133605 24.0 10811558 82.5 13514448 103.2
# Cluster tendency measures
# Hopkins statistic: a value close to zero (well below 0.5) indicates that the
# dataset is significantly clusterable.
gradient.color <- list(low = "steelblue", high = "white")
# Actual data
#my_data %>%
get_clust_tendency(my_data,
                   n = 25,
                   gradient = gradient.color,
                   graph = FALSE)
## $hopkins_stat
## [1] 0.007535291
##
## $plot
## NULL
# PCA
#pc1$x %>%
get_clust_tendency(pc1$x,
                   n = 25,
                   gradient = gradient.color,
                   graph = FALSE)
## $hopkins_stat
## [1] 0.004613762
##
## $plot
## NULL
# SNE
#tsne1 %>%
get_clust_tendency(tsne1,
                   n = 25,
                   gradient = gradient.color,
                   graph = FALSE)
## $hopkins_stat
## [1] 0.07995601
##
## $plot
## NULL
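# Side-by-side Hopkins comparison of the three representations (a convenience
# sketch; the statistic is based on random sampling, so values vary slightly
# between runs):
sapply(list(actual = my_data, pca = pc1$x, sne = tsne1),
       function(d) get_clust_tendency(d, n = 25, graph = FALSE)$hopkins_stat)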
# Correlation plots
library("cluster")
library("factoextra")
library("magrittr")
res.dist <- get_dist(my_data, stand = FALSE, method = "pearson")
fviz_dist(
  res.dist,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"),
  show_labels = FALSE
)

# PCA
res.dist.pca <- get_dist(pc1$x, stand = FALSE, method = "pearson")
fviz_dist(
  res.dist.pca,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"),
  show_labels = FALSE
)

# SNE
res.dist.sne <- get_dist(tsne1, stand = FALSE, method = "pearson")
fviz_dist(
  res.dist.sne,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"),
  show_labels = FALSE
)

gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1213950 64.9 2164898 115.7 2164898 115.7
## Vcells 47377600 361.5 83391293 636.3 83267929 635.3
# Number of cluster estimation
library("factoextra")
k1 = fviz_nbclust(my_data, kmeans, method = "gap_stat")
plot(k1)

k2 = fviz_nbclust(my_data, kmeans, method = "silhouette")
plot(k2)

k3 = fviz_nbclust(my_data, kmeans, method = "wss")
plot(k3)

#pkgs <- c("factoextra", "NbClust")
#install.packages(pkgs)
library(factoextra)
library(NbClust)
# NbClust arguments (from the package documentation):
# method: the cluster analysis method to be used, one of "ward.D", "ward.D2",
#   "single", "complete", "average", "mcquitty", "median", "centroid", "kmeans".
# index: the index to be calculated, one of "kl", "ch", "hartigan", "ccc",
#   "scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db",
#   "silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball", "ptbiserial",
#   "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", "hubert",
#   "sdindex", "dindex", "sdbw", "all" (all indices except GAP, Gamma, Gplus
#   and Tau), or "alllong" (all indices including Gap, Gamma, Gplus and Tau).
nb <- NbClust(
  my_data,
  distance = "manhattan",
  min.nc = 2,
  max.nc = 12,
  method = "kmeans"
)
## Warning in pf(beale, pp, df2): NaNs produced
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 11 proposed 2 as the best number of clusters
## * 7 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 1 proposed 11 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# Visualize
library(factoextra)
fviz_nbclust(nb, ggtheme = theme_minimal(), main = 'Actual data')
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 11 proposed 2 as the best number of clusters
## * 7 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 1 proposed 11 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .
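# The per-index votes can be tabulated straight from the returned object
# (Best.nc holds one recommended cluster count per index); a sketch:
table(nb$Best.nc["Number_clusters", ])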

# PCA
nb.pca <- NbClust(
  pc1$x,
  distance = "manhattan",
  min.nc = 2,
  max.nc = 12,
  method = "kmeans"
)
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 9 proposed 2 as the best number of clusters
## * 5 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 2 proposed 9 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# Visualize
library(factoextra)
fviz_nbclust(nb.pca, ggtheme = theme_minimal(), main = 'PCA')
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 9 proposed 2 as the best number of clusters
## * 5 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 2 proposed 9 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .

# SNE
nb.sne <- NbClust(
  tsne1,
  distance = "manhattan",
  min.nc = 2,
  max.nc = 12,
  method = "kmeans"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 5 proposed 6 as the best number of clusters
## * 3 proposed 8 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 2 proposed 11 as the best number of clusters
## * 2 proposed 12 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# Visualize
library(factoextra)
fviz_nbclust(nb.sne, ggtheme = theme_minimal(), main = 'SNE')
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 6 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 5 proposed 6 as the best number of clusters
## * 3 proposed 8 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 2 proposed 11 as the best number of clusters
## * 2 proposed 12 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .

# K-means
set.seed(123)
k = 5 # number of clusters used by every method below (NbClust's majority vote was 2)
km.res <- kmeans(my_data, k, nstart = 50)
# Visualize
library("factoextra")
fviz_cluster(
  km.res,
  data = my_data,
  ellipse.type = "convex",
  palette = "jco",
  geom = 'point',
  ggtheme = theme_minimal(),
  main = 'Actual data'
)
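# A quick internal check on the k-means partition: silhouette widths close to 1
# mean well-placed genes, negative values suggest misassignment. A sketch:
sil.km = cluster::silhouette(km.res$cluster, dist(my_data))
summary(sil.km)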

### PCA k-means
km.res.pca <- kmeans(pc1$x, k, nstart = 50)
# Visualize
library("factoextra")
fviz_cluster(
  km.res.pca,
  data = pc1$x,
  ellipse.type = "convex",
  palette = "jco",
  geom = 'point',
  ggtheme = theme_minimal(),
  main = 'PCA'
)

### SNE k-means
km.res.sne <- kmeans(tsne1, k, nstart = 50)
# Visualize
library("factoextra")
fviz_cluster(
  km.res.sne,
  data = tsne1,
  ellipse.type = "convex",
  palette = "jco",
  geom = 'point',
  ggtheme = theme_minimal(),
  main = 'SNE'
)

###
# Compute PAM
library("cluster")
pam.res <- pam(my_data, k)
# Visualize
fviz_cluster(
  pam.res,
  ellipse.type = "convex",
  palette = "jco",
  geom = 'point',
  ggtheme = theme_minimal(),
  main = 'Actual data'
)

# Compute PCA PAM
library("cluster")
pam.res.pca <- pam(pc1$x, k)
# Visualize
fviz_cluster(
  pam.res.pca,
  ellipse.type = "convex",
  palette = "jco",
  geom = 'point',
  ggtheme = theme_minimal(),
  main = 'PCA'
)

# Compute SNE PAM
library("cluster")
pam.res.sne <- pam(tsne1, k)
# Visualize
fviz_cluster(
  pam.res.sne,
  ellipse.type = "convex",
  palette = "jco",
  geom = 'point',
  ggtheme = theme_minimal(),
  main = 'SNE'
)
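# Unlike k-means centroids, PAM medoids are actual observations, so each
# cluster is represented by a real gene; a sketch:
rownames(pam.res$medoids)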

# Compute hierarchical clustering
res.hc <- hclust(dist(my_data, method = "euclidean"), method = "ward.D2")
# Visualize using factoextra
# Cut in k groups and color by groups
library(factoextra)
fviz_dend(
  res.hc,
  k = k,
  k_colors = "jco",
  type = "circular",
  show_labels = FALSE,
  rect = FALSE,
  main = 'Actual data'
)
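# Hard memberships at the same cut can be read off the tree (stats::cutree is
# used explicitly because dendextend masks cutree, see the startup message
# above); a sketch:
hc.groups = stats::cutree(res.hc, k = k)
table(hc.groups)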

#plot(diana(my_data))
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1642589 87.8 2637877 140.9 2637877 140.9
## Vcells 40427411 308.5 83391293 636.3 83391293 636.3
# Compute hierarchical clustering + PCA
res.hc.pca = hclust(dist(pc1$x, method = "euclidean"), method = "ward.D2")
library(factoextra)
fviz_dend(
  res.hc.pca,
  k = k,
  k_colors = "jco",
  type = "circular",
  show_labels = FALSE,
  rect = FALSE,
  main = 'PCA'
)

#plot(diana(my_data))
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1644832 87.9 2637877 140.9 2637877 140.9
## Vcells 40551064 309.4 83391293 636.3 83391293 636.3
# Compute hierarchical clustering + SNE
res.hc.sne = hclust(dist(tsne1, method = "euclidean"), method = "ward.D2")
library(factoextra)
fviz_dend(
  res.hc.sne,
  k = k,
  k_colors = "jco",
  type = "circular",
  show_labels = FALSE,
  rect = FALSE,
  main = 'SNE'
)

#plot(diana(my_data))
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1649178 88.1 2637877 140.9 2637877 140.9
## Vcells 40631721 310.0 83391293 636.3 83391293 636.3
# # Compare models
# library(dendextend)
# # Compute distance matrix
# res.dist <- dist(my_data, method = "euclidean")
# # Compute 2 hierarchical clusterings
# hc1 <- hclust(res.dist, method = "average")
# hc2 <- hclust(res.dist, method = "ward.D2")
# # Create two dendrograms
# dend1 <- as.dendrogram (hc1)
# dend2 <- as.dendrogram (hc2)
# # Create a list to hold dendrograms
# dend_list <- dendlist(dend1, dend2)
# tanglegram(
# dend1,
# dend2,
# highlight_distinct_edges = FALSE,
# # Turn-off dashed lines
# common_subtrees_color_lines = FALSE,
# # Turn-off line colors
# common_subtrees_color_branches = TRUE,
# # Color common branches
# main = paste("entanglement =", round(entanglement(dend_list), 2))
# )
# #########
# Cut at level 6
#hcut(my_data, 6, graph = TRUE)
#
# library(factoextra)
# fviz_silhouette(res.hc.sne)
# Hybrid k-means (hierarchical clustering seeds k-means)
library(factoextra)
res.hk <- hkmeans(my_data, k)
# Visualize the tree
fviz_dend(
  res.hk,
  palette = "jco",
  type = 'circular',
  rect = TRUE,
  rect_border = "jco",
  rect_fill = TRUE,
  show_labels = FALSE,
  main = 'Actual data'
)

# Visualize the hkmeans final clusters
fviz_cluster(
  res.hk,
  palette = "jco",
  repel = TRUE,
  geom = 'point',
  labelsize = 0,
  ggtheme = theme_classic(),
  main = 'Actual data'
)
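# hkmeans seeds k-means from hierarchical-cluster centers; cross-tabulating
# against the plain k-means labels shows how much the two partitions differ.
# A sketch:
table(hkmeans = res.hk$cluster, kmeans = km.res$cluster)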

# Hybrid k-means + PCA
library(factoextra)
res.hk.pca <- hkmeans(pc1$x, k)
fviz_dend(
  res.hk.pca,
  palette = "jco",
  type = 'circular',
  rect = TRUE,
  rect_border = "jco",
  rect_fill = TRUE,
  show_labels = FALSE,
  main = 'PCA'
)

fviz_cluster(
  res.hk.pca,
  palette = "jco",
  repel = TRUE,
  geom = 'point',
  labelsize = 0,
  ggtheme = theme_classic(),
  main = 'PCA'
)

# Hybrid k-means + SNE
library(factoextra)
res.hk.sne <- hkmeans(tsne1, k)
fviz_dend(
  res.hk.sne,
  palette = "jco",
  type = 'circular',
  rect = TRUE,
  rect_border = "jco",
  rect_fill = TRUE,
  show_labels = FALSE,
  main = 'SNE'
)

fviz_cluster(
  res.hk.sne,
  palette = "jco",
  repel = TRUE,
  geom = 'point',
  labelsize = 0,
  ggtheme = theme_classic(),
  main = 'SNE'
)

gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1714417 91.6 2637877 140.9 2637877 140.9
## Vcells 41011831 312.9 83391293 636.3 83391293 636.3
# Fuzzy clustering
library(cluster)
res.fanny = fanny(x = my_data, k = k)
fviz_cluster(
  res.fanny,
  ellipse.type = "norm",
  repel = FALSE,
  geom = 'point',
  palette = "jco",
  ggtheme = theme_minimal(),
  legend = "right",
  main = 'Actual data'
)

fviz_silhouette(
  res.fanny,
  palette = "jco",
  ggtheme = theme_minimal(),
  label = FALSE,
  main = 'Actual data'
)
## cluster size ave.sil.width
## 1 1 502 0.50
## 2 2 92 -0.20
## 3 3 285 -0.64
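# fanny also returns soft memberships: each gene's degree of membership in
# every cluster, with Dunn's coefficient as an overall fuzziness summary.
# A sketch:
head(res.fanny$membership)
res.fanny$coeff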

# Fuzzy clustering + PCA
library(cluster)
res.fanny.pca = fanny(x = pc1$x, k = k)
fviz_cluster(
  res.fanny.pca,
  ellipse.type = "norm",
  repel = FALSE,
  geom = 'point',
  palette = "jco",
  ggtheme = theme_minimal(),
  legend = "right",
  main = 'PCA'
)

fviz_silhouette(
  res.fanny.pca,
  palette = "jco",
  ggtheme = theme_minimal(),
  label = FALSE,
  main = 'PCA'
)
## cluster size ave.sil.width
## 1 1 481 0.58
## 2 2 158 -0.20
## 3 3 240 -0.66

# Fuzzy clustering + SNE
library(cluster)
res.fanny.sne = fanny(x = tsne1, k = k)
## Warning in fanny(x = tsne1, k = k): the memberships are all very close to
## 1/k. Maybe decrease 'memb.exp' ?
fviz_cluster(
  res.fanny.sne,
  ellipse.type = "norm",
  repel = FALSE,
  geom = 'point',
  palette = "jco",
  ggtheme = theme_minimal(),
  legend = "right",
  main = 'SNE'
)
## Too few points to calculate an ellipse

fviz_silhouette(
  res.fanny.sne,
  palette = "jco",
  ggtheme = theme_minimal(),
  label = FALSE,
  main = 'SNE'
)
## cluster size ave.sil.width
## 1 1 864 0.06
## 2 2 13 0.32
## 3 3 2 0.61

# Model-based clustering: 2D density overview of the embeddings
library("ggpubr")
ggscatter(as.data.frame(pc1$x), x = "PC1", y = "PC2") +
  geom_density2d() + # Add 2D density
  labs(title = 'PCA')

library("ggpubr")
ggscatter(as.data.frame(tsne1), x = "V1", y = "V2") +
  geom_density2d() + # Add 2D density
  labs(title = 'SNE')

# Model-based clustering
library("mclust")
mc <- Mclust(my_data)
summary(mc)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 5 components:
##
## log.likelihood n df BIC ICL
## 78884.35 879 1925 144719.5 144714
##
## Clustering table:
## 1 2 3 4 5
## 295 292 186 90 16
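# The fitted Mclust object carries hard labels, per-gene uncertainty, and the
# selected covariance model; a sketch of the most useful pieces:
head(mc$classification) # MAP cluster assignment per gene
head(mc$uncertainty)    # 1 - maximum posterior probability
mc$modelName            # selected covariance structure ('VEV' above)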
#plot(mc,what = 'classification',pch='.')
library(factoextra)
# BIC values used for choosing the number of clusters
fviz_mclust(mc, "BIC", palette = "jco")

# Classification: plot showing the clustering
fviz_mclust(
  mc,
  "classification",
  geom = "point",
  pointsize = 1.5,
  palette = "jco",
  ggtheme = theme_minimal(),
  main = 'Actual data'
)

# Classification uncertainty
fviz_mclust(
  mc,
  "uncertainty",
  palette = "jco",
  ggtheme = theme_minimal(),
  main = 'Actual data'
)

# Model-based clustering + PCA
library("mclust")
mc.pca <- Mclust(pc1$x)
summary(mc.pca)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 7 components:
##
## log.likelihood n df BIC ICL
## 5954.638 879 62 11488.99 11227.95
##
## Clustering table:
## 1 2 3 4 5 6 7
## 146 76 32 152 254 213 6
library(factoextra)
fviz_mclust(mc.pca, "BIC", palette = "jco", main = 'PCA')

fviz_mclust(
  mc.pca,
  "classification",
  geom = "point",
  pointsize = 1.5,
  palette = "jco",
  ggtheme = theme_minimal(),
  main = 'PCA'
)

fviz_mclust(
  mc.pca,
  "uncertainty",
  palette = "jco",
  ggtheme = theme_minimal(),
  main = 'PCA'
)

# Model-based clustering + SNE
library("mclust")
mc.sne <- Mclust(tsne1)
summary(mc.sne)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 8 components:
##
## log.likelihood n df BIC ICL
## -16539.76 879 139 -34021.77 -34185.5
##
## Clustering table:
## 1 2 3 4 5 6 7 8
## 91 125 47 166 107 107 183 53
library(factoextra)
# BIC values used for choosing the number of clusters
fviz_mclust(mc.sne, "BIC", palette = "jco", main = 'SNE')

# Classification: plot showing the clustering
fviz_mclust(
  mc.sne,
  "classification",
  geom = "point",
  pointsize = 1.5,
  palette = "jco",
  main = 'SNE'
)

# Classification uncertainty
fviz_mclust(mc.sne, "uncertainty", palette = "jco", main = 'SNE')

gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2200663 117.6 3886542 207.6 3205452 171.2
## Vcells 42393654 323.5 83391293 636.3 83391293 636.3
# DBSCAN
# install.packages("fpc")
# install.packages("dbscan")
# install.packages("factoextra")
library(factoextra)
set.seed(123)
library("fpc")
set.seed(123)
db <- fpc::dbscan(my_data, eps = 10 ^ -1, MinPts = 5)
# Plot DBSCAN results
library("factoextra")
fviz_cluster(
  db,
  data = my_data,
  stand = FALSE,
  ellipse = FALSE,
  show.clust.cent = FALSE,
  geom = "point",
  palette = "jco",
  ggtheme = theme_classic(),
  main = 'Actual data'
)

print(db)
## dbscan Pts=879 MinPts=5 eps=0.1
## 0 1
## border 386 36
## seed 0 457
## total 386 493
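# eps = 0.1 is a guess; the usual heuristic is to look for the knee in the
# sorted k-nearest-neighbour distance plot. A sketch, assuming the dbscan
# package is installed (see the commented install.packages() calls above);
# the explicit fpc:: prefixes elsewhere avoid a clash with dbscan::dbscan():
library(dbscan)
kNNdistplot(as.matrix(my_data), k = 5)
abline(h = 0.1, lty = 2) # the eps value used above, for reference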
# DBSCAN + PCA
library("fpc")
set.seed(123)
db.pca <- fpc::dbscan(pc1$x, eps = 10 ^ -1, MinPts = 5)
# Plot DBSCAN results
library("factoextra")
fviz_cluster(
  db.pca,
  data = pc1$x,
  stand = FALSE,
  ellipse = FALSE,
  show.clust.cent = FALSE,
  geom = "point",
  palette = "jco",
  ggtheme = theme_classic(),
  main = 'PCA'
)

print(db.pca)
## dbscan Pts=879 MinPts=5 eps=0.1
## 0 1
## border 229 39
## seed 0 611
## total 229 650
# DBSCAN + SNE
library("fpc")
set.seed(123)
db.sne <- fpc::dbscan(tsne1, eps = 10 ^ -1, MinPts = 5)
# Plot DBSCAN results
library("factoextra")
fviz_cluster(
  db.sne,
  data = tsne1,
  stand = FALSE,
  ellipse = FALSE,
  show.clust.cent = TRUE,
  geom = "point",
  palette = "jco",
  ggtheme = theme_minimal(),
  main = 'SNE'
)

print(db.sne)
## dbscan Pts=879 MinPts=5 eps=0.1
##
## 0
## 879
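# With eps = 0.1 every t-SNE point ends up as noise (cluster 0): the t-SNE
# embedding lives on a much larger coordinate scale than the standardized
# inputs, so eps must be rescaled. A sketch with a larger, assumed radius:
db.sne2 <- fpc::dbscan(tsne1, eps = 2, MinPts = 5)
print(db.sne2)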