R_code.R

memory.limit(50000)

## [1] 50000

set.seed(123456)
library(ggplot2)
library(tsne)
library("cluster")
library("factoextra")

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

library("magrittr")
library(factoextra)
library(NbClust)
library(dendextend)

## 
## ---------------------
## Welcome to dendextend version 1.6.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

library(factoextra)
library(cluster)
library("ggpubr")

## 
## Attaching package: 'ggpubr'

## The following object is masked from 'package:dendextend':
## 
##     rotate

library("mclust")

## Package 'mclust' version 5.4
## Type 'citation("mclust")' for citing this R package in publications.

library("fpc")


# Raw data
rdata = read.delim(file = 'https://www.ebi.ac.uk/~hamedhm/E-ERAD-401-query-results.tpms.tsv',
                                     skip = 4,
                                     check.names = TRUE)
r2    = na.omit(rdata)
r3    = r2[sample(seq(length(r2$Gene.Name)),
                                    size = round(length(r2$Gene.Name) * .05),
                                    # x% of data only
                                    replace = FALSE), ]
r3  = na.omit(r3)
r4  = t(apply(r3[, -c(1:2)], 1, diff))
r4  = apply(r4, 2, function(x) {
    x = na.omit(x)
    x = scale(x)
    #(x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
})
r4 = as.data.frame(r4)
gnames = r3[, 1]
rownames(r4) = gnames

my_data <- r4
summary(my_data)

##  X5.somite.stage    X6.somite.stage     X7.somite.stage   
##  Min.   :-1.00829   Min.   :-29.50470   Min.   :-4.28542  
##  1st Qu.:-0.03986   1st Qu.:  0.02573   1st Qu.:-0.11256  
##  Median :-0.03378   Median :  0.03099   Median :-0.03528  
##  Mean   : 0.00000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.:-0.03075   3rd Qu.:  0.03361   3rd Qu.: 0.04200  
##  Max.   :29.52936   Max.   :  1.21599   Max.   :24.77011  
##  X8.somite.stage      X9.somite.stage     X10.somite.stage    
##  Min.   :-15.826894   Min.   :-22.18664   Min.   :-12.241858  
##  1st Qu.: -0.028767   1st Qu.: -0.04756   1st Qu.: -0.025779  
##  Median : -0.010225   Median : -0.01097   Median : -0.009491  
##  Mean   :  0.000000   Mean   :  0.00000   Mean   :  0.000000  
##  3rd Qu.:  0.008318   3rd Qu.:  0.02562   3rd Qu.:  0.006797  
##  Max.   : 21.443335   Max.   : 12.54061   Max.   : 25.237071  
##  X11.somite.stage    X12.somite.stage   X13.somite.stage   
##  Min.   :-29.42513   Min.   :-3.40204   Min.   :-27.73480  
##  1st Qu.:  0.01465   1st Qu.:-0.04845   1st Qu.:  0.03281  
##  Median :  0.02647   Median :-0.03579   Median :  0.05439  
##  Mean   :  0.00000   Mean   : 0.00000   Mean   :  0.00000  
##  3rd Qu.:  0.03829   3rd Qu.:-0.02314   3rd Qu.:  0.07597  
##  Max.   :  1.49196   Max.   :28.31153   Max.   :  3.33386  
##  X14.somite.stage   X15.somite.stage    X16.somite.stage   
##  Min.   :-1.71733   Min.   :-28.82780   Min.   :-17.98471  
##  1st Qu.:-0.02644   1st Qu.:  0.00643   1st Qu.: -0.01938  
##  Median :-0.01854   Median :  0.02580   Median :  0.01484  
##  Mean   : 0.00000   Mean   :  0.00000   Mean   :  0.00000  
##  3rd Qu.:-0.01064   3rd Qu.:  0.02926   3rd Qu.:  0.08328  
##  Max.   :29.42984   Max.   :  2.40006   Max.   : 10.45183  
##  X17.somite.stage   X18.somite.stage   X19.somite.stage   
##  Min.   :-0.27269   Min.   :-0.39904   Min.   :-29.61108  
##  1st Qu.:-0.03293   1st Qu.:-0.03368   1st Qu.:  0.03073  
##  Median :-0.03155   Median :-0.03183   Median :  0.03197  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   :  0.00000  
##  3rd Qu.:-0.03114   3rd Qu.:-0.03146   3rd Qu.:  0.03445  
##  Max.   :29.61223   Max.   :29.60908   Max.   :  0.32726  
##  X20.somite.stage    X21.somite.stage    X22.somite.stage  
##  Min.   :-29.60551   Min.   :-29.61236   Min.   :-5.92649  
##  1st Qu.:  0.03019   1st Qu.:  0.03135   1st Qu.:-0.04353  
##  Median :  0.03220   Median :  0.03165   Median :-0.02672  
##  Mean   :  0.00000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.:  0.03587   3rd Qu.:  0.03240   3rd Qu.: 0.04051  
##  Max.   :  0.52720   Max.   :  0.23259   Max.   :19.59434  
##  X23.somite.stage   X24.somite.stage    X25.somite.stage   
##  Min.   :-0.45994   Min.   :-29.58460   Min.   :-23.31516  
##  1st Qu.:-0.03350   1st Qu.:  0.03034   1st Qu.:  0.03282  
##  Median :-0.03050   Median :  0.03333   Median :  0.03545  
##  Mean   : 0.00000   Mean   :  0.00000   Mean   :  0.00000  
##  3rd Qu.:-0.03019   3rd Qu.:  0.03632   3rd Qu.:  0.06052  
##  Max.   :29.60146   Max.   :  0.85954   Max.   :  3.92568  
##  X26.somite.stage   X27.somite.stage   X28.somite.stage   
##  Min.   :-8.62408   Min.   :-0.93022   Min.   :-29.58763  
##  1st Qu.:-0.06197   1st Qu.:-0.03107   1st Qu.:  0.03130  
##  Median :-0.02467   Median :-0.02951   Median :  0.03190  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   :  0.00000  
##  3rd Qu.:-0.02280   3rd Qu.:-0.02920   3rd Qu.:  0.03581  
##  Max.   :26.59436   Max.   :29.58166   Max.   :  0.59537  
##  X34.somite.stage    X35.somite.stage    X36.somite.stage  
##  Min.   :-8.937189   Min.   :-29.55033   Min.   :-2.59727  
##  1st Qu.:-0.021046   1st Qu.:  0.02575   1st Qu.:-0.03433  
##  Median :-0.013206   Median :  0.03136   Median :-0.02974  
##  Mean   : 0.000000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.000336   3rd Qu.:  0.03697   3rd Qu.:-0.02516  
##  Max.   :26.057424   Max.   :  1.35608   Max.   :29.42804

# PCA
pc1 = prcomp(x = my_data)
plot(pc1)

summary(pc1)

## Importance of components:
##                           PC1     PC2    PC3     PC4     PC5     PC6
## Standard deviation     4.8237 1.28899 1.0405 0.63766 0.47922 0.40333
## Proportion of Variance 0.8618 0.06154 0.0401 0.01506 0.00851 0.00603
## Cumulative Proportion  0.8618 0.92331 0.9634 0.97848 0.98698 0.99301
##                            PC7     PC8     PC9    PC10    PC11    PC12
## Standard deviation     0.26813 0.20968 0.15442 0.14006 0.11404 0.07181
## Proportion of Variance 0.00266 0.00163 0.00088 0.00073 0.00048 0.00019
## Cumulative Proportion  0.99567 0.99730 0.99818 0.99891 0.99939 0.99958
##                           PC13    PC14    PC15    PC16    PC17  PC18
## Standard deviation     0.06383 0.05072 0.04011 0.03356 0.02760 2e-02
## Proportion of Variance 0.00015 0.00010 0.00006 0.00004 0.00003 1e-05
## Cumulative Proportion  0.99973 0.99983 0.99989 0.99993 0.99996 1e+00
##                           PC19    PC20    PC21     PC22     PC23     PC24
## Standard deviation     0.01723 0.01563 0.01179 0.008018 0.005331 0.004428
## Proportion of Variance 0.00001 0.00001 0.00001 0.000000 0.000000 0.000000
## Cumulative Proportion  0.99998 0.99999 1.00000 1.000000 1.000000 1.000000
##                            PC25     PC26      PC27
## Standard deviation     0.003313 0.002636 0.0004946
## Proportion of Variance 0.000000 0.000000 0.0000000
## Cumulative Proportion  1.000000 1.000000 1.0000000

pc1$x = pc1$x[, 1:5] ### ONLY USE 3 COMPONENTS

library(ggplot2)
ggplot(data = as.data.frame(pc1$x), aes(x = PC1, y = PC2, color = gnames)) +
    geom_point(show.legend = FALSE)

# t-SNE
library(tsne)
colors1 = rainbow(length(unique(rownames(r4))))
names(colors1) = unique(rownames(r4))
ecb = function(x, y) {
    plot(x, t = 'p', pch = '.', col = colors1[rownames(r4)])
    #text(x, labels = r3$Gene.Name, col = colors1[r3$Gene.Name])
}
tsne1 = tsne(
    my_data,
    epoch_callback = ecb,
    max_iter = 500,
    epoch = 100,
    k = 5
) # ONLE 3 DIMENTIONS

## sigma summary: Min. : 0.120373928043866 |1st Qu. : 0.193039226659211 |Median : 0.253734620737499 |Mean : 0.280924845311122 |3rd Qu. : 0.329001567772986 |Max. : 0.971707581501539 |

## Epoch: Iteration #100 error is: 17.3388624231855

## Epoch: Iteration #200 error is: 1.12232892551054

## Epoch: Iteration #300 error is: 1.09224299258535

## Epoch: Iteration #400 error is: 1.08307831877043

## Epoch: Iteration #500 error is: 1.08005979439315

gc()

##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1168257 62.4    2164898 115.7  1770749  94.6
## Vcells 3133605 24.0   10811558  82.5 13514448 103.2

# Cluster tendency measures
# Hopkins statistic: If the value of Hopkins statistic is close to zero (far below 0.5), then we can conclude that the dataset is significantly clusterable
gradient.color <- list(low = "steelblue",  high = "white")
# Normal data
#my_data %>%
get_clust_tendency(my_data,
                                     n = 25,
                                     gradient = gradient.color,
                                     graph = FALSE)

## $hopkins_stat
## [1] 0.007535291
## 
## $plot
## NULL

# PCA
#pc1$x %>%
get_clust_tendency(pc1$x,
                                     n = 25,
                                     gradient = gradient.color,
                                     graph = FALSE)

## $hopkins_stat
## [1] 0.004613762
## 
## $plot
## NULL

# SNE
#tsne1 %>%
get_clust_tendency(tsne1,
                                     n = 25,
                                     gradient = gradient.color,
                                     graph = FALSE)

## $hopkins_stat
## [1] 0.07995601
## 
## $plot
## NULL

#####
# b = svm(x = tsne1, y =  iris$Species)
# colors = rainbow(length(unique(fitted(b))))
# names(colors) = unique(fitted(b))
# plot(tsne1, col = colors[fitted(b)],pch='*')
# points(tsne1,col= colors1[iris$Species],pch='+')
#####


# Correlation plots
library("cluster")
library("factoextra")
library("magrittr")

res.dist <- get_dist(my_data, stand = FALSE, method = "pearson")
fviz_dist(
    res.dist,
    gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"),
    show_labels = FALSE

)

# PCA
res.dist.pca <- get_dist(pc1$x, stand = FALSE, method = "pearson")
fviz_dist(
    res.dist.pca,
    gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"),
    show_labels = FALSE

)

# SNE
res.dist.sne <- get_dist(tsne1, stand = FALSE, method = "pearson")
fviz_dist(
    res.dist.sne,
    gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"),
    show_labels = FALSE
)

gc()

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1213950  64.9    2164898 115.7  2164898 115.7
## Vcells 47377600 361.5   83391293 636.3 83267929 635.3

# Number of cluster estimation
library("factoextra")
k1 = fviz_nbclust(my_data, kmeans, method = "gap_stat")
plot(k1)

k2 = fviz_nbclust(my_data, kmeans, method = "silhouette")
plot(k2)

k3 = fviz_nbclust(my_data, kmeans, method = "wss")
plot(k3)

#pkgs <- c("factoextra",  "NbClust")
#install.packages(pkgs)
library(factoextra)
library(NbClust)
# method
# the cluster analysis method to be used. This should be one of: "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid", "kmeans".
# index
# the index to be calculated. This should be one of : "kl", "ch", "hartigan", "ccc", "scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db", "silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball", "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", "hubert", "sdindex", "dindex", "sdbw", "all" (all indices except GAP, Gamma, Gplus and Tau), "alllong" (all indices with Gap, Gamma, Gplus and Tau included).
nb <- NbClust(
    my_data,
    distance = "manhattan",
    min.nc = 2,
    max.nc = 12,
    method = "kmeans"
)

## Warning in pf(beale, pp, df2): NaNs produced

## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 11 proposed 2 as the best number of clusters 
## * 7 proposed 3 as the best number of clusters 
## * 3 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 1 proposed 11 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************

# Visualize
library(factoextra)
fviz_nbclust(nb, ggtheme = theme_minimal(),main='Actual data')

## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 11 proposed  2 as the best number of clusters
## * 7 proposed  3 as the best number of clusters
## * 3 proposed  4 as the best number of clusters
## * 1 proposed  5 as the best number of clusters
## * 1 proposed  9 as the best number of clusters
## * 1 proposed  11 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

# PCA
nb.pca <- NbClust(
    pc1$x,
    distance = "manhattan",
    min.nc = 2,
    max.nc = 12,
    method = "kmeans"
)

## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 9 proposed 2 as the best number of clusters 
## * 5 proposed 3 as the best number of clusters 
## * 3 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 2 proposed 9 as the best number of clusters 
## * 3 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************

# Visualize
library(factoextra)
fviz_nbclust(nb.pca, ggtheme = theme_minimal(),main='PCA')

## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 9 proposed  2 as the best number of clusters
## * 5 proposed  3 as the best number of clusters
## * 3 proposed  4 as the best number of clusters
## * 1 proposed  5 as the best number of clusters
## * 1 proposed  8 as the best number of clusters
## * 2 proposed  9 as the best number of clusters
## * 3 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

# SNE
nb.sne <- NbClust(
    tsne1,
    distance = "manhattan",
    min.nc = 2,
    max.nc = 12,
    method = "kmeans"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 3 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 5 proposed 6 as the best number of clusters 
## * 3 proposed 8 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 2 proposed 11 as the best number of clusters 
## * 2 proposed 12 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************

# Visualize
library(factoextra)
fviz_nbclust(nb.sne, ggtheme = theme_minimal(),main='SNE')

## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 1 proposed  1 as the best number of clusters
## * 6 proposed  2 as the best number of clusters
## * 3 proposed  3 as the best number of clusters
## * 1 proposed  4 as the best number of clusters
## * 5 proposed  6 as the best number of clusters
## * 3 proposed  8 as the best number of clusters
## * 1 proposed  10 as the best number of clusters
## * 2 proposed  11 as the best number of clusters
## * 2 proposed  12 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

# Kmeans
set.seed(123)
k = 5 # Maximum number of clusters
km.res <- kmeans(my_data, k, nstart = 50)
# Visualize
library("factoextra")
fviz_cluster(
    km.res,
    data = my_data,
    ellipse.type = "convex",
    palette = "jco",
    geom = 'point',
    ggtheme = theme_minimal(),
    main = 'Actual data'
)

### PCA kmean
km.res.pca <- kmeans(pc1$x, k, nstart = 50)
# Visualize
library("factoextra")
fviz_cluster(
    km.res.pca,
    data = pc1$x,
    ellipse.type = "convex",
    palette = "jco",
    geom = 'point',
    ggtheme = theme_minimal(),
    main = 'PCA'
)

### SNE kmean
km.res.sne <- kmeans(tsne1, k, nstart = 50)
# Visualize
library("factoextra")
fviz_cluster(
    km.res.sne,
    data = tsne1,
    ellipse.type = "convex",
    palette = "jco",
    geom = 'point',
    ggtheme = theme_minimal(),
    main='SNE'
)

###

# Compute PAM
library("cluster")
pam.res <- pam(my_data, k)
# Visualize
fviz_cluster(
    pam.res,
    ellipse.type = "convex",
    palette = "jco",
    geom = 'point',
    ggtheme = theme_minimal(),main='Actual'
)

# Compute PCA PAM
library("cluster")
pam.res.pca <- pam(pc1$x, k)
# Visualize
fviz_cluster(
    pam.res.pca,
    ellipse.type = "convex",
    palette = "jco",
    geom = 'point',
    ggtheme = theme_minimal(),main='PCA'
)

# Compute SNE PAM
library("cluster")
pam.res.sne <- pam(tsne1, k)
# Visualize
fviz_cluster(
    pam.res.sne,
    ellipse.type = "convex",
    palette = "jco",
    geom = 'point',
    ggtheme = theme_minimal(),main = 'SNE'
)

# Compute hierarchical clustering
res.hc <-
    hclust(dist(my_data, method = "euclidean") , method = "ward.D2")

# Visualize using factoextra
# Cut in k groups and color by groups
library(factoextra)
fviz_dend(
    res.hc,
    k = k,
    k_colors = "jco",
    type = "circular",
    show_labels = FALSE,
    rect = FALSE,
    main = 'ACTUAL data'
)

#plot(diana(my_data))
gc()

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1642589  87.8    2637877 140.9  2637877 140.9
## Vcells 40427411 308.5   83391293 636.3 83391293 636.3

# Compute hierarchical clustering + PCA
res.hc.pca =    hclust(dist(pc1$x, method = "euclidean"), method = "ward.D2")
library(factoextra)
fviz_dend(
    res.hc.pca,
    k = k,
    k_colors = "jco",
    type = "circular",
    show_labels = FALSE,
    rect = FALSE,
    main = 'PCA'
)

#plot(diana(my_data))
gc()

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1644832  87.9    2637877 140.9  2637877 140.9
## Vcells 40551064 309.4   83391293 636.3 83391293 636.3

# Compute hierarchical clustering + SNE
res.hc.sne = hclust(dist(tsne1, method = "euclidean"), method = "ward.D2")

library(factoextra)
fviz_dend(
    res.hc.sne,
    k = k,
    k_colors = "jco",
    type = "circular",
    show_labels = FALSE,
    rect = FALSE,
    main = 'SNE'
)

#plot(diana(my_data))
gc()

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1649178  88.1    2637877 140.9  2637877 140.9
## Vcells 40631721 310.0   83391293 636.3 83391293 636.3

# # Compare models
# library(dendextend)
# # Compute distance matrix
# res.dist <- dist(my_data, method = "euclidean")
# # Compute 2 hierarchical clusterings
# hc1 <- hclust(res.dist, method = "average")
# hc2 <- hclust(res.dist, method = "ward.D2")
# # Create two dendrograms
# dend1 <- as.dendrogram (hc1)
# dend2 <- as.dendrogram (hc2)
# # Create a list to hold dendrograms
# dend_list <- dendlist(dend1, dend2)
# tanglegram(
#   dend1,
#   dend2,
#   highlight_distinct_edges = FALSE,
#   # Turn-off dashed lines
#   common_subtrees_color_lines = FALSE,
#   # Turn-off line colors
#   common_subtrees_color_branches = TRUE,
#   # Color common branches
#   main = paste("entanglement =", round(entanglement(dend_list), 2))
# )
# #########



# Cut at level 6
#hcut(my_data, 6, graph = TRUE)

#
# library(factoextra)
# fviz_silhouette(res.hc.sne)


# Hybrid kmean
library(factoextra)
res.hk <- hkmeans(my_data, k)
# Visualize the tree
fviz_dend(
    res.hk,
    palette = "jco",
    type = 'circular',
    rect = TRUE,
    rect_border = "jco",
    rect_fill = TRUE,
    show_labels = FALSE,
    main = 'Actual data'
)

# Visualize the hkmeans final clusters
fviz_cluster(
    res.hk,
    palette = "jco",
    repel = TRUE,
    geom = 'point',
    labelsize = 0,
    ggtheme = theme_classic(),
    main = 'Actual data'
)

# Hybrid kmean + PCA
library(factoextra)
res.hk.pca <- hkmeans(pc1$x, k)
fviz_dend(
    res.hk.pca,
    palette = "jco",
    type = 'circular',
    rect = TRUE,
    rect_border = "jco",
    rect_fill = TRUE,
    show_labels = FALSE,
    main = 'PCA'
)

fviz_cluster(
    res.hk.pca,
    palette = "jco",
    repel = TRUE,
    geom = 'point',
    labelsize = 0,
    ggtheme = theme_classic(),
    main = 'PCA'
)

# Hybrid kmean + SNE
library(factoextra)
res.hk.sne <- hkmeans(tsne1, k)
fviz_dend(
    res.hk.pca,
    palette = "jco",
    type = 'circular',
    rect = TRUE,
    rect_border = "jco",
    rect_fill = TRUE,
    show_labels = FALSE,
    main = 'SNE'
)

fviz_cluster(
    res.hk.pca,
    palette = "jco",
    repel = TRUE,
    geom = 'point',
    labelsize = 0,
    ggtheme = theme_classic(),
    main = 'SNE'
)

gc()

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1714417  91.6    2637877 140.9  2637877 140.9
## Vcells 41011831 312.9   83391293 636.3 83391293 636.3

# Fuzzy clustering
library(cluster)
res.fanny = fanny(x = my_data, k = k)
fviz_cluster(
    res.fanny,
    ellipse.type = "norm",
    repel = FALSE,
    geom = 'point',
    palette = "jco",
    ggtheme = theme_minimal(),
    legend = "right",
    main = 'Actual data'
)

fviz_silhouette(
    res.fanny,
    palette = "jco",
    ggtheme = theme_minimal(),
    label = FALSE,
    main = 'Actual data'
)

##   cluster size ave.sil.width
## 1       1  502          0.50
## 2       2   92         -0.20
## 3       3  285         -0.64

# Fuzzy clustering + PCA
library(cluster)
res.fanny.pca = fanny(x = pc1$x, k = k)
fviz_cluster(
    res.fanny.pca,
    ellipse.type = "norm",
    repel = FALSE,
    geom = 'point',
    palette = "jco",
    ggtheme = theme_minimal(),
    legend = "right",
    main = 'PCA'
)

fviz_silhouette(
    res.fanny.pca,
    palette = "jco",
    ggtheme = theme_minimal(),
    label = FALSE,
    main = 'PCA'
)

##   cluster size ave.sil.width
## 1       1  481          0.58
## 2       2  158         -0.20
## 3       3  240         -0.66

# Fuzzy clustering + SNE
library(cluster)
res.fanny.sne = fanny(x = tsne1, k = k)

## Warning in fanny(x = tsne1, k = k): the memberships are all very close to
## 1/k. Maybe decrease 'memb.exp' ?

fviz_cluster(
    res.fanny.sne,
    ellipse.type = "norm",
    repel = FALSE,
    geom = 'point',
    palette = "jco",
    ggtheme = theme_minimal(),
    legend = "right",
    main = 'SNE'
)

## Too few points to calculate an ellipse

fviz_silhouette(
    res.fanny.sne,
    palette = "jco",
    ggtheme = theme_minimal(),
    label = FALSE,
    main = 'SNE'
)

##   cluster size ave.sil.width
## 1       1  864          0.06
## 2       2   13          0.32
## 3       3    2          0.61

# model based clustering
library("ggpubr")
ggscatter(as.data.frame(pc1$x), x = "PC1", y = "PC2") +
    geom_density2d() + # Add 2D density
    labs(title = 'PCA')

library("ggpubr")
ggscatter(as.data.frame(tsne1), x = "V1", y = "V2") +
    geom_density2d() + # Add 2D density
    labs(title = 'SNE')

# Model based Clustering
library("mclust")
mc <- Mclust(my_data)
summary(mc)

## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust VEV (ellipsoidal, equal shape) model with 5 components:
## 
##  log.likelihood   n   df      BIC    ICL
##        78884.35 879 1925 144719.5 144714
## 
## Clustering table:
##   1   2   3   4   5 
## 295 292 186  90  16

#plot(mc,what = 'classification',pch='.')
library(factoextra)
# BIC values used for choosing the number of clusters
fviz_mclust(mc, "BIC", palette = "jco")

# Classification: plot showing the clustering
fviz_mclust(
    mc,
    "classification",
    geom = "point",
    pointsize = 1.5,
    palette = "jco",
    ggtheme = theme_minimal(),
    main = 'Actual data'
)

# Classification uncertainty
fviz_mclust(
    mc,
    "uncertainty",
    palette = "jco",
    ggtheme = theme_minimal(),
    main = 'Actual data'
)

# Model based Clustering + PCA
library("mclust")
mc.pca <- Mclust(pc1$x)
summary(mc.pca)

## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 7 components:
## 
##  log.likelihood   n df      BIC      ICL
##        5954.638 879 62 11488.99 11227.95
## 
## Clustering table:
##   1   2   3   4   5   6   7 
## 146  76  32 152 254 213   6

library(factoextra)
fviz_mclust(mc.pca, "BIC", palette = "jco", main = 'PCA')

fviz_mclust(
    mc.pca,
    "classification",
    geom = "point",
    pointsize = 1.5,
    palette = "jco",
    ggtheme = theme_minimal(),
    main = 'PCA'
)

fviz_mclust(
    mc.pca,
    "uncertainty",
    palette = "jco",
    ggtheme = theme_minimal(),
    main = 'PCA'
)

# Model based Clustering + SNE
library("mclust")
mc.sne <- Mclust(tsne1)
summary(mc.sne)

## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust VEV (ellipsoidal, equal shape) model with 8 components:
## 
##  log.likelihood   n  df       BIC      ICL
##       -16539.76 879 139 -34021.77 -34185.5
## 
## Clustering table:
##   1   2   3   4   5   6   7   8 
##  91 125  47 166 107 107 183  53

library(factoextra)
# BIC values used for choosing the number of clusters
fviz_mclust(mc.sne, "BIC", palette = "jco", main = 'SNE')

# Classification: plot showing the clustering
fviz_mclust(
    mc.sne,
    "classification",
    geom = "point",
    pointsize = 1.5,
    palette = "jco",
    main = 'SNE'
)

# Classification uncertainty
fviz_mclust(mc.sne, "uncertainty", palette = "jco", main = 'SNE')

gc()

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  2200663 117.6    3886542 207.6  3205452 171.2
## Vcells 42393654 323.5   83391293 636.3 83391293 636.3

# DBSCAN
# install.packages("fpc")
# install.packages("dbscan")
# install.packages("factoextra")
library(factoextra)
set.seed(123)
# km.res <- kmeans(my_data, 5, nstart = 25)
# fviz_cluster(
#   km.res,
#   df,
#   geom = "point",
#   ellipse = FALSE,
#   show.clust.cent = FALSE,
#   palette = "jco",
#   ggtheme = theme_classic()
# )

library("fpc")
set.seed(123)
db <- fpc::dbscan(my_data, eps = 10 ^ -1, MinPts = 5)
# Plot DBSCAN results
library("factoextra")
fviz_cluster(
    db,
    data = my_data,
    stand = FALSE,
    ellipse = FALSE,
    show.clust.cent = FALSE,
    geom = "point",
    palette = "jco",
    ggtheme = theme_classic(),
    main = 'Actual data'
)

print(db)

## dbscan Pts=879 MinPts=5 eps=0.1
##          0   1
## border 386  36
## seed     0 457
## total  386 493

# DBSCAN+PCA
library("fpc")
set.seed(123)
db.pca <- fpc::dbscan(pc1$x, eps = 10 ^ -1, MinPts = 5)
# Plot DBSCAN results
library("factoextra")
fviz_cluster(
    db.pca,
    data = pc1$x,
    stand = FALSE,
    ellipse = FALSE,
    show.clust.cent = FALSE,
    geom = "point",
    palette = "jco",
    ggtheme = theme_classic(),
    main = 'PCA'
)

print(db.pca)

## dbscan Pts=879 MinPts=5 eps=0.1
##          0   1
## border 229  39
## seed     0 611
## total  229 650

# DBSCAN+SNE
library("fpc")
set.seed(123)
db.sne <- fpc::dbscan(tsne1, eps = 10 ^ -1, MinPts = 5)
# Plot DBSCAN results
library("factoextra")
fviz_cluster(
    db.sne,
    data = tsne1,
    stand = FALSE,
    ellipse = FALSE,
    show.clust.cent = TRUE,
    geom = "point",
    palette = "jco",
    ggtheme = theme_minimal(),
    main = 'SNE'
)

print(db.sne)

## dbscan Pts=879 MinPts=5 eps=0.1
## 
##   0 
## 879

R_code.R

hamedhm

Wed Jan 17 13:52:36 2018