Agrupación de genes en Ciencia Intensiva: comparación y análisis de tendencia mediante el índice de estabilidad biológica.

# carga de los datos y librerías------------------------------------------------------
library(factoextra)
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggdendro)
library(ggplot2)
library(clValid)
## Loading required package: cluster
## 
## Attaching package: 'clValid'
## The following object is masked from 'package:igraph':
## 
##     clusters
data(mouse)
attach(mouse)
express <- mouse[1:25,c("M1","M2","M3","NC1","NC2","NC3")] #Se separan las variables
rownames(express) <- mouse$ID[1:25] #Se nombra las variables

Dendogramas

# Clústers Jerárquicos 
hc <- hclust(dist(express))  
ag<-agnes(dist(express))
di<-diana(dist(express))
#Hierarchical
ggdendrogram(hc, rotate = FALSE, labels = FALSE, theme_dendro = TRUE)+
  labs(title="Dendograma Hierarchical Clustering")

fviz_dend(x = hc, k = 2, cex = 0.8) +
  labs(title = "Herarchical clustering",
       subtitle = "Distancia euclídea, Lincage complete, K=2")

fviz_dend(x = hc,
          k = 2,
          k_colors = c("#FF3300","#3399FF"),
          color_labels_by_k = TRUE,
          cex = 0.8,
          type = "phylogenic",
          repel = TRUE)

#Dendograma normal
fviz_dend(x = hc, k = 2, cex = 0.6) +
  labs(title = "Herarchical clustering",
       subtitle = "Distancia euclídea, Lincage complete, K=2")

#Dendograma circular
fviz_dend(x = hc,
          k = 4,
          k_colors = c("#2E9FDF", "#FC4E07"),
          color_labels_by_k = TRUE,
          cex = 0.5,
          type = "circular")
## Warning in get_col(col, k): Length of color vector was shorter than the
## number of clusters - color vector was recycled

#Dendograma 
fviz_dend(x = hc,
          k = 2,
          k_colors = c("#2E9FDF","#3399FF"),
          color_labels_by_k = TRUE,
          cex = 0.8,
          type = "phylogenic",
          repel = TRUE)

#Agnes
ggdendrogram(ag, rotate = FALSE, labels = FALSE, theme_dendro = TRUE)+
  labs(title="Dendograma Agnes Clustering") 
## Warning in if (dataClass %in% c("dendrogram", "hclust")) {: the condition
## has length > 1 and only the first element will be used
## Warning in if (dataClass %in% c("dendrogram", "hclust")) {: the condition
## has length > 1 and only the first element will be used

fviz_dend(x = ag, k = 2, cex = 0.6) +
  labs(title = "Agnes clustering",
       subtitle = "Distancia euclídea, Lincage complete, K=2")

#Diana
ggdendrogram(di, rotate = FALSE, labels = FALSE, theme_dendro = TRUE)+
  labs(title="Dendograma Diana Clustering")
## Warning in if (dataClass %in% c("dendrogram", "hclust")) {: the condition
## has length > 1 and only the first element will be used

## Warning in if (dataClass %in% c("dendrogram", "hclust")) {: the condition
## has length > 1 and only the first element will be used

fviz_dend(x = di, k = 2, cex = 0.6) +
  labs(title = "Diana clustering",
       subtitle = "Distancia euclídea, Lincage complete, K=2")

Clústers no Jerárquicos

#Kmeans
km<-kmeans(express,centers = 2, nstart = 50)
fviz_cluster(object = km, data = express, show.clust.cent = TRUE,
             ellipse.type = "euclid", star.plot = TRUE, repel = TRUE) +
  labs(title = "Resultados clustering K-means") +
  theme_bw() +
  theme(legend.position = "none")

#Pam
pam_clusters <- pam(express, k = 2, metric = "manhattan")

fviz_cluster(object = pam_clusters, data = datos, ellipse.type = "t",
             repel = TRUE) +
  theme_bw() +
  labs(title = "Resultados clustering PAM") +
  theme(legend.position = "none")

#Clara
cl<-clara(express,k = 2, metric = "manhattan", stand = TRUE,
          samples = 50, pamLike = TRUE)
fviz_cluster(object = cl, data = express, show.clust.cent = TRUE,
             ellipse.type = "euclid", star.plot = TRUE, repel = TRUE) +
  labs(title = "Resultados clustering CLARA") +
  theme_bw() +
  theme(legend.position = "none")

Indices Internos

clases_aj <- cutree(hc, k = 2)
express$cluster <- clases_aj

ggplot() + geom_point(aes(x = express$M1, y = express$NC3, color = cluster), data = express, size = 2) +
  scale_colour_gradientn(colours=rainbow(3)) +
  ggtitle('Clusters de Datos con k = 2 / Agrupamiento Jerárquico') + 
  xlab('X') + ylab('Y')

metodos<-c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski")

## hierarchical clustering
Dist <- dist(express,method="maximum") # Matriz de distancias
clusterObj <- hclust(Dist, method="average") # Método Cluster hierarchical
nc <- 2 ## number of clusters      
cluster <- cutree(clusterObj,nc) 
connectivity(Dist, cluster) # Indice interno connectivity
## [1] 4.565873
dunn(Dist, cluster)   # Indice interno dunn
## [1] 0.4178642
si<-silhouette(cluster,Dist)
t<-summary(si)     
t$avg.width   # Indice interno silhouette
## [1] 0.5302684

Índices BiolÓgicos

## functional classes predetermined
fc <- tapply(rownames(express),mouse$FC[1:25], c)
fc <- fc[-match( c("EST","Unknown"), names(fc))]
fc <- annotationListToMatrix(fc, rownames(express))
bsi <- numeric(ncol(express))

## Need loop over all removed samples
for (del in 1:ncol(express)) {
  matDel <- express[,-del]               
  DistDel <- dist(matDel,method="manhattan")
  clusterObjDel <- agnes(DistDel, method="average")
  clusterDel <- cutree(clusterObjDel,nc)
  bsi[del] <- BSI(cluster, clusterDel, fc)
}
mean(bsi) # Índice Biológico de Estabilidad
## [1] 0.5952381
## functional classes predetermined
fc <- tapply(rownames(express),mouse$FC[1:25], c)
fc <- fc[-match( c("EST","Unknown"), names(fc))]
fc <- annotationListToMatrix(fc, rownames(express))
BHI(cluster, fc) # Índice Biológico de Homogeneidad
## [1] 0.1552288
# Metodos a utilizar
clmethods <- c("hierarchical","kmeans","agnes","diana","pam","clara")

# Calculo de los índices biológicos
fc <- tapply(rownames(express),mouse$FC[1:25], c)
fc <- fc[-match( c("EST","Unknown"), names(fc))]


bsi<-data.frame("hierarchical"=0,"kmeans"=0,"agnes"=0,"diana"=0,"pam"=0,"clara"=0)
bhi<-data.frame("hierarchical"=0,"kmeans"=0,"agnes"=0,"diana"=0,"pam"=0,"clara"=0)

for (j in 1:6) {
  for (i in 1:24) {
    bio <- clValid(express, i, clMethods=clmethods[j],
                   validation="biological", annotation=fc,metric = "euclidean")
    op<-optimalScores(bio) # Valores óptimos para los índices biológicos
    bhi[i,j]<-op[1,1]
    bsi[i,j]<-op[2,1]
  }
}

bhi<-bhi[1:19,]
bsi<-bsi[1:20,]

#Gráficos de tendencia----------------------------

library(ggplot2)


todo<-c(bhi$hierarchical,bhi$kmeans,bhi$agnes,bhi$diana,bhi$pam,bhi$clara)

nom<-c()
nom[1:19]<-replicate(19,clmethods[1])
nom[20:38]<-replicate(19,clmethods[2])
nom[39:57]<-replicate(19,clmethods[3])
nom[58:76]<-replicate(19,clmethods[4])
nom[77:95]<-replicate(19,clmethods[5])
nom[96:114]<-replicate(19,clmethods[6])

i[1:114]<-1:19
df<-data.frame(id=i,BHI=todo,"Métodos"=nom)
table(df$Métodos)
## 
##        agnes        clara        diana hierarchical       kmeans 
##           19           19           19           19           19 
##          pam 
##           19
ggplot(df,aes(x=df$id,y=df$BHI,group=df$Métodos,colour=Métodos))+geom_line(size=1.4)+
  geom_point(size=2)+ylab("Valores BHI")+xlab("Nº de Clusters")+
  ggtitle("Índice Biológico de Homogeneidad")+
  theme_update()+scale_x_continuous(breaks=seq(0, 19, 1 ))

todo1<-c(bsi$hierarchical,bsi$kmeans,bsi$agnes,bsi$diana,bsi$pam,bsi$clara)
nom<-c()
nom[1:20]<-replicate(20,clmethods[1])
nom[21:40]<-replicate(20,clmethods[2])
nom[41:60]<-replicate(20,clmethods[3])
nom[61:80]<-replicate(20,clmethods[4])
nom[81:100]<-replicate(20,clmethods[5])
nom[101:120]<-replicate(20,clmethods[6])

i[1:120]<-1:20
df<-data.frame(id=i,BSI=todo1,"Métodos"=nom)
table(df$Métodos)
## 
##        agnes        clara        diana hierarchical       kmeans 
##           20           20           20           20           20 
##          pam 
##           20
ggplot(df,aes(x=df$id,y=df$BSI,group=df$Métodos,colour=Métodos))+geom_line(size=1.4)+
  geom_point(size=2)+ylab("Valores BSI")+xlab("Nº de Clusters")+
  ggtitle("Índice Biológico de Estabilidad")+
  theme_update()+scale_x_continuous(breaks=seq(0, 19, 1 ))