Dendogramas
# Clústers Jerárquicos
hc <- hclust(dist(express))
ag<-agnes(dist(express))
di<-diana(dist(express))
#Hierarchical
ggdendrogram(hc, rotate = FALSE, labels = FALSE, theme_dendro = TRUE)+
labs(title="Dendograma Hierarchical Clustering")

fviz_dend(x = hc, k = 2, cex = 0.8) +
labs(title = "Herarchical clustering",
subtitle = "Distancia euclídea, Lincage complete, K=2")

fviz_dend(x = hc,
k = 2,
k_colors = c("#FF3300","#3399FF"),
color_labels_by_k = TRUE,
cex = 0.8,
type = "phylogenic",
repel = TRUE)

#Dendograma normal
fviz_dend(x = hc, k = 2, cex = 0.6) +
labs(title = "Herarchical clustering",
subtitle = "Distancia euclídea, Lincage complete, K=2")

#Dendograma circular
fviz_dend(x = hc,
k = 4,
k_colors = c("#2E9FDF", "#FC4E07"),
color_labels_by_k = TRUE,
cex = 0.5,
type = "circular")
## Warning in get_col(col, k): Length of color vector was shorter than the
## number of clusters - color vector was recycled

#Dendograma
fviz_dend(x = hc,
k = 2,
k_colors = c("#2E9FDF","#3399FF"),
color_labels_by_k = TRUE,
cex = 0.8,
type = "phylogenic",
repel = TRUE)

#Agnes
ggdendrogram(ag, rotate = FALSE, labels = FALSE, theme_dendro = TRUE)+
labs(title="Dendograma Agnes Clustering")
## Warning in if (dataClass %in% c("dendrogram", "hclust")) {: the condition
## has length > 1 and only the first element will be used
## Warning in if (dataClass %in% c("dendrogram", "hclust")) {: the condition
## has length > 1 and only the first element will be used

fviz_dend(x = ag, k = 2, cex = 0.6) +
labs(title = "Agnes clustering",
subtitle = "Distancia euclídea, Lincage complete, K=2")

#Diana
ggdendrogram(di, rotate = FALSE, labels = FALSE, theme_dendro = TRUE)+
labs(title="Dendograma Diana Clustering")
## Warning in if (dataClass %in% c("dendrogram", "hclust")) {: the condition
## has length > 1 and only the first element will be used
## Warning in if (dataClass %in% c("dendrogram", "hclust")) {: the condition
## has length > 1 and only the first element will be used

fviz_dend(x = di, k = 2, cex = 0.6) +
labs(title = "Diana clustering",
subtitle = "Distancia euclídea, Lincage complete, K=2")

Clústers no Jerárquicos
#Kmeans
km<-kmeans(express,centers = 2, nstart = 50)
fviz_cluster(object = km, data = express, show.clust.cent = TRUE,
ellipse.type = "euclid", star.plot = TRUE, repel = TRUE) +
labs(title = "Resultados clustering K-means") +
theme_bw() +
theme(legend.position = "none")

#Pam
pam_clusters <- pam(express, k = 2, metric = "manhattan")
fviz_cluster(object = pam_clusters, data = datos, ellipse.type = "t",
repel = TRUE) +
theme_bw() +
labs(title = "Resultados clustering PAM") +
theme(legend.position = "none")

#Clara
cl<-clara(express,k = 2, metric = "manhattan", stand = TRUE,
samples = 50, pamLike = TRUE)
fviz_cluster(object = cl, data = express, show.clust.cent = TRUE,
ellipse.type = "euclid", star.plot = TRUE, repel = TRUE) +
labs(title = "Resultados clustering CLARA") +
theme_bw() +
theme(legend.position = "none")

Indices Internos
clases_aj <- cutree(hc, k = 2)
express$cluster <- clases_aj
ggplot() + geom_point(aes(x = express$M1, y = express$NC3, color = cluster), data = express, size = 2) +
scale_colour_gradientn(colours=rainbow(3)) +
ggtitle('Clusters de Datos con k = 2 / Agrupamiento Jerárquico') +
xlab('X') + ylab('Y')

metodos<-c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski")
## hierarchical clustering
Dist <- dist(express,method="maximum") # Matriz de distancias
clusterObj <- hclust(Dist, method="average") # Método Cluster hierarchical
nc <- 2 ## number of clusters
cluster <- cutree(clusterObj,nc)
connectivity(Dist, cluster) # Indice interno connectivity
## [1] 4.565873
dunn(Dist, cluster) # Indice interno dunn
## [1] 0.4178642
si<-silhouette(cluster,Dist)
t<-summary(si)
t$avg.width # Indice interno silhouette
## [1] 0.5302684
Índices BiolÓgicos
## functional classes predetermined
fc <- tapply(rownames(express),mouse$FC[1:25], c)
fc <- fc[-match( c("EST","Unknown"), names(fc))]
fc <- annotationListToMatrix(fc, rownames(express))
bsi <- numeric(ncol(express))
## Need loop over all removed samples
for (del in 1:ncol(express)) {
matDel <- express[,-del]
DistDel <- dist(matDel,method="manhattan")
clusterObjDel <- agnes(DistDel, method="average")
clusterDel <- cutree(clusterObjDel,nc)
bsi[del] <- BSI(cluster, clusterDel, fc)
}
mean(bsi) # Índice Biológico de Estabilidad
## [1] 0.5952381
## functional classes predetermined
fc <- tapply(rownames(express),mouse$FC[1:25], c)
fc <- fc[-match( c("EST","Unknown"), names(fc))]
fc <- annotationListToMatrix(fc, rownames(express))
BHI(cluster, fc) # Índice Biológico de Homogeneidad
## [1] 0.1552288
# Metodos a utilizar
clmethods <- c("hierarchical","kmeans","agnes","diana","pam","clara")
# Calculo de los índices biológicos
fc <- tapply(rownames(express),mouse$FC[1:25], c)
fc <- fc[-match( c("EST","Unknown"), names(fc))]
bsi<-data.frame("hierarchical"=0,"kmeans"=0,"agnes"=0,"diana"=0,"pam"=0,"clara"=0)
bhi<-data.frame("hierarchical"=0,"kmeans"=0,"agnes"=0,"diana"=0,"pam"=0,"clara"=0)
for (j in 1:6) {
for (i in 1:24) {
bio <- clValid(express, i, clMethods=clmethods[j],
validation="biological", annotation=fc,metric = "euclidean")
op<-optimalScores(bio) # Valores óptimos para los índices biológicos
bhi[i,j]<-op[1,1]
bsi[i,j]<-op[2,1]
}
}
bhi<-bhi[1:19,]
bsi<-bsi[1:20,]
#Gráficos de tendencia----------------------------
library(ggplot2)
todo<-c(bhi$hierarchical,bhi$kmeans,bhi$agnes,bhi$diana,bhi$pam,bhi$clara)
nom<-c()
nom[1:19]<-replicate(19,clmethods[1])
nom[20:38]<-replicate(19,clmethods[2])
nom[39:57]<-replicate(19,clmethods[3])
nom[58:76]<-replicate(19,clmethods[4])
nom[77:95]<-replicate(19,clmethods[5])
nom[96:114]<-replicate(19,clmethods[6])
i[1:114]<-1:19
df<-data.frame(id=i,BHI=todo,"Métodos"=nom)
table(df$Métodos)
##
## agnes clara diana hierarchical kmeans
## 19 19 19 19 19
## pam
## 19
ggplot(df,aes(x=df$id,y=df$BHI,group=df$Métodos,colour=Métodos))+geom_line(size=1.4)+
geom_point(size=2)+ylab("Valores BHI")+xlab("Nº de Clusters")+
ggtitle("Índice Biológico de Homogeneidad")+
theme_update()+scale_x_continuous(breaks=seq(0, 19, 1 ))

todo1<-c(bsi$hierarchical,bsi$kmeans,bsi$agnes,bsi$diana,bsi$pam,bsi$clara)
nom<-c()
nom[1:20]<-replicate(20,clmethods[1])
nom[21:40]<-replicate(20,clmethods[2])
nom[41:60]<-replicate(20,clmethods[3])
nom[61:80]<-replicate(20,clmethods[4])
nom[81:100]<-replicate(20,clmethods[5])
nom[101:120]<-replicate(20,clmethods[6])
i[1:120]<-1:20
df<-data.frame(id=i,BSI=todo1,"Métodos"=nom)
table(df$Métodos)
##
## agnes clara diana hierarchical kmeans
## 20 20 20 20 20
## pam
## 20
ggplot(df,aes(x=df$id,y=df$BSI,group=df$Métodos,colour=Métodos))+geom_line(size=1.4)+
geom_point(size=2)+ylab("Valores BSI")+xlab("Nº de Clusters")+
ggtitle("Índice Biológico de Estabilidad")+
theme_update()+scale_x_continuous(breaks=seq(0, 19, 1 ))
