Capitulo 6
#Formato y preparación de datos
set.seed(1234)
#500 objetos, divididos en 2 grupos
df <- rbind(cbind(rnorm(200,0,8), rnorm(200,0,8)),
cbind(rnorm(300,50,8), rnorm(300,50,8)))
# Se especifica nombres de columnas y filas
colnames(df) <- c("x", "y")
rownames(df) <- paste0("S", 1:nrow(df))
# Vista previa de los datos
head(df, nrow = 6)
## x y
## S1 -9.656526 3.881815
## S2 2.219434 5.574150
## S3 8.675529 1.484111
## S4 -18.765582 5.605868
## S5 3.432998 2.493448
## S6 4.048447 6.083699
#Estimar el número óptimo de conglomerados
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.2
library(factoextra)
## Cargando paquete requerido: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_nbclust(df, clara, method = "silhouette")+
theme_classic()
# Calcular CLARA
clara.res <- clara(df, 2, samples = 50, pamLike = TRUE)
# Se muestra clara.res
print(clara.res)
## Call: clara(x = df, k = 2, samples = 50, pamLike = TRUE)
## Medoids:
## x y
## S121 -1.531137 1.145057
## S455 48.357304 50.233499
## Objective function: 9.87862
## Clustering vector: Named int [1:500] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "names")= chr [1:500] "S1" "S2" "S3" "S4" "S5" "S6" "S7" ...
## Cluster sizes: 200 300
## Best sample:
## [1] S37 S49 S54 S63 S68 S71 S76 S80 S82 S101 S103 S108 S109 S118 S121
## [16] S128 S132 S138 S144 S162 S203 S210 S216 S231 S234 S249 S260 S261 S286 S299
## [31] S304 S305 S312 S315 S322 S350 S403 S450 S454 S455 S456 S465 S488 S497
##
## Available components:
## [1] "sample" "medoids" "i.med" "clustering" "objective"
## [6] "clusinfo" "diss" "call" "silinfo" "data"
dd <- cbind(df, cluster = clara.res$cluster)
head(dd, n = 4)
## x y cluster
## S1 -9.656526 3.881815 1
## S2 2.219434 5.574150 1
## S3 8.675529 1.484111 1
## S4 -18.765582 5.605868 1
Visualizando clusters CLARA
fviz_cluster(clara.res,
palette = c("#00AFBB", "#FC4E07"), # colores
ellipse.type = "t",
geom = "point", pointsize = 1,
ggtheme = theme_classic()
)
Capitulo 7
Pasos para la Agrupación Jerárquica aglomerativa
# Se cargan los datos
data("USArrests")
# Se estandarizan los datos
df <- scale(USArrests)
# Se muestran los datos
head(df, nrow = 6)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
# Se calcula la matriz
# df = datos estandarizados
res.dist <- dist(df, method = "euclidean")
as.matrix(res.dist)[1:6, 1:6]
## Alabama Alaska Arizona Arkansas California Colorado
## Alabama 0.000000 2.703754 2.293520 1.289810 3.263110 2.651067
## Alaska 2.703754 0.000000 2.700643 2.826039 3.012541 2.326519
## Arizona 2.293520 2.700643 0.000000 2.717758 1.310484 1.365031
## Arkansas 1.289810 2.826039 2.717758 0.000000 3.763641 2.831051
## California 3.263110 3.012541 1.310484 3.763641 0.000000 1.287619
## Colorado 2.651067 2.326519 1.365031 2.831051 1.287619 0.000000
res.hc <- hclust(d = res.dist, method = "ward.D2")
Dendrograma
library("factoextra")
fviz_dend(res.hc, cex = 0.5)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
res.coph <- cophenetic(res.hc)
cor(res.dist, res.coph)
## [1] 0.6975266
res.hc2 <- hclust(res.dist, method = "average")
cor(res.dist, cophenetic(res.hc2))
## [1] 0.7180382
# Se divide en cuatro grupos
grp <- cutree(res.hc, k = 4)
head(grp, n = 4)
## Alabama Alaska Arizona Arkansas
## 1 2 2 3
# Numero de miembros en cada grupo
table(grp)
## grp
## 1 2 3 4
## 7 12 19 12
# Obtener los nombres de los miembros del grupo 1
rownames(df)[grp == 1]
## [1] "Alabama" "Georgia" "Louisiana" "Mississippi"
## [5] "North Carolina" "South Carolina" "Tennessee"
# Se divide en cuatro grupos y se separa por color
fviz_dend(res.hc, k = 4, # Se divide en cuatro grupos
cex = 0.5,
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, # colores de los grupos
rect = TRUE
)
fviz_cluster(list(data = df, cluster = grp),
palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
ellipse.type = "convex",
repel = TRUE,
show.clust.cent = FALSE, ggtheme = theme_minimal())
library("cluster")
# Anidamiento aglomerativo (agrupación jerárquica)
res.agnes <- agnes(x = USArrests,
stand = TRUE, # datos estandarizados
metric = "euclidean",
method = "ward"
)
res.diana <- diana(x = USArrests,
stand = TRUE, # datos estandarizados
metric = "euclidean"
)
fviz_dend(res.agnes, cex = 0.6, k = 4)