capitulo 4:k-means clustering (agrupamiento de significancia K)
# Adjuntar los datos de arrestos de Estados unidos, escalar los datos y visualizar filas
data("USArrests")
df <- scale(USArrests)
head(df, n = 3)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
# Adjuntar librerias y visualizar los cluster
library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_nbclust(df, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2) +
labs(title = "Número óptimo de Clusters - Método del Codo", x = "Número de Clusters", y = "Suma de Cuadrados Totales")

set.seed(123)
km.res <- kmeans(df, 4, nstart = 25)
print(km.res)
## K-means clustering with 4 clusters of sizes 8, 13, 16, 13
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 1.4118898 0.8743346 -0.8145211 0.01927104
## 2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 3 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 4 0.6950701 1.0394414 0.7226370 1.27693964
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 4 4 1 4
## Colorado Connecticut Delaware Florida Georgia
## 4 3 3 4 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 2 4 3 2
## Kansas Kentucky Louisiana Maine Maryland
## 3 2 1 2 4
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 4 2 1 4
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 4 2 3
## New Mexico New York North Carolina North Dakota Ohio
## 4 4 1 2 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 4 3 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 8.316061 11.952463 16.212213 19.922437
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Determinar la media de las variables
aggregate(USArrests, by = list(cluster = km.res$cluster), mean)
## cluster Murder Assault UrbanPop Rape
## 1 1 13.93750 243.62500 53.75000 21.41250
## 2 2 3.60000 78.53846 52.07692 12.17692
## 3 3 5.65625 138.87500 73.87500 18.78125
## 4 4 10.81538 257.38462 76.00000 33.19231
# Mostrar el cluster para cada observacion
dd <- cbind(USArrests, cluster = km.res$cluster)
head(dd)
## Murder Assault UrbanPop Rape cluster
## Alabama 13.2 236 58 21.2 1
## Alaska 10.0 263 48 44.5 4
## Arizona 8.1 294 80 31.0 4
## Arkansas 8.8 190 50 19.5 1
## California 9.0 276 91 40.6 4
## Colorado 7.9 204 78 38.7 4
# Determinar el numero, el valor del vector, el tamaƱo y media de los clusters en la observaciones
km.res$cluster
## Alabama Alaska Arizona Arkansas California
## 1 4 4 1 4
## Colorado Connecticut Delaware Florida Georgia
## 4 3 3 4 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 2 4 3 2
## Kansas Kentucky Louisiana Maine Maryland
## 3 2 1 2 4
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 4 2 1 4
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 4 2 3
## New Mexico New York North Carolina North Dakota Ohio
## 4 4 1 2 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 4 3 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 2 2 3
head(km.res$cluster, 4)
## Alabama Alaska Arizona Arkansas
## 1 4 4 1
km.res$size
## [1] 8 13 16 13
km.res$centers
## Murder Assault UrbanPop Rape
## 1 1.4118898 0.8743346 -0.8145211 0.01927104
## 2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 3 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 4 0.6950701 1.0394414 0.7226370 1.27693964
# Visualizar los clusters
fviz_cluster(km.res, data = df,
palette = c("#FF0000", "#0000FF", "#000000", "#C870FF"),
ellipse.type = "euclid", # Elipse de concentración
star.plot = TRUE, # AƱadir segmentos desde los centroides hasta los elementos
repel = TRUE, # Evitar sobreposiciones en las etiquetas (mƔs lento)
ggtheme = theme_minimal()) +
labs(title = "Visualización de Clusters K-Means")

Capitulo 5: K-medoids
library("cluster")
# Estimar numero de cluster por el metodo silhouette
fviz_nbclust(df, pam, method = "silhouette") +
theme_bw() +
labs(title = "Selección del NĆŗmero Ćptimo de Clusters")

pam.res <- pam(df, 2)
print(pam.res)
## Medoids:
## ID Murder Assault UrbanPop Rape
## New Mexico 31 0.8292944 1.3708088 0.3081225 1.1603196
## Nebraska 27 -0.8008247 -0.8250772 -0.2445636 -0.5052109
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 1 1 2 1
## Colorado Connecticut Delaware Florida Georgia
## 1 2 2 1 1
## Hawaii Idaho Illinois Indiana Iowa
## 2 2 1 2 2
## Kansas Kentucky Louisiana Maine Maryland
## 2 2 1 2 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 2 1 2 1 1
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 1 2 2
## New Mexico New York North Carolina North Dakota Ohio
## 1 1 1 2 2
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 2 2 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 1 2 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 2 2 2 2 2
## Objective function:
## build swap
## 1.441358 1.368969
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
# Vizualisar medoids y grupos y aƱadir el numero de cluster a las obsevaciones
pam.res$medoids
## Murder Assault UrbanPop Rape
## New Mexico 0.8292944 1.3708088 0.3081225 1.1603196
## Nebraska -0.8008247 -0.8250772 -0.2445636 -0.5052109
head(pam.res$clustering)
## Alabama Alaska Arizona Arkansas California Colorado
## 1 1 1 2 1 1
dd <- cbind(USArrests, cluster = pam.res$cluster)
head(dd)
## Murder Assault UrbanPop Rape cluster
## Alabama 13.2 236 58 21.2 1
## Alaska 10.0 263 48 44.5 1
## Arizona 8.1 294 80 31.0 1
## Arkansas 8.8 190 50 19.5 2
## California 9.0 276 91 40.6 1
## Colorado 7.9 204 78 38.7 1
# Visualizar de los clusters
fviz_cluster(pam.res, palette = c("#FF6347", "#87CEEB"), ellipse.type = "t", repel = TRUE) +
theme_minimal() +
labs(title = "Visualización de los Clusters")
