A23

Capitulo 6

#Formato y preparación de datos

set.seed(1234)
#500 objetos, divididos en 2 grupos
df <- rbind(cbind(rnorm(200,0,8), rnorm(200,0,8)),
cbind(rnorm(300,50,8), rnorm(300,50,8)))

# Se especifica nombres de columnas y filas
colnames(df) <- c("x", "y")

rownames(df) <- paste0("S", 1:nrow(df))
# Vista previa de los datos
head(df, nrow = 6)

##             x        y
## S1  -9.656526 3.881815
## S2   2.219434 5.574150
## S3   8.675529 1.484111
## S4 -18.765582 5.605868
## S5   3.432998 2.493448
## S6   4.048447 6.083699

#Estimar el número óptimo de conglomerados

library(cluster)

## Warning: package 'cluster' was built under R version 4.4.2

library(factoextra)

## Cargando paquete requerido: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

fviz_nbclust(df, clara, method = "silhouette")+
theme_classic()

# Calcular CLARA
clara.res <- clara(df, 2, samples = 50, pamLike = TRUE)
# Se muestra clara.res
print(clara.res)

## Call:     clara(x = df, k = 2, samples = 50, pamLike = TRUE) 
## Medoids:
##              x         y
## S121 -1.531137  1.145057
## S455 48.357304 50.233499
## Objective function:   9.87862
## Clustering vector:    Named int [1:500] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "names")= chr [1:500] "S1" "S2" "S3" "S4" "S5" "S6" "S7" ...
## Cluster sizes:            200 300 
## Best sample:
##  [1] S37  S49  S54  S63  S68  S71  S76  S80  S82  S101 S103 S108 S109 S118 S121
## [16] S128 S132 S138 S144 S162 S203 S210 S216 S231 S234 S249 S260 S261 S286 S299
## [31] S304 S305 S312 S315 S322 S350 S403 S450 S454 S455 S456 S465 S488 S497
## 
## Available components:
##  [1] "sample"     "medoids"    "i.med"      "clustering" "objective" 
##  [6] "clusinfo"   "diss"       "call"       "silinfo"    "data"

dd <- cbind(df, cluster = clara.res$cluster)
head(dd, n = 4)

##             x        y cluster
## S1  -9.656526 3.881815       1
## S2   2.219434 5.574150       1
## S3   8.675529 1.484111       1
## S4 -18.765582 5.605868       1

Visualizando clusters CLARA

fviz_cluster(clara.res,

palette = c("#00AFBB", "#FC4E07"), # colores
ellipse.type = "t", 
geom = "point", pointsize = 1,
ggtheme = theme_classic()
)

Capitulo 7

Pasos para la Agrupación Jerárquica aglomerativa

# Se cargan los datos
data("USArrests")
# Se estandarizan los datos
df <- scale(USArrests)
# Se muestran los datos
head(df, nrow = 6)

##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292
## Colorado   0.02571456 0.3988593  0.8608085  1.864967207

# Se calcula la matriz
# df = datos estandarizados
res.dist <- dist(df, method = "euclidean")
as.matrix(res.dist)[1:6, 1:6]

##             Alabama   Alaska  Arizona Arkansas California Colorado
## Alabama    0.000000 2.703754 2.293520 1.289810   3.263110 2.651067
## Alaska     2.703754 0.000000 2.700643 2.826039   3.012541 2.326519
## Arizona    2.293520 2.700643 0.000000 2.717758   1.310484 1.365031
## Arkansas   1.289810 2.826039 2.717758 0.000000   3.763641 2.831051
## California 3.263110 3.012541 1.310484 3.763641   0.000000 1.287619
## Colorado   2.651067 2.326519 1.365031 2.831051   1.287619 0.000000

res.hc <- hclust(d = res.dist, method = "ward.D2")

Dendrograma

library("factoextra")
fviz_dend(res.hc, cex = 0.5)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

res.coph <- cophenetic(res.hc)
cor(res.dist, res.coph)

## [1] 0.6975266

res.hc2 <- hclust(res.dist, method = "average")
cor(res.dist, cophenetic(res.hc2))

## [1] 0.7180382

# Se divide en cuatro grupos
grp <- cutree(res.hc, k = 4)
head(grp, n = 4)

##  Alabama   Alaska  Arizona Arkansas 
##        1        2        2        3

# Numero de miembros en cada grupo
table(grp)

## grp
##  1  2  3  4 
##  7 12 19 12

# Obtener los nombres de los miembros del grupo 1
rownames(df)[grp == 1]

## [1] "Alabama"        "Georgia"        "Louisiana"      "Mississippi"   
## [5] "North Carolina" "South Carolina" "Tennessee"

# Se divide en cuatro grupos y se separa por color
fviz_dend(res.hc, k = 4, # Se divide en cuatro grupos

cex = 0.5,
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, # colores de los grupos
rect = TRUE 
)

fviz_cluster(list(data = df, cluster = grp),

palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
ellipse.type = "convex",
repel = TRUE, 
show.clust.cent = FALSE, ggtheme = theme_minimal())

library("cluster")
# Anidamiento aglomerativo (agrupación jerárquica)
res.agnes <- agnes(x = USArrests,
stand = TRUE, # datos estandarizados
metric = "euclidean", 
method = "ward"
)

res.diana <- diana(x = USArrests,
stand = TRUE, # datos estandarizados
metric = "euclidean"
)
fviz_dend(res.agnes, cex = 0.6, k = 4)

A23

GABRIEL ARTURO SANCHEZ HENRIQUEZ

2024-11-18