Análisis de Clúster (Conglomerados)
UNIVERSIDAD DE EL SALVADOR
FACULTAD DE CIENCIAS ECONÓMICAS
ESCUELA DE ECONOMÍA
Ciclo II - 2025
“Tarea A32: Aplicación: Análisis de Clúster (Conglomerados)”
Asignatura: Métodos para el Análisis Económico
Grupo teórico: GT-01
Docente: MSF Carlos Ademir Pérez Alas
Integrantes:
Jaime Elías Barahona Martínez BM23012
Jefferson Josué Sandoval Pérez SP23029
Mariana Ivette Zelaya Castillo ZC23003
Ciudad Universitaria, San Salvador – 30 de noviembre de 2025
1. Carga de datos y preparación
# Cargar librerías necesarias
library(NbClust)
library(cluster)
library(factoextra)
library(ggplot2)
library(dplyr)
# Crear el dataframe con los datos del cuadro 3.22
Datos_3_3_Caso <- data.frame(
id = 1:18,
CCAA = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias",
"Cantabria", "Castilla y León", "Cast.-La Mancha", "Cataluña",
"Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia",
"Navarra", "País Vasco", "La Rioja"),
automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4,
72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
tvcolor = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1,
98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1,
68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8,
26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.8, 11.2, 14.0, 7.1, 19.8,
12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2,
84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
)
# Preparar datos para el análisis (solo las variables numéricas)
datos_analisis <- Datos_3_3_Caso[, c("automovil", "tvcolor", "video",
"microondas", "lavavajillas", "telefono")]
rownames(datos_analisis) <- Datos_3_3_Caso$CCAA
print(Datos_3_3_Caso)
## id CCAA automovil tvcolor video microondas lavavajillas telefono
## 1 1 España 69.0 97.6 62.4 32.3 17.0 85.2
## 2 2 Andalucía 66.7 98.0 82.7 24.1 12.7 74.7
## 3 3 Aragón 67.2 97.5 56.8 43.4 20.6 88.4
## 4 4 Asturias 63.7 95.2 52.1 24.4 13.3 88.1
## 5 5 Baleares 71.9 98.8 62.4 29.8 10.1 87.9
## 6 6 Canarias 72.7 96.8 68.4 27.9 5.8 75.4
## 7 7 Cantabria 63.4 94.9 48.9 36.5 11.2 80.5
## 8 8 Castilla y León 65.8 97.1 47.7 28.1 14.0 85.0
## 9 9 Cast.-La Mancha 61.5 97.3 53.6 21.7 7.1 72.9
## 10 10 Cataluña 70.4 98.1 71.1 36.8 19.8 92.2
## 11 11 Com. Valenciana 72.7 98.4 68.2 26.6 12.1 84.4
## 12 12 Extremadura 60.5 97.7 43.7 20.7 11.7 67.1
## 13 13 Galicia 65.5 91.3 42.7 13.5 14.6 85.9
## 14 14 Madrid 74.0 99.4 76.3 53.9 32.3 95.7
## 15 15 Murcia 69.0 98.7 59.3 19.5 12.1 81.4
## 16 16 Navarra 76.4 99.3 60.6 44.0 20.6 87.4
## 17 17 País Vasco 71.3 98.3 61.6 45.7 23.7 94.3
## 18 18 La Rioja 64.9 98.6 54.4 44.4 17.6 83.4
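Nota: todas las variables están medidas como porcentajes de hogares, de modo que el análisis se realiza sobre los datos originales. Solo como referencia, un bosquejo mínimo de la alternativa estandarizada (no se utiliza en el resto del documento):
# Bosquejo opcional: variables estandarizadas (media 0, desviación típica 1);
# útil cuando las escalas de las variables difieren mucho entre sí
datos_estandarizados <- scale(datos_analisis)
round(head(datos_estandarizados), 2)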
2. Detección de outliers mediante distancia de Mahalanobis
# Calcular distancias de Mahalanobis
mahal_dist <- mahalanobis(datos_analisis,
colMeans(datos_analisis),
cov(datos_analisis))
# Calcular p-valores (chi-cuadrado con 6 grados de libertad)
p_values <- pchisq(mahal_dist, df = 6, lower.tail = FALSE)
# Crear tabla de resultados (Cuadro 3.23)
resultados_outliers <- data.frame(
CCAA = Datos_3_3_Caso$CCAA,
D2 = round(mahal_dist, 2),
p_value = round(p_values, 2)
)
print("Cuadro 3.23: Resultados de la detección de outliers")## [1] "Cuadro 3.23: Resultados de la detección de outliers"
## CCAA D2 p_value
## España España 0.20 1.00
## Andalucía Andalucía 10.52 0.10
## Aragón Aragón 1.91 0.93
## Asturias Asturias 4.46 0.61
## Baleares Baleares 5.70 0.46
## Canarias Canarias 9.58 0.14
## Cantabria Cantabria 7.29 0.29
## Castilla y León Castilla y León 2.21 0.90
## Cast.-La Mancha Cast.-La Mancha 3.54 0.74
## Cataluña Cataluña 2.95 0.82
## Com. Valenciana Com. Valenciana 2.65 0.85
## Extremadura Extremadura 10.43 0.11
## Galicia Galicia 13.24 0.04
## Madrid Madrid 8.31 0.22
## Murcia Murcia 4.88 0.56
## Navarra Navarra 7.65 0.26
## País Vasco País Vasco 2.32 0.89
## La Rioja La Rioja 4.17 0.65
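Como complemento, un bosquejo mínimo (suponiendo el umbral convencional de p < 0.05) para señalar de forma explícita las posibles observaciones atípicas; con estos resultados solo Galicia quedaría marcada (p = 0.04):
# Marcar como posible outlier toda comunidad con p-valor por debajo del umbral elegido
umbral <- 0.05
resultados_outliers$posible_outlier <- resultados_outliers$p_value < umbral
subset(resultados_outliers, posible_outlier)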
3. Análisis de conglomerados jerárquicos
# Calcular matriz de distancias euclídeas
matriz.dis.euclid.caso3 <- dist(datos_analisis, method = "euclidean")
# Aplicar diferentes métodos de conglomeración
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "ward.D2")
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3, method = "complete")
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")
hclust.single.caso3 <- hclust(matriz.dis.euclid.caso3, method = "single")
hclust.centroid.caso3 <- hclust(matriz.dis.euclid.caso3, method = "centroid")
# Explorar la estructura de fusiones del resultado average
print("Estructura hclust.average.caso3:")
## [1] "Estructura hclust.average.caso3:"
# Instrucción reconstruida: alturas de fusión ($height) y elementos/grupos unidos ($merge)
data.frame(height = hclust.average.caso3$height, merge = hclust.average.caso3$merge)
## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 9.339233 -1 3
## 5 9.387225 -16 -17
## 6 12.380548 1 5
## 7 12.448695 -9 -12
## 8 12.659790 -7 2
## 9 13.331971 -15 4
## 10 14.684330 -6 9
## 11 16.616569 -10 6
## 12 18.923865 -13 8
## 13 20.140313 7 12
## 14 22.109739 10 11
## 15 26.042489 13 14
## 16 31.452385 -2 15
## 17 39.104937 -14 16
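Antes de graficar los dendrogramas, un bosquejo opcional: representar las alturas de fusión en orden decreciente ayuda a localizar saltos grandes, que sugieren dónde cortar el árbol (se usa el objeto del método Ward ya calculado):
# Bosquejo: alturas de fusión del método Ward, de la última unión a la primera;
# un salto pronunciado entre etapas sugiere un corte razonable del dendrograma
plot(rev(hclust.ward.caso3$height), type = "b", pch = 19,
     xlab = "Etapa de fusión (de la última a la primera)",
     ylab = "Altura de fusión (Ward)")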
# Visualizar dendrogramas (Figura 3.13)
par(mfrow = c(2, 3))
plot(hclust.ward.caso3, main = "Método Ward", hang = -1, cex = 0.6)
plot(hclust.complete.caso3, main = "Método Complete", hang = -1, cex = 0.6)
plot(hclust.average.caso3, main = "Método Average", hang = -1, cex = 0.6)
plot(hclust.single.caso3, main = "Método Single", hang = -1, cex = 0.6)
plot(hclust.centroid.caso3, main = "Método Centroid", hang = -1, cex = 0.6)
par(mfrow = c(1, 1))
# Los mismos dendrogramas anteriores, pero uno por uno, para mejor visualización
plot(hclust.ward.caso3, main = "Método Ward", hang = -1, cex = 0.6)
4. Determinación del número óptimo de clusters
# Preparar datos para NbClust
Datos.NbClust <- Datos_3_3_Caso[, c("automovil", "tvcolor", "video",
"microondas", "lavavajillas", "telefono")]
# Aplicar NbClust para diferentes métodos
set.seed(123)
res.wardD2 <- NbClust(Datos.NbClust, distance = "euclidean",
min.nc = 2, max.nc = 15, method = "ward.D2", index = "all")## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 5 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 6 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
res.complete <- NbClust(Datos.NbClust, distance = "euclidean",
min.nc = 2, max.nc = 15, method = "complete", index = "all")## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 5 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 6 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
res.average <- NbClust(Datos.NbClust, distance = "euclidean",
min.nc = 2, max.nc = 15, method = "average", index = "all")## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 5 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 4 proposed 4 as the best number of clusters
## * 3 proposed 13 as the best number of clusters
## * 6 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## [1] "Número óptimo de clusters según Ward.D2:"
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 3.0000 15.0000 3.0000 2.000 13.000 3.00000e+00 3.0
## Value_Index 3.0067 16.3636 5.0086 13.675 496.289 2.72309e+14 417427.7
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.00 1.500000e+01 3.0000 2.0000 15.0000 15.0000 2.0000
## Value_Index 820.35 1.237259e+16 -21.4006 0.3702 0.2904 0.7667 0.5504
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 3.0000 3.000 3.000 4.0000 1 2.0000
## Value_Index 8.1671 -2.3568 0.434 1038.568 0.5195 NA 0.6077
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 15.0000 0 7.0000 0 15.0000
## Value_Index 1.0795 0 0.1632 0 0.0087
## [1] "Número óptimo de clusters según Complete:"
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 3.0000 15.0000 3.0000 2.000 13.000 3.00000e+00 3.0
## Value_Index 3.0067 16.3636 5.0086 13.675 496.289 2.72309e+14 417427.7
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.00 1.500000e+01 3.0000 2.0000 15.0000 15.0000 2.0000
## Value_Index 820.35 1.237259e+16 -21.4006 0.3702 0.2904 0.7667 0.5504
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 3.0000 3.000 3.000 5.0000 1 2.0000
## Value_Index 8.1671 -2.3568 0.434 1038.568 0.5241 NA 0.6077
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 15.0000 0 7.0000 0 15.0000
## Value_Index 1.0795 0 0.1632 0 0.0087
## [1] "Número óptimo de clusters según Average:"
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 13.0000 15.0000 3.0000 2.0000 13.0000 4.000000e+00 3.0
## Value_Index 2.9006 16.3636 7.0163 12.5207 473.7819 9.861949e+14 330725.1
## TraceW Friedman Rubin Cindex DB Silhouette
## Number_clusters 4.0000 1.500000e+01 13.0000 3.0000 15.0000 15.0000
## Value_Index 694.2515 1.208339e+16 -218.5282 0.4862 0.2904 0.7667
## Duda PseudoT2 Beale Ratkowsky Ball PtBiserial Frey
## Number_clusters 2.0000 2.0000 2.0000 4.0000 3.000 4.0000 3.0000
## Value_Index 2.0294 -7.6087 -1.8296 0.3712 1057.447 0.5395 1.4791
## McClain Dunn Hubert SDindex Dindex SDbw
## Number_clusters 2.0000 15.0000 0 3.0000 0 15.0000
## Value_Index 0.0738 1.0795 0 0.1368 0 0.0087
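Como contraste gráfico de la elección del número de grupos, un bosquejo con factoextra (ya cargada), aplicando los criterios del codo y de la silueta al corte jerárquico:
# Bosquejo: suma de cuadrados intra-grupo (codo) y silueta media según el número
# de clusters, usando el corte jerárquico hcut() de factoextra
fviz_nbclust(datos_analisis, FUNcluster = hcut, method = "wss", k.max = 10)
fviz_nbclust(datos_analisis, FUNcluster = hcut, method = "silhouette", k.max = 10)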
5. Obtención de centroides del método jerárquico
library(kableExtra)
# Asignar clusters usando Ward (2 grupos)
grupo.ward <- cutree(hclust.ward.caso3, k = 2)
# Añadir clusters al dataframe
datos.caso3.grupos <- cbind(Datos_3_3_Caso, grupo.ward)
datos.caso3.grupos$id <- NULL
datos.caso3.grupos$CCAA <- NULL
# Calcular los centroides (medias por grupo) de la solución jerárquica
datos.caso3 <- round(aggregate(datos.caso3.grupos, list(grupo.ward), mean), 2)
head(datos.caso3) %>%
  kable(caption = "Grupos: Centroides resultantes del metodo jerarquico", align = "c", digits = 2) %>%
  kable_classic(html_font = "Montserrat", font_size = 14) %>%
  kable_styling()
| Group.1 | automovil | tvcolor | video | microondas | lavavajillas | telefono | grupo.ward |
|---|---|---|---|---|---|---|---|
| 1 | 66.87 | 96.82 | 57.68 | 25.42 | 11.81 | 80.71 | 1 |
| 2 | 70.70 | 98.53 | 63.47 | 44.70 | 22.43 | 90.23 | 2 |
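Para ver qué comunidades componen cada grupo de la solución jerárquica, un bosquejo sencillo con los objetos ya creados:
# Composición y tamaño de los dos grupos del corte jerárquico (Ward, k = 2)
split(Datos_3_3_Caso$CCAA, grupo.ward)
table(grupo.ward)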
6. Aplicación del método k-means
library(stats)
datos.caso3.grupos.kmeans <- datos.caso3.grupos[, 1:6]
# Centroides iniciales para k-means, basados en la solución jerárquica (sección 5)
c1 <- c(66.87, 96.82, 56.01, 25.43, 11.81, 80.71)
c2 <- c(70.70, 98.53, 63.47, 44.70, 22.43, 90.23)
# k-means con dos grupos, usando los centroides anteriores como puntos de partida
solucion <- kmeans(datos.caso3.grupos.kmeans, rbind(c1, c2))
solucion
## K-means clustering with 2 clusters of sizes 12, 6
##
## Cluster means:
## automovil tvcolor video microondas lavavajillas telefono
## 1 66.86667 96.81667 57.67500 25.425 11.80833 80.70833
## 2 70.70000 98.53333 63.46667 44.700 22.43333 90.23333
##
## Clustering vector:
## España Andalucía Aragón Asturias Baleares
## 1 1 2 1 1
## Canarias Cantabria Castilla y León Cast.-La Mancha Cataluña
## 1 1 1 1 2
## Com. Valenciana Extremadura Galicia Madrid Murcia
## 1 1 1 2 1
## Navarra País Vasco La Rioja
## 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 2810.6467 848.3533
## (between_SS / total_SS = 40.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
7. Visualización: Dendrograma método Ward (2 clusters)
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "ward.D2")
plot(hclust.ward.caso3, labels = Datos_3_3_Caso$id)
plot(hclust.ward.caso3, labels = Datos_3_3_Caso$CCAA)
groups <- cutree(hclust.ward.caso3, k = 2)
rect.hclust(hclust.ward.caso3, k = 2, border = "purple")8. Visualización: Dendrograma método Complete (2 clusters)
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "complete")
plot(hclust.ward.caso3, labels = Datos_3_3_Caso$id)plot(hclust.ward.caso3, labels = Datos_3_3_Caso$CCAA)
groups <- cutree(hclust.ward.caso3, k = 2)
rect.hclust(hclust.ward.caso3, k = 2, border = "red")9. Visualización: Dendrograma método Average (2 clusters)
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")
plot(hclust.ward.caso3, labels = Datos_3_3_Caso$id)plot(hclust.ward.caso3, labels = Datos_3_3_Caso$CCAA)
groups <- cutree(hclust.ward.caso3, k = 2)
rect.hclust(hclust.ward.caso3, k = 2, border = "blue")10. Pruebas t de diferencias entre grupos
# Primero asegurarnos de que tenemos el objeto correcto
# Añadir los clusters al dataframe
datos.caso3.grupos.kmeans$solucion.cluster <- solucion$cluster
# Verificar la estructura
print("Estructura del dataframe:")
## [1] "Estructura del dataframe:"
# Instrucción reconstruida que genera la salida siguiente
str(datos.caso3.grupos.kmeans)
## 'data.frame': 18 obs. of 7 variables:
## $ automovil : num 69 66.7 67.2 63.7 71.9 72.7 63.4 65.8 61.5 70.4 ...
## $ tvcolor : num 97.6 98 97.5 95.2 98.8 96.8 94.9 97.1 97.3 98.1 ...
## $ video : num 62.4 82.7 56.8 52.1 62.4 68.4 48.9 47.7 53.6 71.1 ...
## $ microondas : num 32.3 24.1 43.4 24.4 29.8 27.9 36.5 28.1 21.7 36.8 ...
## $ lavavajillas : num 17 12.7 20.6 13.3 10.1 5.8 11.2 14 7.1 19.8 ...
## $ telefono : num 85.2 74.7 88.4 88.1 87.9 75.4 80.5 85 72.9 92.2 ...
## $ solucion.cluster: int 1 1 2 1 1 1 1 1 1 2 ...
# Prueba t de Welch para automóvil entre los dos clusters
t.test(automovil ~ solucion.cluster, data = datos.caso3.grupos.kmeans)
##
## Welch Two Sample t-test
##
## data: automovil by solucion.cluster
## t = -1.8106, df = 10.091, p-value = 0.1
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## -8.5449256 0.8782589
## sample estimates:
## mean in group 1 mean in group 2
## 66.86667 70.70000
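La misma prueba puede repetirse para las seis variables en una sola pasada; un bosquejo compacto, equivalente a las pruebas individuales de la sección siguiente:
# p-valores de la prueba t de Welch para cada variable entre los dos clusters de k-means
vars <- c("automovil", "tvcolor", "video", "microondas", "lavavajillas", "telefono")
sapply(vars, function(v) {
  t.test(datos.caso3.grupos.kmeans[[v]] ~ datos.caso3.grupos.kmeans$solucion.cluster)$p.value
})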
11. Tabla de resultados de pruebas t
library(dplyr)
library(kableExtra)
# Preparar datos
Datos.NbClust$solucion.cluster <- solucion$cluster
# Calcular las pruebas t con nombres sin caracteres especiales
t1 <- t.test(automovil ~ solucion.cluster, data = Datos.NbClust)
t2 <- t.test(tvcolor ~ solucion.cluster, data = Datos.NbClust)
t3 <- t.test(video ~ solucion.cluster, data = Datos.NbClust)
t4 <- t.test(microondas ~ solucion.cluster, data = Datos.NbClust)
t5 <- t.test(lavavajillas ~ solucion.cluster, data = Datos.NbClust)
t6 <- t.test(telefono ~ solucion.cluster, data = Datos.NbClust)
# Crear dataframe con nombres SIN caracteres especiales
resultados.ttest <- data.frame(
Variable = c("Automovil", "TV color", "Video", "Microondas", "Lavavajillas", "Telefono"),
Grupo_1 = c(
mean(Datos.NbClust$automovil[Datos.NbClust$solucion.cluster == 1]),
mean(Datos.NbClust$tvcolor[Datos.NbClust$solucion.cluster == 1]),
mean(Datos.NbClust$video[Datos.NbClust$solucion.cluster == 1]),
mean(Datos.NbClust$microondas[Datos.NbClust$solucion.cluster == 1]),
mean(Datos.NbClust$lavavajillas[Datos.NbClust$solucion.cluster == 1]),
mean(Datos.NbClust$telefono[Datos.NbClust$solucion.cluster == 1])
),
Grupo_2 = c(
mean(Datos.NbClust$automovil[Datos.NbClust$solucion.cluster == 2]),
mean(Datos.NbClust$tvcolor[Datos.NbClust$solucion.cluster == 2]),
mean(Datos.NbClust$video[Datos.NbClust$solucion.cluster == 2]),
mean(Datos.NbClust$microondas[Datos.NbClust$solucion.cluster == 2]),
mean(Datos.NbClust$lavavajillas[Datos.NbClust$solucion.cluster == 2]),
mean(Datos.NbClust$telefono[Datos.NbClust$solucion.cluster == 2])
),
Pruebas_t = c(abs(t1$statistic), abs(t2$statistic), abs(t3$statistic),
abs(t4$statistic), abs(t5$statistic), abs(t6$statistic)),
p_value = c(t1$p.value, t2$p.value, t3$p.value, t4$p.value, t5$p.value, t6$p.value),
Significativo = ifelse(c(t1$p.value, t2$p.value, t3$p.value, t4$p.value, t5$p.value, t6$p.value) < 0.01, "**",
ifelse(c(t1$p.value, t2$p.value, t3$p.value, t4$p.value, t5$p.value, t6$p.value) < 0.05, "*", ""))
)
# Mostrar la tabla (caption sin caracteres especiales)
resultados.ttest %>%
  kable(caption = "Significatividad de las diferencias entre los perfiles de los conglomerados",
        align = "c", digits = 2) %>%
  kable_classic(font_size = 14) %>%  # sin fuente específica para evitar problemas de renderizado
  kable_styling()
| Variable | Grupo_1 | Grupo_2 | Pruebas_t | p_value | Significativo |
|---|---|---|---|---|---|
| Automovil | 66.87 | 70.70 | 1.81 | 0.10 | |
| TV color | 96.82 | 98.53 | 2.52 | 0.02 | * |
| Video | 57.68 | 63.47 | 1.19 | 0.25 | |
| Microondas | 25.42 | 44.70 | 6.73 | 0.00 | ** |
| Lavavajillas | 11.81 | 22.43 | 4.61 | 0.00 | ** |
| Telefono | 80.71 | 90.23 | 3.51 | 0.00 | ** |
12. Visualización PCA de clusters
library(ggrepel)
# Análisis de Componentes Principales (PCA)
pca_result <- prcomp(datos_analisis, scale. = TRUE)
# Resumen del PCA
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.8458 1.1144 0.8236 0.5791 0.44030 0.37884
## Proportion of Variance 0.5678 0.2070 0.1130 0.0559 0.03231 0.02392
## Cumulative Proportion 0.5678 0.7748 0.8879 0.9438 0.97608 1.00000
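Para interpretar los ejes de la figura, puede inspeccionarse la contribución (cargas) de cada variable a las dos primeras componentes; bosquejo mínimo:
# Cargas (rotación) de las variables en las dos primeras componentes principales
round(pca_result$rotation[, 1:2], 2)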
# Crear dataframe para el gráfico
df_pca <- data.frame(
PC1 = pca_result$x[, 1],
PC2 = pca_result$x[, 2],
Cluster = as.factor(solucion$cluster),
CCAA = Datos_3_3_Caso$CCAA
)
ggplot(df_pca, aes(x = PC1, y = PC2, color = Cluster, label = CCAA)) +
geom_point(size = 3) +
geom_text_repel(size = 3, max.overlaps = 20) + # Evita superposición
theme_minimal() +
labs(title = "Figura 3.14: Visualizacion de los resultados del analisis de conglomerados",
x = "Componente Principal 1",
y = "Componente Principal 2") +
scale_color_manual(values = c("1" = "blue", "2" = "red")) +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
theme(plot.title = element_text(hjust = 0.5))
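Como alternativa al gráfico anterior, factoextra permite visualizar directamente la solución de k-means sobre el plano de las dos primeras componentes; bosquejo equivalente:
# Visualización alternativa de los clusters de k-means (factoextra estandariza
# las variables internamente por defecto, stand = TRUE)
fviz_cluster(solucion, data = datos_analisis, repel = TRUE,
             ggtheme = theme_minimal(),
             main = "Clusters k-means sobre las dos primeras componentes")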