Cluster Analysis

UNIVERSIDAD DE EL SALVADOR
FACULTAD DE CIENCIAS ECONÓMICAS
ESCUELA DE ECONOMÍA
Term II - 2025


“Assignment A32: Application: Cluster Analysis”

Course: Métodos para el Análisis Económico

Theory group: GT-01

Instructor: MSF Carlos Ademir Pérez Alas


Group members:

Jaime Elías Barahona Martínez    BM23012
Jefferson Josué Sandoval Pérez   SP23029
Mariana Ivette Zelaya Castillo   ZC23003

Ciudad Universitaria, San Salvador – November 30, 2025

1. Data loading and preparation

# Load the required libraries
library(NbClust)
library(cluster)
library(factoextra)
library(ggplot2)
library(dplyr)

# Build the data frame with the data from Table 3.22
Datos_3_3_Caso <- data.frame(
  id = 1:18,
  CCAA = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias",
           "Cantabria", "Castilla y León", "Cast.-La Mancha", "Cataluña",
           "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia",
           "Navarra", "País Vasco", "La Rioja"),
  automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4,
                72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
  tvcolor = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1,
              98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
  video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1,
            68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
  microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8,
                 26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
  lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.8, 11.2, 14.0, 7.1, 19.8,
                   12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
  telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2,
               84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
)

# Prepare the data for the analysis (numeric variables only)
datos_analisis <- Datos_3_3_Caso[, c("automovil", "tvcolor", "video", 
                                    "microondas", "lavavajillas", "telefono")]
rownames(datos_analisis) <- Datos_3_3_Caso$CCAA
print(Datos_3_3_Caso)
##    id            CCAA automovil tvcolor video microondas lavavajillas telefono
## 1   1          España      69.0    97.6  62.4       32.3         17.0     85.2
## 2   2       Andalucía      66.7    98.0  82.7       24.1         12.7     74.7
## 3   3          Aragón      67.2    97.5  56.8       43.4         20.6     88.4
## 4   4        Asturias      63.7    95.2  52.1       24.4         13.3     88.1
## 5   5        Baleares      71.9    98.8  62.4       29.8         10.1     87.9
## 6   6        Canarias      72.7    96.8  68.4       27.9          5.8     75.4
## 7   7       Cantabria      63.4    94.9  48.9       36.5         11.2     80.5
## 8   8 Castilla y León      65.8    97.1  47.7       28.1         14.0     85.0
## 9   9 Cast.-La Mancha      61.5    97.3  53.6       21.7          7.1     72.9
## 10 10        Cataluña      70.4    98.1  71.1       36.8         19.8     92.2
## 11 11 Com. Valenciana      72.7    98.4  68.2       26.6         12.1     84.4
## 12 12     Extremadura      60.5    97.7  43.7       20.7         11.7     67.1
## 13 13         Galicia      65.5    91.3  42.7       13.5         14.6     85.9
## 14 14          Madrid      74.0    99.4  76.3       53.9         32.3     95.7
## 15 15          Murcia      69.0    98.7  59.3       19.5         12.1     81.4
## 16 16         Navarra      76.4    99.3  60.6       44.0         20.6     87.4
## 17 17      País Vasco      71.3    98.3  61.6       45.7         23.7     94.3
## 18 18        La Rioja      64.9    98.6  54.4       44.4         17.6     83.4
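
All six variables are shares of households on comparable 0–100 percentage scales, which is presumably why the analysis below works with the raw values rather than standardized ones. A quick check of the ranges (our own addition, not part of the original case):

# Inspect variable ranges to confirm comparable scales
summary(datos_analisis)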

2. Outlier detection via Mahalanobis distance

# Compute Mahalanobis distances
mahal_dist <- mahalanobis(datos_analisis, 
                         colMeans(datos_analisis), 
                         cov(datos_analisis))

# Compute p-values (chi-squared with 6 degrees of freedom)
p_values <- pchisq(mahal_dist, df = 6, lower.tail = FALSE)

# Build the results table (Table 3.23)
resultados_outliers <- data.frame(
  CCAA = Datos_3_3_Caso$CCAA,
  D2 = round(mahal_dist, 2),
  p_value = round(p_values, 2)
)

print("Cuadro 3.23: Resultados de la detección de outliers")
## [1] "Cuadro 3.23: Resultados de la detección de outliers"
print(resultados_outliers)
##                            CCAA    D2 p_value
## España                   España  0.20    1.00
## Andalucía             Andalucía 10.52    0.10
## Aragón                   Aragón  1.91    0.93
## Asturias               Asturias  4.46    0.61
## Baleares               Baleares  5.70    0.46
## Canarias               Canarias  9.58    0.14
## Cantabria             Cantabria  7.29    0.29
## Castilla y León Castilla y León  2.21    0.90
## Cast.-La Mancha Cast.-La Mancha  3.54    0.74
## Cataluña               Cataluña  2.95    0.82
## Com. Valenciana Com. Valenciana  2.65    0.85
## Extremadura         Extremadura 10.43    0.11
## Galicia                 Galicia 13.24    0.04
## Madrid                   Madrid  8.31    0.22
## Murcia                   Murcia  4.88    0.56
## Navarra                 Navarra  7.65    0.26
## País Vasco           País Vasco  2.32    0.89
## La Rioja               La Rioja  4.17    0.65
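
Only Galicia falls below the conventional 0.05 threshold (D2 = 13.24, p = 0.04), and only marginally. A minimal sketch (our own threshold choice, not part of the original case) that flags such observations programmatically:

# Flag regions whose Mahalanobis p-value falls below 0.05
subset(resultados_outliers, p_value < 0.05)
# Only Galicia is flagged; with n = 18 and a marginal exceedance,
# it is kept in the analysis rather than discarded.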

3. Hierarchical cluster analysis

# Compute the euclidean distance matrix
matriz.dis.euclid.caso3 <- dist(datos_analisis, method = "euclidean")

# Apply different linkage methods
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "ward.D2")
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3, method = "complete")
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")
hclust.single.caso3 <- hclust(matriz.dis.euclid.caso3, method = "single")
hclust.centroid.caso3 <- hclust(matriz.dis.euclid.caso3, method = "centroid")

# Explore the merge structure of the average-linkage result
# (negative entries are single observations; positive entries refer to
# clusters formed at earlier steps)
print("Structure of hclust.average.caso3:")
## [1] "Structure of hclust.average.caso3:"
data.frame(hclust.average.caso3[2:1])
##       height merge.1 merge.2
## 1   6.874591      -3     -18
## 2   7.153321      -4      -8
## 3   7.805767      -5     -11
## 4   9.339233      -1       3
## 5   9.387225     -16     -17
## 6  12.380548       1       5
## 7  12.448695      -9     -12
## 8  12.659790      -7       2
## 9  13.331971     -15       4
## 10 14.684330      -6       9
## 11 16.616569     -10       6
## 12 18.923865     -13       8
## 13 20.140313       7      12
## 14 22.109739      10      11
## 15 26.042489      13      14
## 16 31.452385      -2      15
## 17 39.104937     -14      16
# Plot the dendrograms (Figure 3.13)
par(mfrow = c(2, 3))
plot(hclust.ward.caso3, main = "Ward method", hang = -1, cex = 0.6)
plot(hclust.complete.caso3, main = "Complete method", hang = -1, cex = 0.6)
plot(hclust.average.caso3, main = "Average method", hang = -1, cex = 0.6)
plot(hclust.single.caso3, main = "Single method", hang = -1, cex = 0.6)
plot(hclust.centroid.caso3, main = "Centroid method", hang = -1, cex = 0.6)
par(mfrow = c(1, 1))

# The same dendrograms as above, one at a time, for easier reading
plot(hclust.ward.caso3, main = "Ward method", hang = -1, cex = 0.6)

plot(hclust.complete.caso3, main = "Complete method", hang = -1, cex = 0.6)

plot(hclust.average.caso3, main = "Average method", hang = -1, cex = 0.6)

plot(hclust.single.caso3, main = "Single method", hang = -1, cex = 0.6)

plot(hclust.centroid.caso3, main = "Centroid method", hang = -1, cex = 0.6)
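
The five dendrograms broadly agree on the main groupings. One hedged way to quantify this (our addition, not part of the original case) is the cophenetic correlation, which measures how faithfully each tree preserves the original euclidean distances:

# Cophenetic correlation for each linkage method (higher = more faithful)
coph_cor <- sapply(
  list(ward = hclust.ward.caso3, complete = hclust.complete.caso3,
       average = hclust.average.caso3, single = hclust.single.caso3,
       centroid = hclust.centroid.caso3),
  function(h) cor(matriz.dis.euclid.caso3, cophenetic(h))
)
round(coph_cor, 3)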

4. Determining the optimal number of clusters

# Prepare the data for NbClust
Datos.NbClust <- Datos_3_3_Caso[, c("automovil", "tvcolor", "video", 
                                   "microondas", "lavavajillas", "telefono")]

# Run NbClust for different linkage methods
set.seed(123)
res.wardD2 <- NbClust(Datos.NbClust, distance = "euclidean",
                     min.nc = 2, max.nc = 15, method = "ward.D2", index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 5 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 6 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
res.complete <- NbClust(Datos.NbClust, distance = "euclidean",
                       min.nc = 2, max.nc = 15, method = "complete", index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 5 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 6 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
res.average <- NbClust(Datos.NbClust, distance = "euclidean",
                      min.nc = 2, max.nc = 15, method = "average", index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 5 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 4 proposed 4 as the best number of clusters 
## * 3 proposed 13 as the best number of clusters 
## * 6 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
# Show the results
print("Optimal number of clusters according to Ward.D2:")
## [1] "Optimal number of clusters according to Ward.D2:"
print(res.wardD2$Best.nc)
##                     KL      CH Hartigan    CCC   Scott     Marriot   TrCovW
## Number_clusters 3.0000 15.0000   3.0000  2.000  13.000 3.00000e+00      3.0
## Value_Index     3.0067 16.3636   5.0086 13.675 496.289 2.72309e+14 417427.7
##                 TraceW     Friedman    Rubin Cindex      DB Silhouette   Duda
## Number_clusters   3.00 1.500000e+01   3.0000 2.0000 15.0000    15.0000 2.0000
## Value_Index     820.35 1.237259e+16 -21.4006 0.3702  0.2904     0.7667 0.5504
##                 PseudoT2   Beale Ratkowsky     Ball PtBiserial Frey McClain
## Number_clusters   2.0000  3.0000     3.000    3.000     4.0000    1  2.0000
## Value_Index       8.1671 -2.3568     0.434 1038.568     0.5195   NA  0.6077
##                    Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 15.0000      0  7.0000      0 15.0000
## Value_Index      1.0795      0  0.1632      0  0.0087
print("Número óptimo de clusters según Complete:")
## [1] "Número óptimo de clusters según Complete:"
print(res.complete$Best.nc)
##                     KL      CH Hartigan    CCC   Scott     Marriot   TrCovW
## Number_clusters 3.0000 15.0000   3.0000  2.000  13.000 3.00000e+00      3.0
## Value_Index     3.0067 16.3636   5.0086 13.675 496.289 2.72309e+14 417427.7
##                 TraceW     Friedman    Rubin Cindex      DB Silhouette   Duda
## Number_clusters   3.00 1.500000e+01   3.0000 2.0000 15.0000    15.0000 2.0000
## Value_Index     820.35 1.237259e+16 -21.4006 0.3702  0.2904     0.7667 0.5504
##                 PseudoT2   Beale Ratkowsky     Ball PtBiserial Frey McClain
## Number_clusters   2.0000  3.0000     3.000    3.000     5.0000    1  2.0000
## Value_Index       8.1671 -2.3568     0.434 1038.568     0.5241   NA  0.6077
##                    Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 15.0000      0  7.0000      0 15.0000
## Value_Index      1.0795      0  0.1632      0  0.0087
print("Número óptimo de clusters según Average:")
## [1] "Número óptimo de clusters según Average:"
print(res.average$Best.nc)
##                      KL      CH Hartigan     CCC    Scott      Marriot   TrCovW
## Number_clusters 13.0000 15.0000   3.0000  2.0000  13.0000 4.000000e+00      3.0
## Value_Index      2.9006 16.3636   7.0163 12.5207 473.7819 9.861949e+14 330725.1
##                   TraceW     Friedman     Rubin Cindex      DB Silhouette
## Number_clusters   4.0000 1.500000e+01   13.0000 3.0000 15.0000    15.0000
## Value_Index     694.2515 1.208339e+16 -218.5282 0.4862  0.2904     0.7667
##                   Duda PseudoT2   Beale Ratkowsky     Ball PtBiserial   Frey
## Number_clusters 2.0000   2.0000  2.0000    4.0000    3.000     4.0000 3.0000
## Value_Index     2.0294  -7.6087 -1.8296    0.3712 1057.447     0.5395 1.4791
##                 McClain    Dunn Hubert SDindex Dindex    SDbw
## Number_clusters  2.0000 15.0000      0  3.0000      0 15.0000
## Value_Index      0.0738  1.0795      0  0.1368      0  0.0087
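
Under all three linkage methods the majority rule points to 3 clusters, but several individual indices (CCC, Cindex, Duda, PseudoT2, McClain) favour 2, and the case study proceeds with k = 2 below. As a complementary, hedged check, the average silhouette width can be plotted with the already-loaded factoextra package:

# Average silhouette width for k-means over k = 2..10 (a sketch)
fviz_nbclust(Datos.NbClust, kmeans, method = "silhouette", k.max = 10)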

5. Centroids from the hierarchical method

library(kableExtra)
# Assign clusters using Ward linkage (2 groups)
grupo.ward <- cutree(hclust.ward.caso3, k = 2)

# Append the cluster labels and drop the non-numeric columns
datos.caso3.grupos <- cbind(Datos_3_3_Caso, grupo.ward)
datos.caso3.grupos$id <- NULL
datos.caso3.grupos$CCAA <- NULL

# Group means (centroids) by Ward cluster
datos.caso3 <- round(aggregate(datos.caso3.grupos, list(grupo.ward), mean), 2)
head(datos.caso3) %>%
  kable(caption = "Groups: centroids from the hierarchical method",
        align = "c", digits = 2) %>%
  kable_classic(html_font = "Montserrat", font_size = 14) %>%
  kable_styling()
Groups: centroids from the hierarchical method

 Group.1  automovil  tvcolor  video  microondas  lavavajillas  telefono  grupo.ward
    1       66.87     96.82   57.68     25.42       11.81        80.71       1
    2       70.70     98.53   63.47     44.70       22.43        90.23       2
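
The same centroids can be computed with a dplyr idiom (a sketch; identical numbers to the aggregate() call above):

# Group means of the six variables by Ward cluster, dplyr style
datos.caso3.grupos %>%
  group_by(grupo.ward) %>%
  summarise(across(automovil:telefono, ~ round(mean(.x), 2)))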

6. Applying the k-means method

library(stats)

datos.caso3.grupos.kmeans <- datos.caso3.grupos[, 1:6]

c1<-c(66.87,96.82,56.01 ,25.43,11.81,80.71) 
c2<-c(70.70,98.53,63.47,44.70,22.43,90.23) 
solucion <- kmeans(datos.caso3.grupos.kmeans,rbind(c1,c2))
solucion
## K-means clustering with 2 clusters of sizes 12, 6
## 
## Cluster means:
##   automovil  tvcolor    video microondas lavavajillas telefono
## 1  66.86667 96.81667 57.67500     25.425     11.80833 80.70833
## 2  70.70000 98.53333 63.46667     44.700     22.43333 90.23333
## 
## Clustering vector:
##          España       Andalucía          Aragón        Asturias        Baleares 
##               1               1               2               1               1 
##        Canarias       Cantabria Castilla y León Cast.-La Mancha        Cataluña 
##               1               1               1               1               2 
## Com. Valenciana     Extremadura         Galicia          Madrid          Murcia 
##               1               1               1               2               1 
##         Navarra      País Vasco        La Rioja 
##               2               2               2 
## 
## Within cluster sum of squares by cluster:
## [1] 2810.6467  848.3533
##  (between_SS / total_SS =  40.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

7. Visualization: Ward dendrogram (2 clusters)

hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "ward.D2")

plot(hclust.ward.caso3, labels = Datos_3_3_Caso$id)

plot(hclust.ward.caso3, labels = Datos_3_3_Caso$CCAA)
groups <- cutree(hclust.ward.caso3, k = 2)
rect.hclust(hclust.ward.caso3, k = 2, border = "purple")

8. Visualization: Complete-linkage dendrogram (2 clusters)

hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "complete")

plot(hclust.ward.caso3, labels = Datos_3_3_Caso$id)

plot(hclust.ward.caso3, labels = Datos_3_3_Caso$CCAA)
groups <- cutree(hclust.ward.caso3, k = 2)
rect.hclust(hclust.ward.caso3, k = 2, border = "red")

9. Visualization: Average-linkage dendrogram (2 clusters)

hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")

plot(hclust.ward.caso3, labels = Datos_3_3_Caso$id)

plot(hclust.ward.caso3, labels = Datos_3_3_Caso$CCAA)
groups <- cutree(hclust.ward.caso3, k = 2)
rect.hclust(hclust.ward.caso3, k = 2, border = "blue")
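
For a single, cleaner dendrogram, the already-loaded factoextra package offers fviz_dend; this sketch (our addition) colours the two Ward clusters directly on the tree:

# Ward dendrogram with the k = 2 cut coloured and boxed
fviz_dend(hclust.ward.caso3, k = 2, rect = TRUE,
          k_colors = c("blue", "red"), cex = 0.6)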

10. t-tests for differences between groups

# Make sure we have the right object:
# append the k-means cluster labels to the data frame
datos.caso3.grupos.kmeans$solucion.cluster <- solucion$cluster

# Check the structure
print("Structure of the data frame:")
## [1] "Structure of the data frame:"
str(datos.caso3.grupos.kmeans)
## 'data.frame':    18 obs. of  7 variables:
##  $ automovil       : num  69 66.7 67.2 63.7 71.9 72.7 63.4 65.8 61.5 70.4 ...
##  $ tvcolor         : num  97.6 98 97.5 95.2 98.8 96.8 94.9 97.1 97.3 98.1 ...
##  $ video           : num  62.4 82.7 56.8 52.1 62.4 68.4 48.9 47.7 53.6 71.1 ...
##  $ microondas      : num  32.3 24.1 43.4 24.4 29.8 27.9 36.5 28.1 21.7 36.8 ...
##  $ lavavajillas    : num  17 12.7 20.6 13.3 10.1 5.8 11.2 14 7.1 19.8 ...
##  $ telefono        : num  85.2 74.7 88.4 88.1 87.9 75.4 80.5 85 72.9 92.2 ...
##  $ solucion.cluster: int  1 1 2 1 1 1 1 1 1 2 ...
# Welch t-test for automovil by cluster
t.test(automovil ~ solucion.cluster, data = datos.caso3.grupos.kmeans)
## 
##  Welch Two Sample t-test
## 
## data:  automovil by solucion.cluster
## t = -1.8106, df = 10.091, p-value = 0.1
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -8.5449256  0.8782589
## sample estimates:
## mean in group 1 mean in group 2 
##        66.86667        70.70000

11. Table of t-test results

library(dplyr)
library(kableExtra)

# Prepare the data: add the cluster labels to Datos.NbClust
Datos.NbClust$solucion.cluster <- solucion$cluster

# Run the t-tests (variable names without special characters)
t1 <- t.test(automovil ~ solucion.cluster, data = Datos.NbClust)
t2 <- t.test(tvcolor ~ solucion.cluster, data = Datos.NbClust)
t3 <- t.test(video ~ solucion.cluster, data = Datos.NbClust)
t4 <- t.test(microondas ~ solucion.cluster, data = Datos.NbClust)
t5 <- t.test(lavavajillas ~ solucion.cluster, data = Datos.NbClust)
t6 <- t.test(telefono ~ solucion.cluster, data = Datos.NbClust)

# Build the results data frame (names WITHOUT special characters)
resultados.ttest <- data.frame(
  Variable = c("Automovil", "TV color", "Video", "Microondas", "Lavavajillas", "Telefono"),
  Grupo_1 = c(
    mean(Datos.NbClust$automovil[Datos.NbClust$solucion.cluster == 1]),
    mean(Datos.NbClust$tvcolor[Datos.NbClust$solucion.cluster == 1]),
    mean(Datos.NbClust$video[Datos.NbClust$solucion.cluster == 1]),
    mean(Datos.NbClust$microondas[Datos.NbClust$solucion.cluster == 1]),
    mean(Datos.NbClust$lavavajillas[Datos.NbClust$solucion.cluster == 1]),
    mean(Datos.NbClust$telefono[Datos.NbClust$solucion.cluster == 1])
  ),
  Grupo_2 = c(
    mean(Datos.NbClust$automovil[Datos.NbClust$solucion.cluster == 2]),
    mean(Datos.NbClust$tvcolor[Datos.NbClust$solucion.cluster == 2]),
    mean(Datos.NbClust$video[Datos.NbClust$solucion.cluster == 2]),
    mean(Datos.NbClust$microondas[Datos.NbClust$solucion.cluster == 2]),
    mean(Datos.NbClust$lavavajillas[Datos.NbClust$solucion.cluster == 2]),
    mean(Datos.NbClust$telefono[Datos.NbClust$solucion.cluster == 2])
  ),
  Pruebas_t = c(abs(t1$statistic), abs(t2$statistic), abs(t3$statistic), 
                abs(t4$statistic), abs(t5$statistic), abs(t6$statistic)),
  p_value = c(t1$p.value, t2$p.value, t3$p.value, t4$p.value, t5$p.value, t6$p.value),
  Significativo = ifelse(c(t1$p.value, t2$p.value, t3$p.value, t4$p.value, t5$p.value, t6$p.value) < 0.01, "**",
                        ifelse(c(t1$p.value, t2$p.value, t3$p.value, t4$p.value, t5$p.value, t6$p.value) < 0.05, "*", ""))
)

# Render the table (caption without special characters)
resultados.ttest %>%
  kable(caption = "Significance of the differences between the cluster profiles", 
        align = "c", digits = 2) %>%
  kable_classic(font_size = 14) %>%  # specific font removed
  kable_styling()
Significance of the differences between the cluster profiles

 Variable      Grupo_1  Grupo_2  Pruebas_t  p_value  Significativo
 Automovil      66.87    70.70      1.81     0.10
 TV color       96.82    98.53      2.52     0.02          *
 Video          57.68    63.47      1.19     0.25
 Microondas     25.42    44.70      6.73     0.00          **
 Lavavajillas   11.81    22.43      4.61     0.00          **
 Telefono       80.71    90.23      3.51     0.00          **
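
The six tests above can also be run in one pass; a compact sketch (same p-values), using reformulate() to build each formula:

# Welch t-tests for all six variables against the k-means clusters
vars <- c("automovil", "tvcolor", "video", "microondas", "lavavajillas", "telefono")
pvals <- sapply(vars, function(v)
  t.test(reformulate("solucion.cluster", v), data = Datos.NbClust)$p.value)
round(pvals, 3)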

12. PCA visualization of the clusters

library(ggrepel)

# Principal Component Analysis (PCA)
pca_result <- prcomp(datos_analisis, scale. = TRUE)

# PCA summary
summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6
## Standard deviation     1.8458 1.1144 0.8236 0.5791 0.44030 0.37884
## Proportion of Variance 0.5678 0.2070 0.1130 0.0559 0.03231 0.02392
## Cumulative Proportion  0.5678 0.7748 0.8879 0.9438 0.97608 1.00000
# Build the data frame for the plot
df_pca <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  Cluster = as.factor(solucion$cluster),
  CCAA = Datos_3_3_Caso$CCAA
)

ggplot(df_pca, aes(x = PC1, y = PC2, color = Cluster, label = CCAA)) +
  geom_point(size = 3) +
  geom_text_repel(size = 3, max.overlaps = 20) +  # avoid label overlap
  theme_minimal() +
  labs(title = "Figure 3.14: Visualization of the cluster analysis results",
       x = "Principal Component 1",
       y = "Principal Component 2") +
  scale_color_manual(values = c("1" = "blue", "2" = "red")) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
  theme(plot.title = element_text(hjust = 0.5))
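
Since PC1 alone explains 56.8% of the variance, it is worth checking which equipment variables drive it. A quick look at the loadings (our addition):

# Loadings of the first two principal components
round(pca_result$rotation[, 1:2], 2)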