A32-Aplicaciones de los conglomerados

UNIVERSIDAD DE EL SALVADOR

FACULTAD DE CIENCIAS ECONÓMICAS

ESCUELA DE ECONOMÍA

Ciclo II - 2025

“Aplicación: Análisis de Clúster (Conglomerados )”

Asignatura:

Métodos para el Análisis Económico

Grupo teórico:

GT-01

Docente:

MSF Carlos Ademir Pérez Alas

Integrantes:

Rosa Audelia Hernández Herrera — HH23026

Fátima Carolina Guillén Aguilar — GA22013

José Ricardo Vides Hernández — VH22011

Ciudad Universitaria, San Salvador – 30 de noviembre de 2025

Ejercicio

Importación de datos

library(readxl)
Data <- read_excel("~/Data_A32.xlsx")
nombres_ccaa <-  Data$ccaa
datos_numericos <- Data[, -1 ]
rownames(datos_numericos)  <- nombres_ccaa
head(datos_numericos)

## # A tibble: 6 × 6
##   automovil tvcolor video microondas lavavajilla telefono
##       <dbl>   <dbl> <dbl>      <dbl>       <dbl>    <dbl>
## 1      69      97.6  62.4       32.3        17       85.2
## 2      66.7    98    82.7       24.1        12.7     74.7
## 3      67.2    97.5  56.8       43.4        20.6     88.4
## 4      63.7    95.2  52.1       24.4        13.3     88.1
## 5      71.9    98.8  62.4       29.8        10.1     87.9
## 6      72.7    96.8  68.4       27.9         5.8     75.4

1.Outliers

Análisis de la existencia de outliers en la medida en que pueden generar importantes distorsiones en la detección del número de grupos

mean <- colMeans(datos_numericos)
Sx <- cov(datos_numericos)
D2 <- mahalanobis(datos_numericos, mean, Sx)


# p-values usando Chi-cuadrado con df = número de variables
p_values <- 1 - pchisq(D2, df = ncol(datos_numericos))

outliers <- data.frame(D2 = round(D2, 2), 
                       p_value = round(p_values, 2))
outliers

##                    D2 p_value
## España           0.20    1.00
## Andalucia       10.52    0.10
## Aragon           1.91    0.93
## Asturias         4.46    0.61
## Balerares        5.70    0.46
## Canarias         9.58    0.14
## Cantabria        7.29    0.29
## Castilla y León  2.21    0.90
## Cast-La Mancha   3.54    0.74
## Cataluña         2.95    0.82
## Com. Valenciana  2.65    0.85
## Extremadura     10.43    0.11
## Galicia         13.24    0.04
## Madrid           8.31    0.22
## Murcia           4.88    0.56
## Navarra          7.65    0.26
## País vasco       2.32    0.89
## La Rioja         4.17    0.65

library(cluster)

# Distancia euclidiana
matriz.dis.euclid <- dist(datos_numericos, method = "euclidean")

2.Conglomerados Jerárquicos

Realización de un análisis de conglomerados jerárquicos, evaluando la solución de distintos métodos de conglomeración, aplicando los criterios presentados para identificar el número adecuado de grupos y obtención de los centroides que han de servir de partida para el paso siguiente.

# Métodos de agrupamiento jerárquico

# Método Average

 hclus.average.caso3 <- hclust(matriz.dis.euclid, method = "average")
 data.frame(hclus.average.caso3[2:1])

##       height merge.1 merge.2
## 1   6.874591      -3     -18
## 2   7.153321      -4      -8
## 3   7.805767      -5     -11
## 4   9.339233      -1       3
## 5   9.387225     -16     -17
## 6  12.380548       1       5
## 7  12.448695      -9     -12
## 8  12.659790      -7       2
## 9  13.331971     -15       4
## 10 14.684330      -6       9
## 11 16.616569     -10       6
## 12 18.923865     -13       8
## 13 20.140313       7      12
## 14 22.109739      10      11
## 15 26.042489      13      14
## 16 31.452385      -2      15
## 17 39.104937     -14      16

 plot(hclus.average.caso3,labels=Data$ccaa)

# Método Ward.D2
hclus.ward.caso3 <- hclust(matriz.dis.euclid, method = "ward.D2")
data.frame(hclus.ward.caso3[2:1])

##       height merge.1 merge.2
## 1   6.874591      -3     -18
## 2   7.153321      -4      -8
## 3   7.805767      -5     -11
## 4   9.387225     -16     -17
## 5   9.853764      -1       3
## 6  12.448695      -9     -12
## 7  14.238563      -7       2
## 8  15.114011     -15       5
## 9  15.686087     -10       4
## 10 16.592408      -6       8
## 11 19.584858       1       9
## 12 22.402976     -13       7
## 13 27.124565      -2      10
## 14 27.214089       6      12
## 15 30.523870     -14      11
## 16 50.718899      13      14
## 17 70.785686      15      16

plot(hclus.ward.caso3,labels=Data$ccaa)

# Metodo Complete

hclus.complete.caso3 <- hclust(matriz.dis.euclid, method = "complete")
data.frame(hclus.complete.caso3[2:1])

##       height merge.1 merge.2
## 1   6.874591      -3     -18
## 2   7.153321      -4      -8
## 3   7.805767      -5     -11
## 4   9.387225     -16     -17
## 5  10.252317      -1       3
## 6  12.448695      -9     -12
## 7  14.598288     -15       5
## 8  14.798649      -7       2
## 9  14.940214     -10       4
## 10 17.063704      -6       7
## 11 21.199764       1       9
## 12 21.923731     -13       6
## 13 25.010598       8      12
## 14 25.558951      -2      10
## 15 31.948239     -14      11
## 16 43.448130      13      14
## 17 59.937134      15      16

plot(hclus.complete.caso3,labels=Data$ccaa)

# Metodo single
hclus.single.caso3 <- hclust(matriz.dis.euclid, method = "single")
data.frame(hclus.single.caso3[2:1])

##       height merge.1 merge.2
## 1   6.874591      -3     -18
## 2   7.153321      -4      -8
## 3   7.805767      -5     -11
## 4   8.426150      -1       3
## 5   9.387225     -16     -17
## 6   9.497368       1       5
## 7  10.520932      -7       2
## 8  11.179445      -6       4
## 9  12.345039     -15       8
## 10 12.448695      -9     -12
## 11 12.449498     -10       9
## 12 12.568612       6       7
## 13 12.748333      11      12
## 14 13.884884      10      13
## 15 15.236469     -13      14
## 16 17.449069      -2      15
## 17 19.176809     -14      16

plot(hclus.single.caso3,labels=Data$ccaa)

# Metodod Centroid 

hclus.centroid.caso3 <- hclust(matriz.dis.euclid, method = "centroid")
data.frame(hclus.centroid.caso3[2:1])

##       height merge.1 merge.2
## 1   6.874591      -3     -18
## 2   7.153321      -4      -8
## 3   7.805767      -5     -11
## 4   7.387792      -1       3
## 5   9.387225     -16     -17
## 6   8.315094       1       5
## 7  10.389278     -15       4
## 8  10.529321      -6       7
## 9  10.871460      -7       2
## 10 10.543548       8       9
## 11 11.542734       6      10
## 12 11.284691     -10      11
## 13 12.448695      -9     -12
## 14 16.023631      12      13
## 15 18.607642     -13      14
## 16 21.157165      -2      15
## 17 28.243987     -14      16

plot(hclus.centroid.caso3,labels=Data$ccaa)

# Generación de indices propuestos por NbClust

# Ward.D2
library(NbClust)
res.wardD2 <- NbClust(datos_numericos, distance = "euclidean",
                      min.nc = 2, max.nc = 15,
                      method ="ward.D2",
                      index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

# Complete

res.complete <- NbClust(datos_numericos, distance = "euclidean",
                      min.nc = 2, max.nc = 15,
                      method ="complete",
                      index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

# Average
res.average <- NbClust(datos_numericos, distance = "euclidean",
                      min.nc = 2, max.nc = 15,
                      method ="average",
                      index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 5 proposed 4 as the best number of clusters 
## * 3 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  15 
##  
##  
## *******************************************************************

# Centroides (Media de las 6 variables)
grupo.ward<-cutree(hclus.ward.caso3, k =2, h=NULL)
datos.caso3.grupos<-cbind(Data,grupo.ward) 
datos.caso3.grupos$ccaa <- NULL

round(aggregate(datos.caso3.grupos,list(grupo.ward), mean ),2)->datos3

3. Conglomerados no jerárquico

Realización de un análisis de conglomerados no jerárquico mediante el método de k-medias para la obtención de una solución óptima en términos de homogeneidad intrasegmentos y heterogeneidad intersegmentos

library(stats)
c1<-c(66.87,96.82,57.68,25.42,11.81,80.71) 
c2<-c(70.70,98.53,63.47,44.70,22.43,90.23)  
solucion <- kmeans(datos_numericos,rbind(c1,c2))
 
solucion

## K-means clustering with 2 clusters of sizes 12, 6
## 
## Cluster means:
##   automovil  tvcolor    video microondas lavavajilla telefono
## 1  66.86667 96.81667 57.67500     25.425    11.80833 80.70833
## 2  70.70000 98.53333 63.46667     44.700    22.43333 90.23333
## 
## Clustering vector:
##          España       Andalucia          Aragon        Asturias       Balerares 
##               1               1               2               1               1 
##        Canarias       Cantabria Castilla y León  Cast-La Mancha        Cataluña 
##               1               1               1               1               2 
## Com. Valenciana     Extremadura         Galicia          Madrid          Murcia 
##               1               1               1               2               1 
##         Navarra      País vasco        La Rioja 
##               2               2               2 
## 
## Within cluster sum of squares by cluster:
## [1] 2810.6467  848.3533
##  (between_SS / total_SS =  40.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Dendrogramas 

plot(hclus.ward.caso3,labels=Data$ccaa)
grupos1 <- cutree(hclus.ward.caso3, k = 2)
rect.hclust(hclus.ward.caso3, k = 2, border = "red")

 plot(hclus.average.caso3,labels=Data$ccaa)
 grupos2 <- cutree(hclus.average.caso3, k = 2)
 rect.hclust(hclus.average.caso3, k = 2, border = "red")

plot(hclus.complete.caso3,labels=Data$ccaa)
 grupos3 <- cutree(hclus.complete.caso3, k = 2)
 rect.hclust(hclus.complete.caso3, k = 2, border = "red")

#Pruebas T para cada variable dependiente 
solucion.cluster<-solucion$cluster
t1<-t.test(automovil~solucion.cluster, data = datos_numericos)
t2<-t.test(tvcolor~solucion.cluster, data = datos_numericos)
t3<-t.test(video~solucion.cluster, data = datos_numericos)
t4<-t.test(microondas~solucion.cluster, data = datos_numericos)
t5<-t.test(lavavajilla~solucion.cluster, data = datos_numericos)
t6<-t.test(telefono~solucion.cluster, data = datos_numericos)

library(knitr)
library(kableExtra)

vars <- c("automovil", "tvcolor", "video", "microondas", "lavavajilla", "telefono")


tabla_resultados <- data.frame(
  Variable = vars,
  Grupo1 = as.numeric(datos3[1, vars]),
  Grupo2 = as.numeric(datos3[2, vars]),
  Pruebas_t = c(abs(t1$statistic),
                abs(t2$statistic),
                abs(t3$statistic),
                abs(t4$statistic),
                abs(t5$statistic),
                abs(t6$statistic))
  
)

tabla_resultados %>%
  kable(caption = "Significatividad de las diferencias entre los perfiles de los conglomerados",
        align = "c" ,digits = 2) %>%
   kable_classic(html_font = "Times New Roman", font_size = 14) %>%  kable_styling()

Significatividad de las diferencias entre los perfiles de los conglomerados
Variable	Grupo1	Grupo2	Pruebas_t
automovil	66.87	70.70	1.81
tvcolor	96.82	98.53	2.52
video	57.68	63.47	1.19
microondas	25.42	44.70	6.73
lavavajilla	11.81	22.43	4.61
telefono	80.71	90.23	3.51

# Vizualización 
library(cluster)  
library(factoextra) 
vizu <- data.frame(datos_numericos, solucion$cluster)

 
colnames(vizu)[ncol(vizu)] <- "Cluster"

# Visualizar el resultado 
clusplot(vizu[,1:(ncol(vizu)-1)], 
         vizu$Cluster, 
         color = TRUE, 
         shade = TRUE,
         labels = 2, 
         lines = 0, 
         main = "Clusters K-means",
         xlab = "Componente 1", 
         ylab = "Componente 2")