A32-Aplicaciones de los conglomerados
UNIVERSIDAD DE EL SALVADOR
FACULTAD DE CIENCIAS ECONÓMICAS
ESCUELA DE ECONOMÍA
Ciclo II - 2025
“Aplicación: Análisis de Clúster (Conglomerados )”
Asignatura:
Métodos para el Análisis Económico
Grupo teórico:
GT-01
Docente:
MSF Carlos Ademir Pérez Alas
Integrantes:
Rosa Audelia Hernández Herrera — HH23026
Fátima Carolina Guillén Aguilar — GA22013
José Ricardo Vides Hernández — VH22011
Ciudad Universitaria, San Salvador – 30 de noviembre de 2025
Ejercicio
Importación de datos
library(readxl)
Data <- read_excel("~/Data_A32.xlsx")
nombres_ccaa <- Data$ccaa
datos_numericos <- Data[, -1 ]
rownames(datos_numericos) <- nombres_ccaa
head(datos_numericos)## # A tibble: 6 × 6
## automovil tvcolor video microondas lavavajilla telefono
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 69 97.6 62.4 32.3 17 85.2
## 2 66.7 98 82.7 24.1 12.7 74.7
## 3 67.2 97.5 56.8 43.4 20.6 88.4
## 4 63.7 95.2 52.1 24.4 13.3 88.1
## 5 71.9 98.8 62.4 29.8 10.1 87.9
## 6 72.7 96.8 68.4 27.9 5.8 75.4
1.Outliers
Análisis de la existencia de outliers en la medida en que pueden generar importantes distorsiones en la detección del número de grupos
mean <- colMeans(datos_numericos)
Sx <- cov(datos_numericos)
D2 <- mahalanobis(datos_numericos, mean, Sx)
# p-values usando Chi-cuadrado con df = número de variables
p_values <- 1 - pchisq(D2, df = ncol(datos_numericos))
outliers <- data.frame(D2 = round(D2, 2),
p_value = round(p_values, 2))
outliers## D2 p_value
## España 0.20 1.00
## Andalucia 10.52 0.10
## Aragon 1.91 0.93
## Asturias 4.46 0.61
## Balerares 5.70 0.46
## Canarias 9.58 0.14
## Cantabria 7.29 0.29
## Castilla y León 2.21 0.90
## Cast-La Mancha 3.54 0.74
## Cataluña 2.95 0.82
## Com. Valenciana 2.65 0.85
## Extremadura 10.43 0.11
## Galicia 13.24 0.04
## Madrid 8.31 0.22
## Murcia 4.88 0.56
## Navarra 7.65 0.26
## País vasco 2.32 0.89
## La Rioja 4.17 0.65
2.Conglomerados Jerárquicos
Realización de un análisis de conglomerados jerárquicos, evaluando la solución de distintos métodos de conglomeración, aplicando los criterios presentados para identificar el número adecuado de grupos y obtención de los centroides que han de servir de partida para el paso siguiente.
# Métodos de agrupamiento jerárquico
# Método Average
hclus.average.caso3 <- hclust(matriz.dis.euclid, method = "average")
data.frame(hclus.average.caso3[2:1])## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 9.339233 -1 3
## 5 9.387225 -16 -17
## 6 12.380548 1 5
## 7 12.448695 -9 -12
## 8 12.659790 -7 2
## 9 13.331971 -15 4
## 10 14.684330 -6 9
## 11 16.616569 -10 6
## 12 18.923865 -13 8
## 13 20.140313 7 12
## 14 22.109739 10 11
## 15 26.042489 13 14
## 16 31.452385 -2 15
## 17 39.104937 -14 16
# Método Ward.D2
hclus.ward.caso3 <- hclust(matriz.dis.euclid, method = "ward.D2")
data.frame(hclus.ward.caso3[2:1])## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 9.387225 -16 -17
## 5 9.853764 -1 3
## 6 12.448695 -9 -12
## 7 14.238563 -7 2
## 8 15.114011 -15 5
## 9 15.686087 -10 4
## 10 16.592408 -6 8
## 11 19.584858 1 9
## 12 22.402976 -13 7
## 13 27.124565 -2 10
## 14 27.214089 6 12
## 15 30.523870 -14 11
## 16 50.718899 13 14
## 17 70.785686 15 16
# Metodo Complete
hclus.complete.caso3 <- hclust(matriz.dis.euclid, method = "complete")
data.frame(hclus.complete.caso3[2:1])## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 9.387225 -16 -17
## 5 10.252317 -1 3
## 6 12.448695 -9 -12
## 7 14.598288 -15 5
## 8 14.798649 -7 2
## 9 14.940214 -10 4
## 10 17.063704 -6 7
## 11 21.199764 1 9
## 12 21.923731 -13 6
## 13 25.010598 8 12
## 14 25.558951 -2 10
## 15 31.948239 -14 11
## 16 43.448130 13 14
## 17 59.937134 15 16
# Metodo single
hclus.single.caso3 <- hclust(matriz.dis.euclid, method = "single")
data.frame(hclus.single.caso3[2:1])## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 8.426150 -1 3
## 5 9.387225 -16 -17
## 6 9.497368 1 5
## 7 10.520932 -7 2
## 8 11.179445 -6 4
## 9 12.345039 -15 8
## 10 12.448695 -9 -12
## 11 12.449498 -10 9
## 12 12.568612 6 7
## 13 12.748333 11 12
## 14 13.884884 10 13
## 15 15.236469 -13 14
## 16 17.449069 -2 15
## 17 19.176809 -14 16
# Metodod Centroid
hclus.centroid.caso3 <- hclust(matriz.dis.euclid, method = "centroid")
data.frame(hclus.centroid.caso3[2:1])## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 7.387792 -1 3
## 5 9.387225 -16 -17
## 6 8.315094 1 5
## 7 10.389278 -15 4
## 8 10.529321 -6 7
## 9 10.871460 -7 2
## 10 10.543548 8 9
## 11 11.542734 6 10
## 12 11.284691 -10 11
## 13 12.448695 -9 -12
## 14 16.023631 12 13
## 15 18.607642 -13 14
## 16 21.157165 -2 15
## 17 28.243987 -14 16
# Generación de indices propuestos por NbClust
# Ward.D2
library(NbClust)
res.wardD2 <- NbClust(datos_numericos, distance = "euclidean",
min.nc = 2, max.nc = 15,
method ="ward.D2",
index = "alllong")## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
# Complete
res.complete <- NbClust(datos_numericos, distance = "euclidean",
min.nc = 2, max.nc = 15,
method ="complete",
index = "alllong")## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
# Average
res.average <- NbClust(datos_numericos, distance = "euclidean",
min.nc = 2, max.nc = 15,
method ="average",
index = "alllong")## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 5 proposed 4 as the best number of clusters
## * 3 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 15
##
##
## *******************************************************************
3. Conglomerados no jerárquico
Realización de un análisis de conglomerados no jerárquico mediante el método de k-medias para la obtención de una solución óptima en términos de homogeneidad intrasegmentos y heterogeneidad intersegmentos
library(stats)
c1<-c(66.87,96.82,57.68,25.42,11.81,80.71)
c2<-c(70.70,98.53,63.47,44.70,22.43,90.23)
solucion <- kmeans(datos_numericos,rbind(c1,c2))
solucion## K-means clustering with 2 clusters of sizes 12, 6
##
## Cluster means:
## automovil tvcolor video microondas lavavajilla telefono
## 1 66.86667 96.81667 57.67500 25.425 11.80833 80.70833
## 2 70.70000 98.53333 63.46667 44.700 22.43333 90.23333
##
## Clustering vector:
## España Andalucia Aragon Asturias Balerares
## 1 1 2 1 1
## Canarias Cantabria Castilla y León Cast-La Mancha Cataluña
## 1 1 1 1 2
## Com. Valenciana Extremadura Galicia Madrid Murcia
## 1 1 1 2 1
## Navarra País vasco La Rioja
## 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 2810.6467 848.3533
## (between_SS / total_SS = 40.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Dendrogramas
plot(hclus.ward.caso3,labels=Data$ccaa)
grupos1 <- cutree(hclus.ward.caso3, k = 2)
rect.hclust(hclus.ward.caso3, k = 2, border = "red") plot(hclus.average.caso3,labels=Data$ccaa)
grupos2 <- cutree(hclus.average.caso3, k = 2)
rect.hclust(hclus.average.caso3, k = 2, border = "red")plot(hclus.complete.caso3,labels=Data$ccaa)
grupos3 <- cutree(hclus.complete.caso3, k = 2)
rect.hclust(hclus.complete.caso3, k = 2, border = "red")#Pruebas T para cada variable dependiente
solucion.cluster<-solucion$cluster
t1<-t.test(automovil~solucion.cluster, data = datos_numericos)
t2<-t.test(tvcolor~solucion.cluster, data = datos_numericos)
t3<-t.test(video~solucion.cluster, data = datos_numericos)
t4<-t.test(microondas~solucion.cluster, data = datos_numericos)
t5<-t.test(lavavajilla~solucion.cluster, data = datos_numericos)
t6<-t.test(telefono~solucion.cluster, data = datos_numericos)library(knitr)
library(kableExtra)
vars <- c("automovil", "tvcolor", "video", "microondas", "lavavajilla", "telefono")
tabla_resultados <- data.frame(
Variable = vars,
Grupo1 = as.numeric(datos3[1, vars]),
Grupo2 = as.numeric(datos3[2, vars]),
Pruebas_t = c(abs(t1$statistic),
abs(t2$statistic),
abs(t3$statistic),
abs(t4$statistic),
abs(t5$statistic),
abs(t6$statistic))
)
tabla_resultados %>%
kable(caption = "Significatividad de las diferencias entre los perfiles de los conglomerados",
align = "c" ,digits = 2) %>%
kable_classic(html_font = "Times New Roman", font_size = 14) %>% kable_styling()| Variable | Grupo1 | Grupo2 | Pruebas_t |
|---|---|---|---|
| automovil | 66.87 | 70.70 | 1.81 |
| tvcolor | 96.82 | 98.53 | 2.52 |
| video | 57.68 | 63.47 | 1.19 |
| microondas | 25.42 | 44.70 | 6.73 |
| lavavajilla | 11.81 | 22.43 | 4.61 |
| telefono | 80.71 | 90.23 | 3.51 |
# Vizualización
library(cluster)
library(factoextra)
vizu <- data.frame(datos_numericos, solucion$cluster)
colnames(vizu)[ncol(vizu)] <- "Cluster"
# Visualizar el resultado
clusplot(vizu[,1:(ncol(vizu)-1)],
vizu$Cluster,
color = TRUE,
shade = TRUE,
labels = 2,
lines = 0,
main = "Clusters K-means",
xlab = "Componente 1",
ylab = "Componente 2")