Aplicación: Análisis de Clúster (Conglomerados)

UNIVERSIDAD DE EL SALVADOR

FACULTAD DE CIENCIAS ECONÓMICAS

ESCUELA DE ECONOMÍA

MÉTODOS PARA EL ANÁLISIS ECONÓMICO

TEMA:

“Aplicación: Análisis de Clúster (Conglomerados) ”

DOCENTE:

MSF. Carlos Ademir Pérez Alas.

Grupo teórico

GT-03

Integrantes:	Carnet:	Participación
Bonilla Amaya, Favio Andres	BA22008	100%
Umanzor Bonilla, Dina Esmeralda	UB22001	100%
Rodas Velasquez, Fernanda Elizabeth	RV22043	100%

Ciudad Universitaria, 10 de enero de 2025

Formación de la tabla

library(dplyr)
library(knitr)
library(kableExtra)

datos3 <- tibble::tribble(
  ~"CC.AA.", ~"Automovil", ~"TV-Color", ~"Video", ~"Microondas", ~"Lavajillas", ~"Telefono",
  "Espana", 69.0, 97.6, 62.4, 32.3, 17.0, 85.2,
  "Andalucia", 66.7, 98.0, 82.7, 24.1, 12.7, 74.7,
  "Aragon", 67.2, 97.5, 56.8, 43.4, 20.6, 88.4,
  "Asturias", 63.7, 95.2, 52.1, 24.4, 13.3, 88.1,
  "Balerares", 71.9, 98.8, 62.4, 29.8, 10.1, 87.9,
  "Canarias", 72.7, 96.8, 68.4, 27.9, 5.8, 75.4,
  "Cantabria", 63.4, 94.9, 48.9, 36.5, 11.2, 80.5,
  "Castilla y Leon", 65.8, 97.1, 47.7, 28.1, 14.0, 85.0,
  "Cast.-La Mancha", 61.5, 97.3, 53.6, 21.7, 7.1, 72.9,
  "Cataluna", 70.4, 98.1, 71.1, 36.8, 19.8, 92.2,
  "Com.-Valenciana", 72.7, 98.4, 68.2, 26.6, 12.1, 84.4,
  "Extremadura", 60.5, 97.7, 43.7, 20.7, 11.7, 67.1,
  "Galicia", 65.5, 91.3, 42.7, 13.5, 14.6, 85.9,
  "Madrid", 74.0, 99.4, 76.3, 53.9, 32.3, 95.7,
  "Murcia", 69.0, 98.7, 59.3, 19.5, 12.1, 81.4,
  "Navarra", 76.4, 99.3, 60.6, 44.0, 20.6, 87.4,
  "Pais Vasco", 71.3, 98.3, 61.6, 45.7, 23.7, 94.3,
  "La Rioja", 64.9, 98.6, 54.4, 44.4, 17.6, 83.4
)
datos3 %>% kbl(caption = "Tabla de Datos",
        align = "c", digits = 3) %>% 
  kable_styling(html_font = "Times New Roman", bootstrap_options = "striped",
                full_width = TRUE)

Tabla de Datos
CC.AA.	Automovil	TV-Color	Video	Microondas	Lavajillas	Telefono
Espana	69.0	97.6	62.4	32.3	17.0	85.2
Andalucia	66.7	98.0	82.7	24.1	12.7	74.7
Aragon	67.2	97.5	56.8	43.4	20.6	88.4
Asturias	63.7	95.2	52.1	24.4	13.3	88.1
Balerares	71.9	98.8	62.4	29.8	10.1	87.9
Canarias	72.7	96.8	68.4	27.9	5.8	75.4
Cantabria	63.4	94.9	48.9	36.5	11.2	80.5
Castilla y Leon	65.8	97.1	47.7	28.1	14.0	85.0
Cast.-La Mancha	61.5	97.3	53.6	21.7	7.1	72.9
Cataluna	70.4	98.1	71.1	36.8	19.8	92.2
Com.-Valenciana	72.7	98.4	68.2	26.6	12.1	84.4
Extremadura	60.5	97.7	43.7	20.7	11.7	67.1
Galicia	65.5	91.3	42.7	13.5	14.6	85.9
Madrid	74.0	99.4	76.3	53.9	32.3	95.7
Murcia	69.0	98.7	59.3	19.5	12.1	81.4
Navarra	76.4	99.3	60.6	44.0	20.6	87.4
Pais Vasco	71.3	98.3	61.6	45.7	23.7	94.3
La Rioja	64.9	98.6	54.4	44.4	17.6	83.4

Clustering jerárquico

# Crear matriz de distancias (excluyendo la columna "CC.AA.")
matriz.distancias <- dist(datos3[,-1], method = "euclidean")
hclust.average.caso3 <- hclust(matriz.distancias, method = "average")
plot(hclust.average.caso3, labels = datos3$`CC.AA.`, main = "Clustering Jerárquico (Promedio)")

Para encontrar el número óptimo de clusters

Datos.NbClust<- datos3[,c("Automovil","TV-Color","Video","Microondas","Lavajillas","Telefono")]
res.wardD2 <- NbClust(Datos.NbClust,distance = "euclidean", min.nc=2,max.nc=15, method = "ward.D2", index ="alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Método Ward

# Método Ward
hclust.ward <- hclust(matriz.distancias, method = "ward.D2")
plot(hclust.ward, labels = datos3$`CC.AA.`, main = "Clustering Jerárquico (Método Ward)")

Método Centroide

hclust.centroid <- hclust(matriz.distancias, method = "centroid")
plot(hclust.centroid, labels = datos3$`CC.AA.`, main = "Clustering Jerárquico (Método Centroide)")

Método Complete (Enlace completo)

hclust.complete <- hclust(matriz.distancias, method = "complete")
plot(hclust.complete, labels = datos3$`CC.AA.`, main = "Clustering Jerárquico (Método Complete)")

Método Single (Enlace simple)

hclust.single <- hclust(matriz.distancias, method = "single")
plot(hclust.single, labels = datos3$`CC.AA.`, main = "Clustering Jerárquico (Método Single)")

Datos obtenidos del método de conglomeracióon Ward

grupo.ward <- cutree(hclust.ward, k =2, h= NULL)
datos.caso3.grupos<- cbind(datos3, grupo.ward)
datos.caso3.grupos$id<- NULL

Obtenemos los centroides

round(aggregate(datos.caso3.grupos,list(grupo.ward), mean),2)

##   Group.1 CC.AA. Automovil TV-Color Video Microondas Lavajillas Telefono
## 1       1     NA     66.87    96.82 57.68      25.42      11.81    80.71
## 2       2     NA     70.70    98.53 63.47      44.70      22.43    90.23
##   grupo.ward
## 1          1
## 2          2

Salida donde la solución del jerárquico coincide con la del no jerárquico

c1<- c(66.87,96.82,56.01, 25.43,11.81,80.71)
c2 <- c(70.70,98.53,63.47,44.70,22.43,90.23)
datos.caso3.grupos.kmeans <- datos3[, -1]
solucion<- kmeans(datos.caso3.grupos.kmeans,rbind(c1,c2))
solucion

## K-means clustering with 2 clusters of sizes 12, 6
## 
## Cluster means:
##   Automovil TV-Color    Video Microondas Lavajillas Telefono
## 1  66.86667 96.81667 57.67500     25.425   11.80833 80.70833
## 2  70.70000 98.53333 63.46667     44.700   22.43333 90.23333
## 
## Clustering vector:
##  [1] 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 2810.6467  848.3533
##  (between_SS / total_SS =  40.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

datos.caso3.grupos.kmeans$cluster <- factor(solucion$cluster)
t.test(Automovil ~ cluster, 
       data = datos.caso3.grupos.kmeans)

## 
##  Welch Two Sample t-test
## 
## data:  Automovil by cluster
## t = -1.8106, df = 10.091, p-value = 0.1
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -8.5449256  0.8782589
## sample estimates:
## mean in group 1 mean in group 2 
##        66.86667        70.70000