Aplicación: Análisis de Clúster (Conglomerados)

UNIVERSIDAD DE EL SALVADOR

FACULTAD DE CIENCIAS ECONÓMICAS

ESCUELA DE ECONOMÍA

MÉTODOS PARA EL ANÁLISIS ECONÓMICO

image/svg+xml

TEMA:

Aplicación: Análisis de Clúster (Conglomerados)

DOCENTE:

MSF. Carlos Ademir Pérez Alas.

Grupo teórico

GT-03

Integrantes: Carnet: Participación
Bonilla Amaya, Favio Andres BA22008 100%
Umanzor Bonilla, Dina Esmeralda UB22001 100%
Rodas Velasquez, Fernanda Elizabeth RV22043 100%

Ciudad Universitaria, 10 de enero de 2025

Formación de la tabla

library(dplyr)
library(knitr)
library(kableExtra)

datos3 <- tibble::tribble(
  ~"CC.AA.", ~"Automovil", ~"TV-Color", ~"Video", ~"Microondas", ~"Lavajillas", ~"Telefono",
  "Espana", 69.0, 97.6, 62.4, 32.3, 17.0, 85.2,
  "Andalucia", 66.7, 98.0, 82.7, 24.1, 12.7, 74.7,
  "Aragon", 67.2, 97.5, 56.8, 43.4, 20.6, 88.4,
  "Asturias", 63.7, 95.2, 52.1, 24.4, 13.3, 88.1,
  "Balerares", 71.9, 98.8, 62.4, 29.8, 10.1, 87.9,
  "Canarias", 72.7, 96.8, 68.4, 27.9, 5.8, 75.4,
  "Cantabria", 63.4, 94.9, 48.9, 36.5, 11.2, 80.5,
  "Castilla y Leon", 65.8, 97.1, 47.7, 28.1, 14.0, 85.0,
  "Cast.-La Mancha", 61.5, 97.3, 53.6, 21.7, 7.1, 72.9,
  "Cataluna", 70.4, 98.1, 71.1, 36.8, 19.8, 92.2,
  "Com.-Valenciana", 72.7, 98.4, 68.2, 26.6, 12.1, 84.4,
  "Extremadura", 60.5, 97.7, 43.7, 20.7, 11.7, 67.1,
  "Galicia", 65.5, 91.3, 42.7, 13.5, 14.6, 85.9,
  "Madrid", 74.0, 99.4, 76.3, 53.9, 32.3, 95.7,
  "Murcia", 69.0, 98.7, 59.3, 19.5, 12.1, 81.4,
  "Navarra", 76.4, 99.3, 60.6, 44.0, 20.6, 87.4,
  "Pais Vasco", 71.3, 98.3, 61.6, 45.7, 23.7, 94.3,
  "La Rioja", 64.9, 98.6, 54.4, 44.4, 17.6, 83.4
)
datos3 %>% kbl(caption = "Tabla de Datos",
        align = "c", digits = 3) %>% 
  kable_styling(html_font = "Times New Roman", bootstrap_options = "striped",
                full_width = TRUE)
Tabla de Datos
CC.AA. Automovil TV-Color Video Microondas Lavajillas Telefono
Espana 69.0 97.6 62.4 32.3 17.0 85.2
Andalucia 66.7 98.0 82.7 24.1 12.7 74.7
Aragon 67.2 97.5 56.8 43.4 20.6 88.4
Asturias 63.7 95.2 52.1 24.4 13.3 88.1
Balerares 71.9 98.8 62.4 29.8 10.1 87.9
Canarias 72.7 96.8 68.4 27.9 5.8 75.4
Cantabria 63.4 94.9 48.9 36.5 11.2 80.5
Castilla y Leon 65.8 97.1 47.7 28.1 14.0 85.0
Cast.-La Mancha 61.5 97.3 53.6 21.7 7.1 72.9
Cataluna 70.4 98.1 71.1 36.8 19.8 92.2
Com.-Valenciana 72.7 98.4 68.2 26.6 12.1 84.4
Extremadura 60.5 97.7 43.7 20.7 11.7 67.1
Galicia 65.5 91.3 42.7 13.5 14.6 85.9
Madrid 74.0 99.4 76.3 53.9 32.3 95.7
Murcia 69.0 98.7 59.3 19.5 12.1 81.4
Navarra 76.4 99.3 60.6 44.0 20.6 87.4
Pais Vasco 71.3 98.3 61.6 45.7 23.7 94.3
La Rioja 64.9 98.6 54.4 44.4 17.6 83.4

Clustering jerárquico

# Crear matriz de distancias (excluyendo la columna "CC.AA.")
matriz.distancias <- dist(datos3[,-1], method = "euclidean")
hclust.average.caso3 <- hclust(matriz.distancias, method = "average")
plot(hclust.average.caso3, labels = datos3$`CC.AA.`, main = "Clustering Jerárquico (Promedio)")

Para encontrar el número óptimo de clusters

Datos.NbClust<- datos3[,c("Automovil","TV-Color","Video","Microondas","Lavajillas","Telefono")]
res.wardD2 <- NbClust(Datos.NbClust,distance = "euclidean", min.nc=2,max.nc=15, method = "ward.D2", index ="alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Método Ward

# Método Ward
hclust.ward <- hclust(matriz.distancias, method = "ward.D2")
plot(hclust.ward, labels = datos3$`CC.AA.`, main = "Clustering Jerárquico (Método Ward)")

Método Centroide

hclust.centroid <- hclust(matriz.distancias, method = "centroid")
plot(hclust.centroid, labels = datos3$`CC.AA.`, main = "Clustering Jerárquico (Método Centroide)")

Método Complete (Enlace completo)

hclust.complete <- hclust(matriz.distancias, method = "complete")
plot(hclust.complete, labels = datos3$`CC.AA.`, main = "Clustering Jerárquico (Método Complete)")

Método Single (Enlace simple)

hclust.single <- hclust(matriz.distancias, method = "single")
plot(hclust.single, labels = datos3$`CC.AA.`, main = "Clustering Jerárquico (Método Single)")

Datos obtenidos del método de conglomeracióon Ward

grupo.ward <- cutree(hclust.ward, k =2, h= NULL)
datos.caso3.grupos<- cbind(datos3, grupo.ward)
datos.caso3.grupos$id<- NULL

Obtenemos los centroides

round(aggregate(datos.caso3.grupos,list(grupo.ward), mean),2)
##   Group.1 CC.AA. Automovil TV-Color Video Microondas Lavajillas Telefono
## 1       1     NA     66.87    96.82 57.68      25.42      11.81    80.71
## 2       2     NA     70.70    98.53 63.47      44.70      22.43    90.23
##   grupo.ward
## 1          1
## 2          2

Salida donde la solución del jerárquico coincide con la del no jerárquico

c1<- c(66.87,96.82,56.01, 25.43,11.81,80.71)
c2 <- c(70.70,98.53,63.47,44.70,22.43,90.23)
datos.caso3.grupos.kmeans <- datos3[, -1]
solucion<- kmeans(datos.caso3.grupos.kmeans,rbind(c1,c2))
solucion
## K-means clustering with 2 clusters of sizes 12, 6
## 
## Cluster means:
##   Automovil TV-Color    Video Microondas Lavajillas Telefono
## 1  66.86667 96.81667 57.67500     25.425   11.80833 80.70833
## 2  70.70000 98.53333 63.46667     44.700   22.43333 90.23333
## 
## Clustering vector:
##  [1] 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 2810.6467  848.3533
##  (between_SS / total_SS =  40.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
datos.caso3.grupos.kmeans$cluster <- factor(solucion$cluster)
t.test(Automovil ~ cluster, 
       data = datos.caso3.grupos.kmeans)
## 
##  Welch Two Sample t-test
## 
## data:  Automovil by cluster
## t = -1.8106, df = 10.091, p-value = 0.1
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -8.5449256  0.8782589
## sample estimates:
## mean in group 1 mean in group 2 
##        66.86667        70.70000