Application: Cluster Analysis (Clustering)
UNIVERSIDAD DE EL SALVADOR
FACULTAD DE CIENCIAS ECONÓMICAS
ESCUELA DE ECONOMÍA
MÉTODOS PARA EL ANÁLISIS ECONÓMICO
TOPIC:
“Application: Cluster Analysis (Clustering)”
INSTRUCTOR:
MSF. Carlos Ademir Pérez Alas.
Theory group
GT-03
| Members | Student ID | Participation |
|---|---|---|
| Bonilla Amaya, Favio Andres | BA22008 | 100% |
| Umanzor Bonilla, Dina Esmeralda | UB22001 | 100% |
| Rodas Velasquez, Fernanda Elizabeth | RV22043 | 100% |
Ciudad Universitaria, January 10, 2025
Building the data table
library(dplyr)
library(knitr)
library(kableExtra)
datos3 <- tibble::tribble(
~"CC.AA.", ~"Automovil", ~"TV-Color", ~"Video", ~"Microondas", ~"Lavajillas", ~"Telefono",
"Espana", 69.0, 97.6, 62.4, 32.3, 17.0, 85.2,
"Andalucia", 66.7, 98.0, 82.7, 24.1, 12.7, 74.7,
"Aragon", 67.2, 97.5, 56.8, 43.4, 20.6, 88.4,
"Asturias", 63.7, 95.2, 52.1, 24.4, 13.3, 88.1,
"Balerares", 71.9, 98.8, 62.4, 29.8, 10.1, 87.9,
"Canarias", 72.7, 96.8, 68.4, 27.9, 5.8, 75.4,
"Cantabria", 63.4, 94.9, 48.9, 36.5, 11.2, 80.5,
"Castilla y Leon", 65.8, 97.1, 47.7, 28.1, 14.0, 85.0,
"Cast.-La Mancha", 61.5, 97.3, 53.6, 21.7, 7.1, 72.9,
"Cataluna", 70.4, 98.1, 71.1, 36.8, 19.8, 92.2,
"Com.-Valenciana", 72.7, 98.4, 68.2, 26.6, 12.1, 84.4,
"Extremadura", 60.5, 97.7, 43.7, 20.7, 11.7, 67.1,
"Galicia", 65.5, 91.3, 42.7, 13.5, 14.6, 85.9,
"Madrid", 74.0, 99.4, 76.3, 53.9, 32.3, 95.7,
"Murcia", 69.0, 98.7, 59.3, 19.5, 12.1, 81.4,
"Navarra", 76.4, 99.3, 60.6, 44.0, 20.6, 87.4,
"Pais Vasco", 71.3, 98.3, 61.6, 45.7, 23.7, 94.3,
"La Rioja", 64.9, 98.6, 54.4, 44.4, 17.6, 83.4
)
datos3 %>% kbl(caption = "Data Table",
               align = "c", digits = 3) %>%
  kable_styling(html_font = "Times New Roman", bootstrap_options = "striped",
                full_width = TRUE)

| CC.AA. | Automovil | TV-Color | Video | Microondas | Lavajillas | Telefono |
|---|---|---|---|---|---|---|
| Espana | 69.0 | 97.6 | 62.4 | 32.3 | 17.0 | 85.2 |
| Andalucia | 66.7 | 98.0 | 82.7 | 24.1 | 12.7 | 74.7 |
| Aragon | 67.2 | 97.5 | 56.8 | 43.4 | 20.6 | 88.4 |
| Asturias | 63.7 | 95.2 | 52.1 | 24.4 | 13.3 | 88.1 |
| Baleares | 71.9 | 98.8 | 62.4 | 29.8 | 10.1 | 87.9 |
| Canarias | 72.7 | 96.8 | 68.4 | 27.9 | 5.8 | 75.4 |
| Cantabria | 63.4 | 94.9 | 48.9 | 36.5 | 11.2 | 80.5 |
| Castilla y Leon | 65.8 | 97.1 | 47.7 | 28.1 | 14.0 | 85.0 |
| Cast.-La Mancha | 61.5 | 97.3 | 53.6 | 21.7 | 7.1 | 72.9 |
| Cataluna | 70.4 | 98.1 | 71.1 | 36.8 | 19.8 | 92.2 |
| Com.-Valenciana | 72.7 | 98.4 | 68.2 | 26.6 | 12.1 | 84.4 |
| Extremadura | 60.5 | 97.7 | 43.7 | 20.7 | 11.7 | 67.1 |
| Galicia | 65.5 | 91.3 | 42.7 | 13.5 | 14.6 | 85.9 |
| Madrid | 74.0 | 99.4 | 76.3 | 53.9 | 32.3 | 95.7 |
| Murcia | 69.0 | 98.7 | 59.3 | 19.5 | 12.1 | 81.4 |
| Navarra | 76.4 | 99.3 | 60.6 | 44.0 | 20.6 | 87.4 |
| Pais Vasco | 71.3 | 98.3 | 61.6 | 45.7 | 23.7 | 94.3 |
| La Rioja | 64.9 | 98.6 | 54.4 | 44.4 | 17.6 | 83.4 |
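The clustering that follows is built on Euclidean distances between regions. As a quick check, the distance that `dist()` returns for a pair of regions can be reproduced by hand; a minimal sketch, assuming the `datos3` table above (the pair Espana–Andalucia is chosen purely for illustration):

# Euclidean distance between Espana (row 1) and Andalucia (row 2),
# computed by hand as the square root of the summed squared differences
x <- unlist(datos3[1, -1])   # Espana
y <- unlist(datos3[2, -1])   # Andalucia
sqrt(sum((x - y)^2))

# It matches the corresponding entry of the distance matrix used below
as.matrix(dist(datos3[, -1], method = "euclidean"))[1, 2]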
Hierarchical clustering
# Create the distance matrix (excluding the "CC.AA." column)
matriz.distancias <- dist(datos3[,-1], method = "euclidean")
# Average-linkage hierarchical clustering
hclust.average.caso3 <- hclust(matriz.distancias, method = "average")
plot(hclust.average.caso3, labels = datos3$`CC.AA.`, main = "Hierarchical Clustering (Average Linkage)")

Finding the optimal number of clusters
library(NbClust)

# Keep only the numeric equipment variables for NbClust
Datos.NbClust <- datos3[, c("Automovil", "TV-Color", "Video", "Microondas", "Lavajillas", "Telefono")]
res.wardD2 <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15,
                      method = "ward.D2", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
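The vote count printed above can also be summarized graphically. A minimal sketch, assuming the `res.wardD2` object from the previous chunk and that its `Best.nc` component stores the number of clusters proposed by each index (purely graphical indices may report 0 and are dropped):

# Number of clusters proposed by each NbClust index
votos <- res.wardD2$Best.nc["Number_clusters", ]
votos <- votos[votos > 0]                      # drop graphical-only indices
barplot(table(votos),
        main = "Clusters proposed by the NbClust indices",
        xlab = "Number of clusters", ylab = "Number of indices")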
Ward's method
# Ward's method (ward.D2 squares the dissimilarities before merging)
hclust.ward <- hclust(matriz.distancias, method = "ward.D2")
plot(hclust.ward, labels = datos3$`CC.AA.`, main = "Hierarchical Clustering (Ward's Method)")

Centroid method
hclust.centroid <- hclust(matriz.distancias, method = "centroid")
plot(hclust.centroid, labels = datos3$`CC.AA.`, main = "Hierarchical Clustering (Centroid Method)")

Complete method (complete linkage)
hclust.complete <- hclust(matriz.distancias, method = "complete")
plot(hclust.complete, labels = datos3$`CC.AA.`, main = "Hierarchical Clustering (Complete Linkage)")

Single method (single linkage)
hclust.single <- hclust(matriz.distancias, method = "single")
plot(hclust.single, labels = datos3$`CC.AA.`, main = "Hierarchical Clustering (Single Linkage)")

Results obtained from the Ward clustering method
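The group assignment `grupo.ward` used below does not appear in any chunk above. A minimal sketch of how it could be obtained, assuming the Ward dendrogram `hclust.ward` is cut into the two groups used in the rest of the analysis:

# Cut the Ward dendrogram into 2 groups and highlight them on the dendrogram
grupo.ward <- cutree(hclust.ward, k = 2)
plot(hclust.ward, labels = datos3$`CC.AA.`, main = "Hierarchical Clustering (Ward's Method)")
rect.hclust(hclust.ward, k = 2, border = c("red", "blue"))
table(grupo.ward)   # number of regions in each group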
We obtain the centroids
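The chunk that produced the centroid table below is not shown. A minimal sketch, assuming the two-group assignment `grupo.ward` from the previous sketch (the exact column layout of the original output may differ):

# Mean of each variable within each Ward group; the non-numeric CC.AA.
# column cannot be averaged and is returned as NA, as in the output below
aggregate(datos3, by = list(grupo.ward), FUN = mean)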
## Group.1 CC.AA. Automovil TV-Color Video Microondas Lavajillas Telefono
## 1 1 NA 66.87 96.82 57.68 25.42 11.81 80.71
## 2 2 NA 70.70 98.53 63.47 44.70 22.43 90.23
## grupo.ward
## 1 1
## 2 2
Output where the hierarchical solution coincides with the non-hierarchical one
# Starting centers for k-means (approximately the Ward group centroids)
c1 <- c(66.87, 96.82, 56.01, 25.43, 11.81, 80.71)
c2 <- c(70.70, 98.53, 63.47, 44.70, 22.43, 90.23)
datos.caso3.grupos.kmeans <- datos3[, -1]
solucion <- kmeans(datos.caso3.grupos.kmeans, rbind(c1, c2))
solucion

## K-means clustering with 2 clusters of sizes 12, 6
##
## Cluster means:
## Automovil TV-Color Video Microondas Lavajillas Telefono
## 1 66.86667 96.81667 57.67500 25.425 11.80833 80.70833
## 2 70.70000 98.53333 63.46667 44.700 22.43333 90.23333
##
## Clustering vector:
## [1] 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 2810.6467 848.3533
## (between_SS / total_SS = 40.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Attach the k-means assignment and compare car ownership across the two clusters
datos.caso3.grupos.kmeans$cluster <- factor(solucion$cluster)
t.test(Automovil ~ cluster,
       data = datos.caso3.grupos.kmeans)

##
## Welch Two Sample t-test
##
## data: Automovil by cluster
## t = -1.8106, df = 10.091, p-value = 0.1
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## -8.5449256 0.8782589
## sample estimates:
## mean in group 1 mean in group 2
## 66.86667 70.70000
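The same Welch comparison can be repeated for the remaining equipment variables. A minimal sketch, assuming the `cluster` factor created above:

# Welch two-sample t-test of every equipment variable across the two clusters
variables <- c("Automovil", "TV-Color", "Video", "Microondas", "Lavajillas", "Telefono")
lapply(variables, function(v) {
  t.test(datos.caso3.grupos.kmeans[[v]] ~ datos.caso3.grupos.kmeans$cluster)
})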