“A33 - Aplicación Análisis de Clúster (Conglomerados)”

UNIVERSIDAD DE EL SALVADOR

FACULTAD DE CIENCIAS ECONOMICAS

ESCUELA DE ECONOMIA


Métodos para el Análisis Económico

GT03 CICLO II/2024

CATEDRATICO: Carlos Ademir Perez Alas

Guía: “Aplicación Análisis de Clúster (Conglomerados)”

Integrantes:

Cortez Hércules, José Vladimir CH21018

Deodanes Abrego, Cristian Jeferson DA21012

Linares Rodriguez, Walter Jose LR21023

Ciudad Universitaria, 7 Enero de 2025.

Cuadro 3.22

library(stats)
library(NbClust)
library(kableExtra)
library(MASS)

# Datos del Cuadro 3.22
Datos_3_3_Caso <- data.frame(
  CC.AA. = as.character(c("España", "Andalucía", "Aragón", "Asturias", "Balerares", "Canarias", "Cantabria",
                             "Castilla y León", "Cast.-La Mancha", "Cataluña", "Com. Valenciana", "Extremadura",
                             "Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja")),
  Automóvil = as.numeric(c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9)),
  TV_Color = as.numeric(c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6)),
  Video = as.numeric(c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4)),
  Microondas = as.numeric(c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4)),
  Lavavajillas = as.numeric(c(17.0, 12.7, 20.6, 13.3, 10.1, 5.80, 11.2, 14.0, 7.10, 19.8, 12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6)),
  Teléfono = as.numeric(c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4))
)
kable_classic(kable(Datos_3_3_Caso))
CC.AA. Automóvil TV_Color Video Microondas Lavavajillas Teléfono
España 69.0 97.6 62.4 32.3 17.0 85.2
Andalucía 66.7 98.0 82.7 24.1 12.7 74.7
Aragón 67.2 97.5 56.8 43.4 20.6 88.4
Asturias 63.7 95.2 52.1 24.4 13.3 88.1
Balerares 71.9 98.8 62.4 29.8 10.1 87.9
Canarias 72.7 96.8 68.4 27.9 5.8 75.4
Cantabria 63.4 94.9 48.9 36.5 11.2 80.5
Castilla y León 65.8 97.1 47.7 28.1 14.0 85.0
Cast.-La Mancha 61.5 97.3 53.6 21.7 7.1 72.9
Cataluña 70.4 98.1 71.1 36.8 19.8 92.2
Com. Valenciana 72.7 98.4 68.2 26.6 12.1 84.4
Extremadura 60.5 97.7 43.7 20.7 11.7 67.1
Galicia 65.5 91.3 42.7 13.5 14.6 85.9
Madrid 74.0 99.4 76.3 53.9 32.3 95.7
Murcia 69.0 98.7 59.3 19.5 12.1 81.4
Navarra 76.4 99.3 60.6 44.0 20.6 87.4
País Vasco 71.3 98.3 61.6 45.7 23.7 94.3
La Rioja 64.9 98.6 54.4 44.4 17.6 83.4

Cuadro 3.23

library(dplyr)
Datos.NbClust <- Datos_3_3_Caso[, c("Automóvil", "TV_Color", "Video", "Microondas", "Lavavajillas", "Teléfono")]
mean<-colMeans(Datos.NbClust) 
Sx<-cov(Datos.NbClust) 
D2<-mahalanobis(Datos.NbClust,mean,Sx, inverted = FALSE) 
pvalue=pchisq(D2 , df=6, lower.tail=FALSE) 
data0=data.frame(D2,pvalue)
data0=cbind(data0,Datos_3_3_Caso$CC.AA.)
names(data0)=c("D2","pvalue","CC.AA.")
data0=select(data0,CC.AA.,D2,pvalue)
kable_classic(kable(data0))
CC.AA. D2 pvalue
España 0.1957042 0.9998549
Andalucía 10.5207676 0.1043658
Aragón 1.9097471 0.9278091
Asturias 4.4628800 0.6142959
Balerares 5.7026801 0.4573061
Canarias 9.5791345 0.1435313
Cantabria 7.2910646 0.2947664
Castilla y León 2.2064805 0.8997629
Cast.-La Mancha 3.5390966 0.7387610
Cataluña 2.9473889 0.8154205
Com. Valenciana 2.6469747 0.8516698
Extremadura 10.4333455 0.1075495
Galicia 13.2415547 0.0393564
Madrid 8.3056697 0.2165535
Murcia 4.8751556 0.5599224
Navarra 7.6517494 0.2647430
País Vasco 2.3173225 0.8883259
La Rioja 4.1732837 0.6532394

Figura 3.11

matriz.dis.euclid.caso3= dist(Datos_3_3_Caso[, c("Automóvil", "TV_Color", "Video", "Microondas", "Lavavajillas", "Teléfono")], method="euclidean", diag=TRUE)

hclust.ward.caso3<-hclust(matriz.dis.euclid.caso3,method="ward.D2")
plot (hclust.ward.caso3,labels=Datos_3_3_Caso$CC.AA.)

hclust.average.caso3<-hclust(matriz.dis.euclid.caso3, method="average")
plot (hclust.average.caso3,labels=Datos_3_3_Caso$CC.AA.)

data.frame(hclust.average.caso3[2:1]) 
##       height merge.1 merge.2
## 1   6.874591      -3     -18
## 2   7.153321      -4      -8
## 3   7.805767      -5     -11
## 4   9.339233      -1       3
## 5   9.387225     -16     -17
## 6  12.380548       1       5
## 7  12.448695      -9     -12
## 8  12.659790      -7       2
## 9  13.331971     -15       4
## 10 14.684330      -6       9
## 11 16.616569     -10       6
## 12 18.923865     -13       8
## 13 20.140313       7      12
## 14 22.109739      10      11
## 15 26.042489      13      14
## 16 31.452385      -2      15
## 17 39.104937     -14      16
hclust.complete.caso3<-hclust(matriz.dis.euclid.caso3,method="complete")
plot (hclust.complete.caso3,labels=Datos_3_3_Caso$CC.AA.)

hclust.single.caso3<-hclust(matriz.dis.euclid.caso3,method="single")
plot (hclust.single.caso3,labels=Datos_3_3_Caso$CC.AA.)

hclust.centroid.caso3<-hclust(matriz.dis.euclid.caso3,method="centroid")
plot (hclust.centroid.caso3,labels=Datos_3_3_Caso$CC.AA.)

Figura 3.12

library(NbClust)
Datos.NbClust <- Datos_3_3_Caso[, c("Automóvil", "TV_Color", "Video", "Microondas", "Lavavajillas", "Teléfono")]

# Ejecutar NbClust para obtener el índice CCC con diferentes métodos
res.average <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15, method = "average", index = "ccc")
res.complete <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15, method = "complete", index = "ccc")
res.ward <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15, method = "ward.D2", index = "ccc")

# Extraer los valores de CCC
clusters <- 2:15
ccc_average <- res.average$All.index
ccc_complete <- res.complete$All.index
ccc_ward <- res.ward$All.index

# Crear un dataframe con los resultados
df <- data.frame(
  cluster = clusters,
  average = ccc_average,
  complete = ccc_complete,
  ward = ccc_ward
)

# Transformar a formato largo
library(tidyr)
df_long <- pivot_longer(df, cols = -cluster, names_to = "método", values_to = "CCC")

# Graficar con ggplot
library(ggplot2)
ggplot(df_long, aes(x = cluster, y = CCC, color = método, linetype = método)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  labs(
    title = "Proceso de determinación del centróide inicial",
    x = "Número de Clusters",
    y = "CCC",
    caption = "Figura 3.12"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.title = element_blank()
  )

Datos.NbClust<-Datos_3_3_Caso[, c("Automóvil", "TV_Color", "Video", "Microondas", "Lavavajillas", "Teléfono")]

res.wardD2<-NbClust(Datos.NbClust, distance = "euclidean" , 
min.nc=2, max.nc=15 , method = "ward.D2" , index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Cuadro 3.24

grupo.ward<-cutree(hclust.ward.caso3, k = 2, h = NULL) 
datos.caso3.grupos<-cbind(Datos_3_3_Caso,grupo.ward) 
datos.caso3.grupos$CC.AA. <-NULL 
round(aggregate(datos.caso3.grupos,list(grupo.ward), mean ),2)->datos.caso3
print(datos.caso3)
##   Group.1 Automóvil TV_Color Video Microondas Lavavajillas Teléfono grupo.ward
## 1       1     66.87    96.82 57.68      25.42        11.81    80.71          1
## 2       2     70.70    98.53 63.47      44.70        22.43    90.23          2
datos_kmeans <- Datos_3_3_Caso[, -1]

# Definir los centroides iniciales
c1 <- c(66.87, 96.82, 56.01, 25.43, 11.81, 80.71)
c2 <- c(70.70, 98.53, 63.47, 44.70, 22.43, 90.23)

set.seed(123)
solucion <- kmeans(datos_kmeans, centers = rbind(c1, c2)) |> print( )
## K-means clustering with 2 clusters of sizes 12, 6
## 
## Cluster means:
##   Automóvil TV_Color    Video Microondas Lavavajillas Teléfono
## 1  66.86667 96.81667 57.67500     25.425     11.80833 80.70833
## 2  70.70000 98.53333 63.46667     44.700     22.43333 90.23333
## 
## Clustering vector:
##  [1] 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 2810.6467  848.3533
##  (between_SS / total_SS =  40.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Figura 3.13

hclust.ward.caso3<-hclust(matriz.dis.euclid.caso3,method="ward.D2")
plot (hclust.ward.caso3,labels=Datos_3_3_Caso$CC.AA.)
groups <- cutree(hclust.ward.caso3, k = 2)
rect.hclust(hclust.ward.caso3, k=2, border="red")

hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")
plot(hclust.average.caso3, labels = Datos_3_3_Caso$CC.AA.)
groups <- cutree(hclust.average.caso3, k = 2)
rect.hclust(hclust.average.caso3, k = 2, border = "red")

hclust.complete.caso3<-hclust(matriz.dis.euclid.caso3,method="complete")
plot (hclust.complete.caso3,labels=Datos_3_3_Caso$CC.AA.)
groups <- cutree(hclust.complete.caso3, k = 2)
rect.hclust(hclust.complete.caso3, k=2, border="red")

Cuadro 3.25

library(psych)
DatosCaso3.1b = Datos.NbClust
#efectuamos eL cLuster con metodo centroide 
kmeans.caso3.1<-kmeans(DatosCaso3.1b, 2) 
#obtenemos Las medias 
aggregate(DatosCaso3.1b,by= 
list(kmeans.caso3.1$cluster),FUN=mean) 
##   Group.1 Automóvil TV_Color    Video Microondas Lavavajillas Teléfono
## 1       1  66.86667 96.81667 57.67500     25.425     11.80833 80.70833
## 2       2  70.70000 98.53333 63.46667     44.700     22.43333 90.23333
#adicionamos La pertenencia aL cLuster
DatosCaso3.1b=data.frame(DatosCaso3.1b, kmeans.caso3.1$cluster)  
kmeans.caso3.1.cluster=kmeans.caso3.1$cluster

t.test(Automóvil ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b) 
## 
##  Welch Two Sample t-test
## 
## data:  Automóvil by kmeans.caso3.1.cluster
## t = -1.8106, df = 10.091, p-value = 0.1
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -8.5449256  0.8782589
## sample estimates:
## mean in group 1 mean in group 2 
##        66.86667        70.70000

Cuadro 3.26

t.test(Automóvil ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b) -> t1
t.test(TV_Color ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b) -> t2
t.test(Video  ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b)-> t3
t.test(Microondas ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b)-> t4
t.test(Lavavajillas ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b)-> t5
t.test(Teléfono ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b)-> t6

pruebas.t=as.data.frame(c(t1$statistic,t2$statistic,t3$statistic,t4$statistic,t5$statistic,t6$statistic)) 

datos33<-as.data.frame( datos.caso3)
datos33$grupo.ward <- NULL 
datos33$Group.1 <- NULL
datos33=t(datos33) 
names(pruebas.t)=c("pruebas t")
tados33=cbind(datos33,pruebas.t) 
names(tados33)=c("grupo 1","grupo 2","pruebas t")


add_footnote(kable_classic(kable(head(tados33),caption = " Significatividad de las diferencias entre los perfiles de los conglo merados "),html_font = "Arial",font_size=14 ),label = "**p < o, 01 ;* p < o, 05 ", notation ="symbol")
Significatividad de las diferencias entre los perfiles de los conglo merados
grupo 1 grupo 2 pruebas t
Automóvil 66.87 70.70 -1.810598
TV_Color 96.82 98.53 -2.515162
Video 57.68 63.47 -1.188830
Microondas 25.42 44.70 -6.734076
Lavavajillas 11.81 22.43 -4.605368
Teléfono 80.71 90.23 -3.507692
* **p < o, 01 ;* p < o, 05
library(cluster)
library(fpc)

aggregate( Datos.NbClust,by= list(kmeans.caso3.1$cluster),FUN=mean)
##   Group.1 Automóvil TV_Color    Video Microondas Lavavajillas Teléfono
## 1       1  66.86667 96.81667 57.67500     25.425     11.80833 80.70833
## 2       2  70.70000 98.53333 63.46667     44.700     22.43333 90.23333
mydata <- data.frame( Datos.NbClust,kmeans.caso3.1$cluster)
fit <- kmeans(mydata, 2)
library(cluster)

clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE,
   labels=2, lines=0)

Figura 3.14

library(ggrepel)

datos_numericos <- Datos_3_3_Caso[, -1]

pca <- prcomp(datos_numericos, scale = TRUE)
pca_df <- as.data.frame(pca$x)
pca_df$Grupo <- as.factor(groups)
pca_data <- data.frame(
  Componente1 = pca$x[, 1],
  Componente2 = pca$x[, 2],
  CC.AA. = Datos_3_3_Caso$CC.AA.
)

destacadas <- c("Madrid", "Cataluña", "Navarra", "País Vasco", "La Rioja", "Aragón")

ggplot(pca_data, aes(x = Componente1, y = Componente2, label = CC.AA.)) +
  geom_point(color = "black", size = 3) +
  geom_text_repel(aes(color = CC.AA. %in% destacadas), size = 3) +
  geom_point(data = subset(pca_data, CC.AA. %in% destacadas),
             aes(x = Componente1, y = Componente2), color = "red", shape = 17, size = 4) +
  scale_color_manual(values = c("black", "red"), guide = "none") +
  theme_minimal() +
  labs(
    x = "Componente Principal 1",
    y = "Componente Principal 2",
    title = "Análisis de Componentes Principales (PCA)"
  )