“A33 - Aplicación Análisis de Clúster (Conglomerados)”

UNIVERSIDAD DE EL SALVADOR
FACULTAD DE CIENCIAS ECONOMICAS
ESCUELA DE ECONOMIA

Métodos para el Análisis Económico
GT03 CICLO II/2024
CATEDRATICO: Carlos Ademir Perez Alas
Guía: “Aplicación Análisis de Clúster (Conglomerados)”
Integrantes:
Cortez Hércules, José Vladimir CH21018
Deodanes Abrego, Cristian Jeferson DA21012
Linares Rodriguez, Walter Jose LR21023
Ciudad Universitaria, 7 Enero de 2025.

Cuadro 3.22

library(stats)
library(NbClust)
library(kableExtra)
library(MASS)

# Datos del Cuadro 3.22
Datos_3_3_Caso <- data.frame(
  CC.AA. = as.character(c("España", "Andalucía", "Aragón", "Asturias", "Balerares", "Canarias", "Cantabria",
                             "Castilla y León", "Cast.-La Mancha", "Cataluña", "Com. Valenciana", "Extremadura",
                             "Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja")),
  Automóvil = as.numeric(c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9)),
  TV_Color = as.numeric(c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6)),
  Video = as.numeric(c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4)),
  Microondas = as.numeric(c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4)),
  Lavavajillas = as.numeric(c(17.0, 12.7, 20.6, 13.3, 10.1, 5.80, 11.2, 14.0, 7.10, 19.8, 12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6)),
  Teléfono = as.numeric(c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4))
)
kable_classic(kable(Datos_3_3_Caso))

CC.AA.	Automóvil	TV_Color	Video	Microondas	Lavavajillas	Teléfono
España	69.0	97.6	62.4	32.3	17.0	85.2
Andalucía	66.7	98.0	82.7	24.1	12.7	74.7
Aragón	67.2	97.5	56.8	43.4	20.6	88.4
Asturias	63.7	95.2	52.1	24.4	13.3	88.1
Balerares	71.9	98.8	62.4	29.8	10.1	87.9
Canarias	72.7	96.8	68.4	27.9	5.8	75.4
Cantabria	63.4	94.9	48.9	36.5	11.2	80.5
Castilla y León	65.8	97.1	47.7	28.1	14.0	85.0
Cast.-La Mancha	61.5	97.3	53.6	21.7	7.1	72.9
Cataluña	70.4	98.1	71.1	36.8	19.8	92.2
Com. Valenciana	72.7	98.4	68.2	26.6	12.1	84.4
Extremadura	60.5	97.7	43.7	20.7	11.7	67.1
Galicia	65.5	91.3	42.7	13.5	14.6	85.9
Madrid	74.0	99.4	76.3	53.9	32.3	95.7
Murcia	69.0	98.7	59.3	19.5	12.1	81.4
Navarra	76.4	99.3	60.6	44.0	20.6	87.4
País Vasco	71.3	98.3	61.6	45.7	23.7	94.3
La Rioja	64.9	98.6	54.4	44.4	17.6	83.4

Cuadro 3.23

library(dplyr)
Datos.NbClust <- Datos_3_3_Caso[, c("Automóvil", "TV_Color", "Video", "Microondas", "Lavavajillas", "Teléfono")]
mean<-colMeans(Datos.NbClust) 
Sx<-cov(Datos.NbClust) 
D2<-mahalanobis(Datos.NbClust,mean,Sx, inverted = FALSE) 
pvalue=pchisq(D2 , df=6, lower.tail=FALSE) 
data0=data.frame(D2,pvalue)
data0=cbind(data0,Datos_3_3_Caso$CC.AA.)
names(data0)=c("D2","pvalue","CC.AA.")
data0=select(data0,CC.AA.,D2,pvalue)
kable_classic(kable(data0))

CC.AA.	D2	pvalue
España	0.1957042	0.9998549
Andalucía	10.5207676	0.1043658
Aragón	1.9097471	0.9278091
Asturias	4.4628800	0.6142959
Balerares	5.7026801	0.4573061
Canarias	9.5791345	0.1435313
Cantabria	7.2910646	0.2947664
Castilla y León	2.2064805	0.8997629
Cast.-La Mancha	3.5390966	0.7387610
Cataluña	2.9473889	0.8154205
Com. Valenciana	2.6469747	0.8516698
Extremadura	10.4333455	0.1075495
Galicia	13.2415547	0.0393564
Madrid	8.3056697	0.2165535
Murcia	4.8751556	0.5599224
Navarra	7.6517494	0.2647430
País Vasco	2.3173225	0.8883259
La Rioja	4.1732837	0.6532394

Figura 3.11

matriz.dis.euclid.caso3= dist(Datos_3_3_Caso[, c("Automóvil", "TV_Color", "Video", "Microondas", "Lavavajillas", "Teléfono")], method="euclidean", diag=TRUE)

hclust.ward.caso3<-hclust(matriz.dis.euclid.caso3,method="ward.D2")
plot (hclust.ward.caso3,labels=Datos_3_3_Caso$CC.AA.)

hclust.average.caso3<-hclust(matriz.dis.euclid.caso3, method="average")
plot (hclust.average.caso3,labels=Datos_3_3_Caso$CC.AA.)

data.frame(hclust.average.caso3[2:1])

##       height merge.1 merge.2
## 1   6.874591      -3     -18
## 2   7.153321      -4      -8
## 3   7.805767      -5     -11
## 4   9.339233      -1       3
## 5   9.387225     -16     -17
## 6  12.380548       1       5
## 7  12.448695      -9     -12
## 8  12.659790      -7       2
## 9  13.331971     -15       4
## 10 14.684330      -6       9
## 11 16.616569     -10       6
## 12 18.923865     -13       8
## 13 20.140313       7      12
## 14 22.109739      10      11
## 15 26.042489      13      14
## 16 31.452385      -2      15
## 17 39.104937     -14      16

hclust.complete.caso3<-hclust(matriz.dis.euclid.caso3,method="complete")
plot (hclust.complete.caso3,labels=Datos_3_3_Caso$CC.AA.)

hclust.single.caso3<-hclust(matriz.dis.euclid.caso3,method="single")
plot (hclust.single.caso3,labels=Datos_3_3_Caso$CC.AA.)

hclust.centroid.caso3<-hclust(matriz.dis.euclid.caso3,method="centroid")
plot (hclust.centroid.caso3,labels=Datos_3_3_Caso$CC.AA.)

Figura 3.12

library(NbClust)
Datos.NbClust <- Datos_3_3_Caso[, c("Automóvil", "TV_Color", "Video", "Microondas", "Lavavajillas", "Teléfono")]

# Ejecutar NbClust para obtener el índice CCC con diferentes métodos
res.average <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15, method = "average", index = "ccc")
res.complete <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15, method = "complete", index = "ccc")
res.ward <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15, method = "ward.D2", index = "ccc")

# Extraer los valores de CCC
clusters <- 2:15
ccc_average <- res.average$All.index
ccc_complete <- res.complete$All.index
ccc_ward <- res.ward$All.index

# Crear un dataframe con los resultados
df <- data.frame(
  cluster = clusters,
  average = ccc_average,
  complete = ccc_complete,
  ward = ccc_ward
)

# Transformar a formato largo
library(tidyr)
df_long <- pivot_longer(df, cols = -cluster, names_to = "método", values_to = "CCC")

# Graficar con ggplot
library(ggplot2)
ggplot(df_long, aes(x = cluster, y = CCC, color = método, linetype = método)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  labs(
    title = "Proceso de determinación del centróide inicial",
    x = "Número de Clusters",
    y = "CCC",
    caption = "Figura 3.12"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.title = element_blank()
  )

Datos.NbClust<-Datos_3_3_Caso[, c("Automóvil", "TV_Color", "Video", "Microondas", "Lavavajillas", "Teléfono")]

res.wardD2<-NbClust(Datos.NbClust, distance = "euclidean" , 
min.nc=2, max.nc=15 , method = "ward.D2" , index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Cuadro 3.24

grupo.ward<-cutree(hclust.ward.caso3, k = 2, h = NULL) 
datos.caso3.grupos<-cbind(Datos_3_3_Caso,grupo.ward) 
datos.caso3.grupos$CC.AA. <-NULL 
round(aggregate(datos.caso3.grupos,list(grupo.ward), mean ),2)->datos.caso3
print(datos.caso3)

##   Group.1 Automóvil TV_Color Video Microondas Lavavajillas Teléfono grupo.ward
## 1       1     66.87    96.82 57.68      25.42        11.81    80.71          1
## 2       2     70.70    98.53 63.47      44.70        22.43    90.23          2

datos_kmeans <- Datos_3_3_Caso[, -1]

# Definir los centroides iniciales
c1 <- c(66.87, 96.82, 56.01, 25.43, 11.81, 80.71)
c2 <- c(70.70, 98.53, 63.47, 44.70, 22.43, 90.23)

set.seed(123)
solucion <- kmeans(datos_kmeans, centers = rbind(c1, c2)) |> print( )

## K-means clustering with 2 clusters of sizes 12, 6
## 
## Cluster means:
##   Automóvil TV_Color    Video Microondas Lavavajillas Teléfono
## 1  66.86667 96.81667 57.67500     25.425     11.80833 80.70833
## 2  70.70000 98.53333 63.46667     44.700     22.43333 90.23333
## 
## Clustering vector:
##  [1] 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 2810.6467  848.3533
##  (between_SS / total_SS =  40.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Figura 3.13

hclust.ward.caso3<-hclust(matriz.dis.euclid.caso3,method="ward.D2")
plot (hclust.ward.caso3,labels=Datos_3_3_Caso$CC.AA.)
groups <- cutree(hclust.ward.caso3, k = 2)
rect.hclust(hclust.ward.caso3, k=2, border="red")

hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")
plot(hclust.average.caso3, labels = Datos_3_3_Caso$CC.AA.)
groups <- cutree(hclust.average.caso3, k = 2)
rect.hclust(hclust.average.caso3, k = 2, border = "red")

hclust.complete.caso3<-hclust(matriz.dis.euclid.caso3,method="complete")
plot (hclust.complete.caso3,labels=Datos_3_3_Caso$CC.AA.)
groups <- cutree(hclust.complete.caso3, k = 2)
rect.hclust(hclust.complete.caso3, k=2, border="red")

Cuadro 3.25

library(psych)
DatosCaso3.1b = Datos.NbClust
#efectuamos eL cLuster con metodo centroide 
kmeans.caso3.1<-kmeans(DatosCaso3.1b, 2) 
#obtenemos Las medias 
aggregate(DatosCaso3.1b,by= 
list(kmeans.caso3.1$cluster),FUN=mean)

##   Group.1 Automóvil TV_Color    Video Microondas Lavavajillas Teléfono
## 1       1  66.86667 96.81667 57.67500     25.425     11.80833 80.70833
## 2       2  70.70000 98.53333 63.46667     44.700     22.43333 90.23333

#adicionamos La pertenencia aL cLuster
DatosCaso3.1b=data.frame(DatosCaso3.1b, kmeans.caso3.1$cluster)  
kmeans.caso3.1.cluster=kmeans.caso3.1$cluster

t.test(Automóvil ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b)

## 
##  Welch Two Sample t-test
## 
## data:  Automóvil by kmeans.caso3.1.cluster
## t = -1.8106, df = 10.091, p-value = 0.1
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -8.5449256  0.8782589
## sample estimates:
## mean in group 1 mean in group 2 
##        66.86667        70.70000

Cuadro 3.26

t.test(Automóvil ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b) -> t1
t.test(TV_Color ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b) -> t2
t.test(Video  ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b)-> t3
t.test(Microondas ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b)-> t4
t.test(Lavavajillas ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b)-> t5
t.test(Teléfono ~ kmeans.caso3.1.cluster,data = DatosCaso3.1b)-> t6

pruebas.t=as.data.frame(c(t1$statistic,t2$statistic,t3$statistic,t4$statistic,t5$statistic,t6$statistic)) 

datos33<-as.data.frame( datos.caso3)
datos33$grupo.ward <- NULL 
datos33$Group.1 <- NULL
datos33=t(datos33) 
names(pruebas.t)=c("pruebas t")
tados33=cbind(datos33,pruebas.t) 
names(tados33)=c("grupo 1","grupo 2","pruebas t")


add_footnote(kable_classic(kable(head(tados33),caption = " Significatividad de las diferencias entre los perfiles de los conglo merados "),html_font = "Arial",font_size=14 ),label = "**p < o, 01 ;* p < o, 05 ", notation ="symbol")

Significatividad de las diferencias entre los perfiles de los conglo merados
	grupo 1	grupo 2	pruebas t
Automóvil	66.87	70.70	-1.810598
TV_Color	96.82	98.53	-2.515162
Video	57.68	63.47	-1.188830
Microondas	25.42	44.70	-6.734076
Lavavajillas	11.81	22.43	-4.605368
Teléfono	80.71	90.23	-3.507692
^* *p < o, 01 ; p < o, 05

library(cluster)
library(fpc)

aggregate( Datos.NbClust,by= list(kmeans.caso3.1$cluster),FUN=mean)

##   Group.1 Automóvil TV_Color    Video Microondas Lavavajillas Teléfono
## 1       1  66.86667 96.81667 57.67500     25.425     11.80833 80.70833
## 2       2  70.70000 98.53333 63.46667     44.700     22.43333 90.23333

mydata <- data.frame( Datos.NbClust,kmeans.caso3.1$cluster)
fit <- kmeans(mydata, 2)
library(cluster)

clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE,
   labels=2, lines=0)

Figura 3.14

library(ggrepel)

datos_numericos <- Datos_3_3_Caso[, -1]

pca <- prcomp(datos_numericos, scale = TRUE)
pca_df <- as.data.frame(pca$x)
pca_df$Grupo <- as.factor(groups)
pca_data <- data.frame(
  Componente1 = pca$x[, 1],
  Componente2 = pca$x[, 2],
  CC.AA. = Datos_3_3_Caso$CC.AA.
)

destacadas <- c("Madrid", "Cataluña", "Navarra", "País Vasco", "La Rioja", "Aragón")

ggplot(pca_data, aes(x = Componente1, y = Componente2, label = CC.AA.)) +
  geom_point(color = "black", size = 3) +
  geom_text_repel(aes(color = CC.AA. %in% destacadas), size = 3) +
  geom_point(data = subset(pca_data, CC.AA. %in% destacadas),
             aes(x = Componente1, y = Componente2), color = "red", shape = 17, size = 4) +
  scale_color_manual(values = c("black", "red"), guide = "none") +
  theme_minimal() +
  labs(
    x = "Componente Principal 1",
    y = "Componente Principal 2",
    title = "Análisis de Componentes Principales (PCA)"
  )