A33 - Application: Cluster Analysis [Group Work]
| MEMBERS | STUDENT ID |
|---|---|
| Sandra Maribel Aparicio Fuentes | AF22025 |
| Nubia Linette Beltrán Hernández | BH21010 |
| Génesis Melissa Siguenza Rivas | SR22041 |
Data
library(kableExtra)
library(readxl)
# Read the dataset, declaring the first column (region name) as text and the rest as numeric
A33 <- read_excel("C:/Users/MINEDUCYT/Downloads/A33/A33.xlsx",
                  col_types = c("text", "numeric", "numeric", "numeric",
                                "numeric", "numeric", "numeric"))
A33 %>% kable(caption = "DATA", align = "c", digits = 2) %>%
  kable_classic(html_font = "Times New Roman", font_size = 14) %>%
  kable_styling()

| CC.AA. | Automovil | TVcolor | Video | Microondas | Lavavajillas | Telefono |
|---|---|---|---|---|---|---|
| España | 69.0 | 97.6 | 62.4 | 32.3 | 17.0 | 85.2 |
| Andalucía | 66.7 | 98.0 | 82.7 | 24.1 | 12.7 | 74.7 |
| Aragón | 67.2 | 97.5 | 56.8 | 43.4 | 20.6 | 88.4 |
| Asturias | 63.7 | 95.2 | 52.1 | 24.4 | 13.3 | 88.1 |
| Baleares | 71.9 | 98.8 | 62.4 | 29.8 | 10.1 | 87.9 |
| Canarias | 72.7 | 96.8 | 68.4 | 27.9 | 5.8 | 75.4 |
| Cantabria | 63.4 | 94.9 | 48.9 | 36.5 | 11.2 | 80.5 |
| Castilla y León | 65.8 | 97.1 | 47.7 | 28.1 | 14.0 | 85.0 |
| Cast.-La Mancha | 61.5 | 97.3 | 53.6 | 21.7 | 7.1 | 72.9 |
| Cataluña | 70.4 | 98.1 | 71.1 | 36.8 | 19.8 | 92.2 |
| Com. Valenciana | 72.7 | 98.4 | 68.2 | 26.6 | 12.1 | 84.4 |
| Extremadura | 60.5 | 97.7 | 43.7 | 20.7 | 11.7 | 67.1 |
| Galicia | 65.5 | 91.3 | 42.7 | 13.5 | 14.6 | 85.9 |
| Madrid | 74.0 | 99.4 | 76.3 | 53.9 | 32.3 | 95.7 |
| Murcia | 69.0 | 98.7 | 59.3 | 19.5 | 12.1 | 81.4 |
| Navarra | 76.4 | 99.3 | 60.6 | 44.0 | 20.6 | 87.4 |
| País Vasco | 71.3 | 98.3 | 61.6 | 45.7 | 23.7 | 94.3 |
| La Rioja | 64.9 | 98.6 | 54.4 | 44.4 | 17.6 | 83.4 |
Outlier detection
We analyze the presence of outliers, since they can seriously distort the detection of the number of groups. Each observation's squared Mahalanobis distance D2 to the vector of column means is compared against a chi-squared distribution with degrees of freedom equal to the number of variables (6); small p-values flag potential outliers.
# Numeric matrix (drop the region name column)
datas <- data.matrix(A33[, -1])
# Column means and covariance matrix
medias <- colMeans(datas)
Sx <- cov(datas)
# Squared Mahalanobis distance of each region to the centroid
D2 <- mahalanobis(datas, medias, Sx)
# Upper-tail chi-squared p-values, df = number of variables
p_value <- pchisq(D2, df = ncol(datas), lower.tail = FALSE)
tabla_mahalanobis <- data.frame(
CC.AA. = A33$CC.AA.,
D2 = D2,
p_value = p_value
)
tabla_mahalanobis %>% kable(caption = "D2 AND P-VALUE", align = "c", digits = 2) %>%
  kable_classic(html_font = "Times New Roman", font_size = 14) %>%
  kable_styling()

| CC.AA. | D2 | p_value |
|---|---|---|
| España | 0.20 | 1.00 |
| Andalucía | 10.52 | 0.10 |
| Aragón | 1.91 | 0.93 |
| Asturias | 4.46 | 0.61 |
| Baleares | 5.70 | 0.46 |
| Canarias | 9.58 | 0.14 |
| Cantabria | 7.29 | 0.29 |
| Castilla y León | 2.21 | 0.90 |
| Cast.-La Mancha | 3.54 | 0.74 |
| Cataluña | 2.95 | 0.82 |
| Com. Valenciana | 2.65 | 0.85 |
| Extremadura | 10.43 | 0.11 |
| Galicia | 13.24 | 0.04 |
| Madrid | 8.31 | 0.22 |
| Murcia | 4.88 | 0.56 |
| Navarra | 7.65 | 0.26 |
| País Vasco | 2.32 | 0.89 |
| La Rioja | 4.17 | 0.65 |
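As a quick filter, the regions whose p-value falls below the conventional 0.05 threshold can be listed; a minimal sketch (the 0.05 cut-off is our choice):
# List potential outliers at the 5% level
subset(tabla_mahalanobis, p_value < 0.05)
Only Galicia falls below 0.05, and only marginally, so all 18 observations are retained.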
Hierarchical cluster analysis
We perform a hierarchical cluster analysis, evaluating the solutions of several agglomeration methods, applying the stated criteria to identify the appropriate number of groups, and obtaining the centroids that will serve as the starting point for the next step.
# Euclidean distance matrix over the six equipment variables
matriz.dis.euclid.caso3 <- dist(A33[, c("Automovil", "TVcolor", "Video", "Microondas", "Lavavajillas", "Telefono")], method = "euclidean", diag = TRUE)
# Average linkage
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")
data.frame(hclust.average.caso3[2:1])
## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 9.339233 -1 3
## 5 9.387225 -16 -17
## 6 12.380548 1 5
## 7 12.448695 -9 -12
## 8 12.659790 -7 2
## 9 13.331971 -15 4
## 10 14.684330 -6 9
## 11 16.616569 -10 6
## 12 18.923865 -13 8
## 13 20.140313 7 12
## 14 22.109739 10 11
## 15 26.042489 13 14
## 16 31.452385 -2 15
## 17 39.104937 -14 16
# Ward's method
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "ward.D2")
data.frame(hclust.ward.caso3[2:1])
## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 9.387225 -16 -17
## 5 9.853764 -1 3
## 6 12.448695 -9 -12
## 7 14.238563 -7 2
## 8 15.114011 -15 5
## 9 15.686087 -10 4
## 10 16.592408 -6 8
## 11 19.584858 1 9
## 12 22.402976 -13 7
## 13 27.124565 -2 10
## 14 27.214089 6 12
## 15 30.523870 -14 11
## 16 50.718899 13 14
## 17 70.785686 15 16
# Single linkage
hclust.single.caso3 <- hclust(matriz.dis.euclid.caso3, method = "single")
data.frame(hclust.single.caso3[2:1])
## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 8.426150 -1 3
## 5 9.387225 -16 -17
## 6 9.497368 1 5
## 7 10.520932 -7 2
## 8 11.179445 -6 4
## 9 12.345039 -15 8
## 10 12.448695 -9 -12
## 11 12.449498 -10 9
## 12 12.568612 6 7
## 13 12.748333 11 12
## 14 13.884884 10 13
## 15 15.236469 -13 14
## 16 17.449069 -2 15
## 17 19.176809 -14 16
# Complete linkage
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3, method = "complete")
data.frame(hclust.complete.caso3[2:1])
## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 9.387225 -16 -17
## 5 10.252317 -1 3
## 6 12.448695 -9 -12
## 7 14.598288 -15 5
## 8 14.798649 -7 2
## 9 14.940214 -10 4
## 10 17.063704 -6 7
## 11 21.199764 1 9
## 12 21.923731 -13 6
## 13 25.010598 8 12
## 14 25.558951 -2 10
## 15 31.948239 -14 11
## 16 43.448130 13 14
## 17 59.937134 15 16
# Centroid method
hclust.centroid.caso3 <- hclust(matriz.dis.euclid.caso3, method = "centroid")
data.frame(hclust.centroid.caso3[2:1])
## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 7.387792 -1 3
## 5 9.387225 -16 -17
## 6 8.315094 1 5
## 7 10.389278 -15 4
## 8 10.529321 -6 7
## 9 10.871460 -7 2
## 10 10.543548 8 9
## 11 11.542734 6 10
## 12 11.284691 -10 11
## 13 12.448695 -9 -12
## 14 16.023631 12 13
## 15 18.607642 -13 14
## 16 21.157165 -2 15
## 17 28.243987 -14 16
The dendrograms show that the methods most similar to one another are Ward, complete, and average linkage.
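This visual impression can be quantified by correlating the cophenetic distance matrices of the trees; a minimal sketch we add here (the helper object coph is ours, not part of the assignment):
# Pairwise correlations between the cophenetic distances of each tree;
# high values mean two methods induce similar hierarchies
coph <- sapply(list(average  = hclust.average.caso3,
                    ward     = hclust.ward.caso3,
                    single   = hclust.single.caso3,
                    complete = hclust.complete.caso3,
                    centroid = hclust.centroid.caso3),
               function(h) as.vector(cophenetic(h)))
round(cor(coph), 2)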
Ward
library(NbClust)
Datos.NbClust <- A33[, c("Automovil", "TVcolor", "Video", "Microondas", "Lavavajillas", "Telefono")]
res.wardD2 <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15, method = "ward.D2", index = "alllong")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
Complete
res.complete <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15, method = "complete", index = "alllong")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
Average
res.average <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15, method = "average", index = "alllong")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 5 proposed 4 as the best number of clusters
## * 3 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 15
##
##
## *******************************************************************
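The majority-rule counts above can also be extracted programmatically from each NbClust result; a small sketch, assuming the Best.nc component (whose first row holds each index's proposed number of clusters):
# Tally the votes of the indices for each linkage method
table(res.wardD2$Best.nc[1, ])
table(res.complete$Best.nc[1, ])
table(res.average$Best.nc[1, ])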
2 clusters
Centroids
The centroid is the mean of the six variables analyzed in each of the groups obtained.
library(dplyr)
library(kableExtra)
# Cut the Ward tree into 2 groups and attach the membership
grupo.ward <- cutree(hclust.ward.caso3, k = 2, h = NULL)
datos.caso3.grupos <- cbind(A33, grupo.ward)
datos.caso3.grupos$CC.AA. <- NULL
# Group means (centroids), rounded to 2 decimals
datos.caso3 <- round(aggregate(datos.caso3.grupos, list(grupo.ward), mean), 2)
head(datos.caso3) %>% kable(caption = "Groups", align = "c", digits = 2) %>%
  kable_classic(html_font = "Times New Roman", font_size = 14) %>%
  kable_styling()

| Group.1 | Automovil | TVcolor | Video | Microondas | Lavavajillas | Telefono | grupo.ward |
|---|---|---|---|---|---|---|---|
| 1 | 66.87 | 96.82 | 57.68 | 25.42 | 11.81 | 80.71 | 1 |
| 2 | 70.70 | 98.53 | 63.47 | 44.70 | 22.43 | 90.23 | 2 |
Non-hierarchical cluster analysis using the k-means method.
We perform a non-hierarchical cluster analysis with the k-means method to obtain a solution that is optimal in terms of within-segment homogeneity and between-segment heterogeneity.
The non-hierarchical analysis is estimated taking the previous centroids as the initial seeds.
library(stats)
# Initial seeds: the 2-group centroids from the hierarchical (Ward) solution
c1 <- c(66.87, 96.82, 57.68, 25.42, 11.81, 80.71)
c2 <- c(70.70, 98.53, 63.47, 44.70, 22.43, 90.23)
solucion <- kmeans(datas, rbind(c1, c2))
solucion
## K-means clustering with 2 clusters of sizes 12, 6
##
## Cluster means:
## Automovil TVcolor Video Microondas Lavavajillas Telefono
## 1 66.86667 96.81667 57.67500 25.425 11.80833 80.70833
## 2 70.70000 98.53333 63.46667 44.700 22.43333 90.23333
##
## Clustering vector:
## [1] 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 2810.6467 848.3533
## (between_SS / total_SS = 40.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
The hierarchical and non-hierarchical solutions coincide.
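This agreement can be verified with a cross-tabulation of the two membership vectors (a small check we add; it is not part of the original output):
# Rows: Ward 2-group cut; columns: k-means clusters.
# A perfect match concentrates all counts on one diagonal.
table(ward = grupo.ward, kmeans = solucion$cluster)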
Dendrograms
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "ward.D2")
plot(hclust.ward.caso3, labels = A33$CC.AA.)
# Highlight the 2-group partition on the tree
grupos <- cutree(hclust.ward.caso3, k = 2)
rect.hclust(hclust.ward.caso3, k = 2, border = "red")
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3, method = "complete")
plot(hclust.complete.caso3, labels = A33$CC.AA.)
# Highlight the 2-group partition on the tree
grupos <- cutree(hclust.complete.caso3, k = 2)
rect.hclust(hclust.complete.caso3, k = 2, border = "red")
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")
plot(hclust.average.caso3, labels = A33$CC.AA.)
# Highlight the 2-group partition on the tree
grupos <- cutree(hclust.average.caso3, k = 2)
rect.hclust(hclust.average.caso3, k = 2, border = "red")
t-test for each independent variable
To identify which variables differ significantly between the groups.
library(dplyr)
library(kableExtra)
solucion.cluster <- solucion$cluster
# t.test's formula interface requires a data frame, not a matrix
datas.df <- data.frame(datas)
t1 <- t.test(Automovil ~ solucion.cluster, data = datas.df)
t2 <- t.test(TVcolor ~ solucion.cluster, data = datas.df)
t3 <- t.test(Video ~ solucion.cluster, data = datas.df)
t4 <- t.test(Microondas ~ solucion.cluster, data = datas.df)
t5 <- t.test(Lavavajillas ~ solucion.cluster, data = datas.df)
t6 <- t.test(Telefono ~ solucion.cluster, data = datas.df)
# Collect the t statistics and group means in one table
resultados.ttest <- data.frame(
  Variable = c("Automovil", "TVcolor", "Video", "Microondas", "Lavavajillas", "Telefono"),
  t.statistic = c(t1$statistic, t2$statistic, t3$statistic, t4$statistic, t5$statistic, t6$statistic),
  Grupo1 = c(t1$estimate[1], t2$estimate[1], t3$estimate[1], t4$estimate[1], t5$estimate[1], t6$estimate[1]),
  Grupo2 = c(t1$estimate[2], t2$estimate[2], t3$estimate[2], t4$estimate[2], t5$estimate[2], t6$estimate[2])
)
head(resultados.ttest) %>% kable(caption = "Significance of the differences between cluster profiles", align = "c", digits = 2) %>%
  kable_classic(html_font = "Times New Roman", font_size = 14) %>%
  kable_styling()

| Variable | t.statistic | Grupo1 | Grupo2 |
|---|---|---|---|
| Automovil | -1.81 | 66.87 | 70.70 |
| TVcolor | -2.52 | 96.82 | 98.53 |
| Video | -1.19 | 57.68 | 63.47 |
| Microondas | -6.73 | 25.42 | 44.70 |
| Lavavajillas | -4.61 | 11.81 | 22.43 |
| Telefono | -3.51 | 80.71 | 90.23 |
For Automovil, group 1 shows 66.87% ownership versus 70.70% in group 2, but the difference is not significant (t = -1.81, p > 0.05). The remaining variables are read in the same way.
We conclude that group 2 corresponds to autonomous communities whose household equipment levels are significantly higher than those of group 1.
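To make readings such as p > 0.05 explicit, the p-values can be pulled directly from the stored test objects; a minimal sketch (the values themselves are not reproduced here):
# Welch t-test p-values, in the same order as the table above
sapply(list(t1, t2, t3, t4, t5, t6), function(tt) tt$p.value)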
Visualization
library(cluster)
library(factoextra)
mydata <- data.frame(datas, solucion$cluster)
# Rename the cluster column
colnames(mydata)[ncol(mydata)] <- "Cluster"
# Plot the clusters on the first two principal components
clusplot(mydata[, 1:(ncol(mydata) - 1)],
         mydata$Cluster,
         color = TRUE,
         shade = TRUE,
         labels = 2,
         lines = 0,
         main = "Clusters K-means",
         xlab = "Component 1",
         ylab = "Component 2")
# Improved plot with factoextra
fviz_cluster(solucion, data = datas,
palette = c("#E24229", "#29ABE2"),
geom = c("point", "text"),
ggtheme = theme_bw())
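As an additional quality check, not requested in the assignment, the silhouette of the final partition can be inspected with cluster::silhouette and factoextra::fviz_silhouette; a minimal sketch:
# Silhouette widths of the k-means partition over the Euclidean distances
sil <- silhouette(solucion$cluster, dist(datas))
fviz_silhouette(sil)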