A33: Aplicación: Análisis de Clúster (Conglomerados)

UNIVERSIDAD DE EL SALVADOR

FACULTAD DE CIENCIAS ECONÓMICAS

ESCUELA DE ECONOMÍA

MÉTODOS PARA EL ÁNALISIS ECONÓMICO

TEMA:

“Aplicación: Análisis de Clúster (Conglomerados)”

DOCENTE:

MSF. Carlos Ademir Pérez Alas.

Grupo de Trabajo

Grupo 06

Integrantes	Carnet	Participación
Martinez Alfaro Kelly Jeannette	MP21084	100%
Méndez Pacheco Darleen Ivette	MP21084	100%
Zarpate Crissia Margareth Villalta	MB22006	100%

CIUDAD UNIVERSITARIA, VIERNES 10 DE ENERO DE 2025

options(scipen = 999999)
knitr::opts_chunk$set(warning=FALSE,echo=TRUE,message=FALSE,eval=TRUE)

3.7. Un ejemplo de la aplicación del análisis de conglomerados

Datos_3_3_Caso <- data.frame(
    CC.AA. = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", "Cantabria", "Castilla y León", "Castilla-La Mancha", "Cataluña", "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja"),
    automovil = c(69.0, 66.7, 67.2, 63.7, 71.6, 72.7, 63.4, 65.8, 61.5, 70.4, 72.7, 60.0, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
    tvcolor = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 98.4, 91.8, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
    video = c(62.4, 82.7, 56.8, 52.5, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
    microond = c(32.3, 24.1, 43.4, 21.4, 29.8, 47.9, 36.5, 28.1, 21.7, 36.8, 26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
    lavavaji = c(17.0, 12.7, 20.6, 13.3, 14.4, 5.80, 11.2, 14.0, 7.10, 19.8, 12.1, 11.1, 4.6, 32.3, 12.1, 20.6, 23.7, 17.6),
    telefono = c(85.2, 74.7, 88.4, 86.4, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
    )

matriz_datos <- as.matrix(Datos_3_3_Caso[, c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono")])
matriz.dis.euclid.caso3 <- dist(matriz_datos, method = "euclidean")

datos.caso3.grupos.kmeans <- kmeans(Datos_3_3_Caso[, c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono")], 2)

Datos_3_3_Caso$kmeans.caso3.cluster <- datos.caso3.grupos.kmeans$cluster

fit.automovil <- aov(automovil ~ kmeans.caso3.cluster, data = Datos_3_3_Caso)

fit.tvcolor <- aov(tvcolor ~ kmeans.caso3.cluster, data = Datos_3_3_Caso)

fit.video <- aov(video ~ kmeans.caso3.cluster,data = Datos_3_3_Caso)

fit.microond <- aov(microond ~ kmeans.caso3.cluster, data = Datos_3_3_Caso)

fit.lavavaji <- aov(lavavaji ~ kmeans.caso3.cluster, data = Datos_3_3_Caso)

fit.telefono <- aov(telefono ~ kmeans.caso3.cluster, data = Datos_3_3_Caso)

# Mostrar resultados del ANOVA

summary(fit.automovil)

##                      Df Sum Sq Mean Sq F value   Pr(>F)    
## kmeans.caso3.cluster  1  180.7  180.66   16.96 0.000805 ***
## Residuals            16  170.4   10.65                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(fit.tvcolor)

##                      Df Sum Sq Mean Sq F value  Pr(>F)   
## kmeans.caso3.cluster  1  40.29   40.29   12.12 0.00308 **
## Residuals            16  53.18    3.32                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(fit.video)

##                      Df Sum Sq Mean Sq F value   Pr(>F)    
## kmeans.caso3.cluster  1 1112.8  1112.8   19.35 0.000448 ***
## Residuals            16  919.9    57.5                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(fit.microond)

##                      Df Sum Sq Mean Sq F value  Pr(>F)   
## kmeans.caso3.cluster  1   1086  1086.1    13.9 0.00183 **
## Residuals            16   1250    78.1                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(fit.lavavaji)

##                      Df Sum Sq Mean Sq F value Pr(>F)  
## kmeans.caso3.cluster  1  233.4  233.43   6.799  0.019 *
## Residuals            16  549.3   34.33                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(fit.telefono)

##                      Df Sum Sq Mean Sq F value Pr(>F)  
## kmeans.caso3.cluster  1  174.5  174.51   3.572  0.077 .
## Residuals            16  781.7   48.85                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Datos.NbClust <- as.matrix(Datos_3_3_Caso[, c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono")])

hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3,
method = "average")
data.frame(hclust.average.caso3[2:1])

##       height merge.1 merge.2
## 1   5.338539      -1      -5
## 2   6.874591      -3     -18
## 3   8.854377      -4      -8
## 4   9.088165     -11       1
## 5   9.387225     -16     -17
## 6  12.380548       2       5
## 7  13.111610     -15       3
## 8  13.459198      -9     -12
## 9  13.692195     -10       4
## 10 16.116750      -7       7
## 11 18.315426       6       9
## 12 19.176262       8      10
## 13 20.682296     -13      12
## 14 23.782645      -6      11
## 15 27.887869      13      14
## 16 32.206064      -2      15
## 17 38.876263     -14      16

Datos.NbClust <- Datos_3_3_Caso[,c ("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono")]

library(NbClust)
res.wardD2 <- NbClust(Datos.NbClust,
                distance = "euclidean",
                min.nc = 2,
                max.nc = 15,
                method = "ward.D2",
                index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 10 proposed 2 as the best number of clusters 
## * 3 proposed 3 as the best number of clusters 
## * 3 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 7 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************

Datos_3_3_Caso<- data.frame(matrix(rnorm(18*20), ncol=20))
matriz.dis.euclid <- dist(Datos_3_3_Caso, method="euclidean")
hclust.ward.caso3 <- hclust(matriz.dis.euclid, method="ward.D2")
grupo.ward <- cutree(hclust.ward.caso3, k = 2)

grupo.ward <- cutree(hclust.ward.caso3, k = 2, h = NULL)
datos.caso3.grupos <- cbind(Datos_3_3_Caso, grupo.ward)
datos.caso3.grupos$id <- NULL

round(aggregate(datos.caso3.grupos, list(grupo.ward), mean),2)

##   Group.1   X1    X2   X3    X4    X5   X6    X7    X8    X9  X10   X11   X12
## 1       1 0.08  0.02 0.13 -0.28  0.48 0.01 -0.13 -0.62 -0.24 0.18  0.29  0.05
## 2       2 0.31 -0.11 0.98  0.57 -0.84 0.41  0.15  0.05  1.27 0.47 -0.56 -0.87
##     X13   X14   X15  X16   X17   X18  X19   X20 grupo.ward
## 1  0.05 -0.20  0.34 0.08  0.42 -0.11 0.05  0.07          1
## 2 -0.50  0.57 -1.26 0.49 -0.20  0.32 0.50 -0.66          2

c1 <- c(66.87, 96.82, 56.01, 25.43, 11.81, 80.71)
c2 <- c(70.70, 98.53, 63.47, 44.70, 22.43, 90.23)