Ejemplo 3.7 Aplicación de Análisis Conglomerados

Caso: Diseño de un plan de incentivos de vendedores

El ejercicio aborda una decisión estratégica para el director de ventas de una cadena de tiendas de electrodomésticos con cobertura nacional. La necesidad surge de la dificultad de establecer un plan de incentivos equitativo; se requiere que los incentivos sean más altos en aquellas zonas geográficas donde las condiciones de vida de los habitantes hacen más difícil las ventas.

El propósito es determinar si las Comunidades Autónomas (CC.AA.) pueden segmentarse en grupos homogéneos con base en seis indicadores de equipamiento de los hogares (Automóvil, TV color, Vídeo, Microondas, Lavavajillas y Teléfono).

Datos_3_3_Caso <- data.frame(
  CCAA = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", 
           "Cantabria", "Castilla y León", "Castilla-La Mancha", "Cataluña",
           "Comv. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia",
           "Navarra", "País Vasco", "La Rioja"),
 Automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 65.1, 
               70.4, 72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
 TV_color = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 
              98.1, 98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
 Video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 
           71.1, 68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
           Microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 
                          36.8, 26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
 Lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.80, 11.2, 14.0, 7.10, 
                  19.8, 12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
 Telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 
              92.2, 84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
)
print(Datos_3_3_Caso )
##                  CCAA Automovil TV_color Video Microondas Lavavajillas Telefono
## 1              España      69.0     97.6  62.4       32.3         17.0     85.2
## 2           Andalucía      66.7     98.0  82.7       24.1         12.7     74.7
## 3              Aragón      67.2     97.5  56.8       43.4         20.6     88.4
## 4            Asturias      63.7     95.2  52.1       24.4         13.3     88.1
## 5            Baleares      71.9     98.8  62.4       29.8         10.1     87.9
## 6            Canarias      72.7     96.8  68.4       27.9          5.8     75.4
## 7           Cantabria      63.4     94.9  48.9       36.5         11.2     80.5
## 8     Castilla y León      65.8     97.1  47.7       28.1         14.0     85.0
## 9  Castilla-La Mancha      65.1     97.3  53.6       21.7          7.1     72.9
## 10           Cataluña      70.4     98.1  71.1       36.8         19.8     92.2
## 11   Comv. Valenciana      72.7     98.4  68.2       26.6         12.1     84.4
## 12        Extremadura      60.5     97.7  43.7       20.7         11.7     67.1
## 13            Galicia      65.5     91.3  42.7       13.5         14.6     85.9
## 14             Madrid      74.0     99.4  76.3       53.9         32.3     95.7
## 15             Murcia      69.0     98.7  59.3       19.5         12.1     81.4
## 16            Navarra      76.4     99.3  60.6       44.0         20.6     87.4
## 17         País Vasco      71.3     98.3  61.6       45.7         23.7     94.3
## 18           La Rioja      64.9     98.6  54.4       44.4         17.6     83.4

Resultados de la detencción de outliers

Datos_num <- Datos_3_3_Caso[, -1] # quita CCAA
# Cálculo de Mahalanobis
mean_vec <- colMeans(Datos_num)
Sx <- cov(Datos_num)
D2 <- mahalanobis(Datos_num, mean_vec, Sx)

# p-values 
p_vals <- pchisq(D2, df = 6, lower.tail = FALSE)

# Valor crítico al 99%
critico <- qchisq(.99, df = 6)

Resultados <- data.frame(
  CCAA = Datos_3_3_Caso$CCAA,
  D2_Mahalanobis = round(D2, 3),
  p_value = round(p_vals, 5)
)

print(Resultados)
##                  CCAA D2_Mahalanobis p_value
## 1              España          0.195 0.99986
## 2           Andalucía         10.912 0.09115
## 3              Aragón          1.938 0.92528
## 4            Asturias          4.707 0.58184
## 5            Baleares          5.702 0.45738
## 6            Canarias          9.151 0.16529
## 7           Cantabria          7.448 0.28143
## 8     Castilla y León          2.281 0.89210
## 9  Castilla-La Mancha          2.407 0.87873
## 10           Cataluña          2.898 0.82153
## 11   Comv. Valenciana          2.572 0.86035
## 12        Extremadura         10.398 0.10888
## 13            Galicia         13.396 0.03715
## 14             Madrid          8.385 0.21121
## 15             Murcia          4.820 0.56714
## 16            Navarra          7.980 0.23959
## 17         País Vasco          2.313 0.88874
## 18           La Rioja          4.497 0.60976
print(paste("Valor crítico 99% =", round(critico, 3)))
## [1] "Valor crítico 99% = 16.812"

Análisis de Conglomerados Jerárquico

Matriz de Distancias

matriz_datos <- as.matrix(Datos_3_3_Caso[, c("Automovil", "TV_color", "Video", "Microondas", "Lavavajillas", "Telefono")])
matriz.dis.euclid.caso3 <- dist(matriz_datos, method = "euclidean")
#print(matriz.dis.euclid.caso3)

Dondogramas con diferentes Metodos

# Métodos de aglomeración 
hc_single   <- hclust(matriz.dis.euclid.caso3, method = "single")
hc_complete <- hclust(matriz.dis.euclid.caso3, method = "complete")
hc_average  <- hclust(matriz.dis.euclid.caso3, method = "average")
hc_centroid <- hclust(matriz.dis.euclid.caso3, method = "centroid")
hc_ward     <- hclust(matriz.dis.euclid.caso3, method = "ward.D2")

par(mfrow = c(5, 1))
par(mai = c(0.7, 0.7, 1.0, 0.4))   
par(oma = c(2, 3, 2, 2))          

# 1) Single
plot(hc_single,
     main = "Método Single (Enlace Simple)",
     xlab = "", ylab = "Height")

# 2) Complete
plot(hc_complete,
     main = "Método Complete (Enlace Completo)",
     xlab = "", ylab = "Height")

# 3) Average
plot(hc_average,
     main = "Método Average (UPGMA)",
     xlab = "", ylab = "Height")

# 4) Centroid
plot(hc_centroid,
     main = "Método Centroid (UPGMC)",
     xlab = "", ylab = "Height")

# 5) Ward.D2
plot(hc_ward,
     main = "Método Ward.D2 (Min. Varianza)",
     xlab = "", ylab = "Height")

mtext("Figuras - Dendrogramas mediante distintos métodos de conglomeración",
      side = 2, line = -2, outer = TRUE, cex = 1.3)

Método Average

hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")
data.frame(hclust.average.caso3[2:1])

Aplicando NbClust de CC.AA.

library(NbClust)
Datos.NbClust <- Datos_3_3_Caso[, c("Automovil", "TV_color", "Video", "Microondas", "Lavavajillas", "Telefono")]
# Aplicado el procedimiento NbClust (para el método Ward.D2)
res.wardD2 <- NbClust(Datos.NbClust, 
                      distance = "euclidean",
                      min.nc = 2, 
                      max.nc = 15, 
                      method = "ward.D2", 
                      index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 8 proposed 3 as the best number of clusters 
## * 3 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
print(res.wardD2$Best.nc)
##                     KL      CH Hartigan     CCC    Scott      Marriot   TrCovW
## Number_clusters 3.0000 15.0000   3.0000  2.0000  13.0000 4.000000e+00      3.0
## Value_Index     2.8855 16.2678   4.7124 13.7087 444.6677 1.651845e+14 409368.2
##                  TraceW     Friedman    Rubin Cindex      DB Silhouette   Duda
## Number_clusters   3.000 1.500000e+01   3.0000 2.0000 15.0000    15.0000 2.0000
## Value_Index     779.731 4.687273e+16 -19.9402 0.3675  0.2906     0.7667 0.5526
##                 PseudoT2   Beale Ratkowsky     Ball PtBiserial     Gap Frey
## Number_clusters   2.0000  3.0000    3.0000    3.000     4.0000  2.0000    1
## Value_Index       8.0971 -2.3568    0.4182 1020.568     0.5129 -0.6365   NA
##                 McClain Gamma Gplus     Tau    Dunn Hubert SDindex Dindex
## Number_clusters   2.000    15    15  4.0000 15.0000      0  5.0000      0
## Value_Index       0.606     1     0 21.8431  1.0795      0  0.1634      0
##                    SDbw
## Number_clusters 15.0000
## Value_Index      0.0087

Centroides

grupo.ward <- cutree(hc_ward, k = 2)
datos.caso3.grupos <- cbind(Datos_3_3_Caso, grupo.ward)
datos.caso3.grupos
round(aggregate(datos.caso3.grupos,list(grupo.ward), mean ),2)->datos.caso3

Análisis de conglomerados no jerárquico mediante el método de k-medias

c1 <- c(66.87,96.82,56.01,25.43,11.81,80.71) 
c2 <- c(70.70,98.53,63.47,44.70,22.43,90.23) 
centros_iniciales <- rbind(c1, c2)
solucion <- kmeans(Datos_num, centers = centros_iniciales)

solucion
## K-means clustering with 2 clusters of sizes 12, 6
## 
## Cluster means:
##   Automovil TV_color    Video Microondas Lavavajillas Telefono
## 1  67.16667 96.81667 57.67500     25.425     11.80833 80.70833
## 2  70.70000 98.53333 63.46667     44.700     22.43333 90.23333
## 
## Clustering vector:
##  [1] 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 2783.8867  848.3533
##  (between_SS / total_SS =  40.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Dendogramas

hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "ward.D2")

plot(hclust.ward.caso3,
     labels = Datos_3_3_Caso$CCAA,
     main = "Dendrograma - Método Ward.D2",
     xlab = "", ylab = "Distancia")

grupos <- cutree(hclust.ward.caso3, k = 2)

rect.hclust(hclust.ward.caso3, k = 2, border = "red")

Datos_con_grupos <- cbind(Datos_3_3_Caso, Cluster = grupos)

Datos_con_grupos
hclust.complete.caso3<-hclust(matriz.dis.euclid.caso3,method="complete")

plot(hclust.complete.caso3,labels=Datos_3_3_Caso$CC.AA.)

grupos <- cutree(hclust.complete.caso3, k = 2)
rect.hclust(hclust.complete.caso3, k = 2, border = "red")

Datos_con_grupos <- cbind(Datos_3_3_Caso, Cluster = grupos)

Datos_con_grupos
hclust.average.caso3<-hclust(matriz.dis.euclid.caso3,method="average")

plot(hclust.average.caso3,labels=Datos_3_3_Caso$CC.AA.)

grupos <- cutree(hclust.average.caso3, k = 2)
rect.hclust(hclust.average.caso3, k = 2, border = "red")

Datos_con_grupos <- cbind(Datos_3_3_Caso, Cluster = grupos)

Datos_con_grupos

Código de las Pruebas \(t\) de Significatividad

library(dplyr)
library(kableExtra)
solucion.cluster <- solucion$cluster

Datos_3_3_Caso$Cluster <- solucion.cluster

# PRUEBAS t 
t1 <- t.test(Automovil    ~ Cluster, data = Datos_3_3_Caso)
t2 <- t.test(TV_color     ~ Cluster, data = Datos_3_3_Caso)
t3 <- t.test(Video        ~ Cluster, data = Datos_3_3_Caso)
t4 <- t.test(Microondas   ~ Cluster, data = Datos_3_3_Caso)
t5 <- t.test(Lavavajillas ~ Cluster, data = Datos_3_3_Caso)
t6 <- t.test(Telefono     ~ Cluster, data = Datos_3_3_Caso)


resultados.ttest <- data.frame(
  Variable = c("Automovil", "TV_color", "Video", "Microondas", "Lavavajillas", "Telefono"),
  Prueba_t = c(t1$statistic, t2$statistic, t3$statistic, t4$statistic, t5$statistic, t6$statistic),
  Grupo1   = c(t1$estimate[1], t2$estimate[1], t3$estimate[1], t4$estimate[1], t5$estimate[1], t6$estimate[1]),
  Grupo2   = c(t1$estimate[2], t2$estimate[2], t3$estimate[2], t4$estimate[2], t5$estimate[2], t6$estimate[2])
)

resultados.ttest %>% 
  kable(caption = "Significatividad de las diferencias entre los perfiles de los conglomerados",
        align = "c", digits = 2) %>% 
  kable_classic(html_font = "Times New Roman", font_size = 14) %>%
  kable_styling()
Significatividad de las diferencias entre los perfiles de los conglomerados
Variable Prueba_t Grupo1 Grupo2
Automovil -1.71 67.17 70.70
TV_color -2.52 96.82 98.53
Video -1.19 57.68 63.47
Microondas -6.73 25.42 44.70
Lavavajillas -4.61 11.81 22.43
Telefono -3.51 80.71 90.23