A33 Análisis de conglomerados

PRESENTADO POR:

INTEGRANTES CARNET
LOPEZ RIVAS, ROXANA BEATRIZ LR12002
CORTEZ CARRILLO, KARLA XIOMARA CC19020
LEON REYES, CESIA YASMIN LR21033
ZOMETA MORAN, KIMBERLY YAMILETH ZM21018

Análisis de conglomerados

Caso 3.3: Diseño de un plan de incentivos para vendedores

Cuadro 3.22: Equipamiento de los hogares en distintas comunidades autónomas

library(dplyr)
library(kableExtra)

Cuadro_3_22 <- data.frame(
    CC.AA. = c("Espana", "Andalucia", "Aragon", "Asturias", "Baleares", "Canarias", "Cantabria", "Castilla y Leon", "Castilla-La Mancha", "Cataluna", "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "Pais Vasco", "La Rioja"),
    automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
    tvcolor = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
    video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
    microond = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
    lavavaji = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.80, 11.2, 14.0, 7.10, 19.8, 12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
    telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
    )

Cuadro_3_22 %>% kable(caption = "Cuadro 3.22: Equipamiento de los hogares en distintas comunidades autonomas") %>%  
  kable_classic(html_font = "Times New Roman", font_size = 14) %>% 
  add_footnote(label="Panel de Hogares de la Union Europea. INE.",
               notation="symbol")
Cuadro 3.22: Equipamiento de los hogares en distintas comunidades autonomas
CC.AA. automovil tvcolor video microond lavavaji telefono
Espana 69.0 97.6 62.4 32.3 17.0 85.2
Andalucia 66.7 98.0 82.7 24.1 12.7 74.7
Aragon 67.2 97.5 56.8 43.4 20.6 88.4
Asturias 63.7 95.2 52.1 24.4 13.3 88.1
Baleares 71.9 98.8 62.4 29.8 10.1 87.9
Canarias 72.7 96.8 68.4 27.9 5.8 75.4
Cantabria 63.4 94.9 48.9 36.5 11.2 80.5
Castilla y Leon 65.8 97.1 47.7 28.1 14.0 85.0
Castilla-La Mancha 61.5 97.3 53.6 21.7 7.1 72.9
Cataluna 70.4 98.1 71.1 36.8 19.8 92.2
Com. Valenciana 72.7 98.4 68.2 26.6 12.1 84.4
Extremadura 60.5 97.7 43.7 20.7 11.7 67.1
Galicia 65.5 91.3 42.7 13.5 14.6 85.9
Madrid 74.0 99.4 76.3 53.9 32.3 95.7
Murcia 69.0 98.7 59.3 19.5 12.1 81.4
Navarra 76.4 99.3 60.6 44.0 20.6 87.4
Pais Vasco 71.3 98.3 61.6 45.7 23.7 94.3
La Rioja 64.9 98.6 54.4 44.4 17.6 83.4
* Panel de Hogares de la Union Europea. INE.
#Matriz de datos
matriz_datos_3_3 <- as.matrix(Cuadro_3_22[, c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono")])

Outliers metiante la distancia de Mahalanobis

mean <- colMeans(matriz_datos_3_3)
Sx <- cov(matriz_datos_3_3)
D2 <- mahalanobis(matriz_datos_3_3,mean,Sx,inverted = FALSE)
sig <- pchisq(D2,df = 6,lower.tail = FALSE)

resultados_outliers <- data.frame(CC.AA. = c("Espana", "Andalucia", "Aragon", "Asturias", "Baleares", "Canarias", "Cantabria", "Castilla y Leon", "Castilla-La Mancha", "Cataluna", "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "Pais Vasco", "La Rioja"),
                                  D2 = D2, Significatividades = sig)

resultados_outliers %>% kable(caption = "Resultados de la deteccion de outliers") %>%  
  kable_classic(html_font = "Times New Roman", font_size = 14)
Resultados de la deteccion de outliers
CC.AA. D2 Significatividades
Espana 0.1957042 0.9998549
Andalucia 10.5207676 0.1043658
Aragon 1.9097471 0.9278091
Asturias 4.4628800 0.6142959
Baleares 5.7026801 0.4573061
Canarias 9.5791345 0.1435313
Cantabria 7.2910646 0.2947664
Castilla y Leon 2.2064805 0.8997629
Castilla-La Mancha 3.5390966 0.7387610
Cataluna 2.9473889 0.8154205
Com. Valenciana 2.6469747 0.8516698
Extremadura 10.4333455 0.1075495
Galicia 13.2415547 0.0393564
Madrid 8.3056697 0.2165535
Murcia 4.8751556 0.5599224
Navarra 7.6517494 0.2647430
Pais Vasco 2.3173225 0.8883259
La Rioja 4.1732837 0.6532394

Análisis de conglomerados jerárquicos

matriz.dis.euclid.caso3 <- dist(matriz_datos_3_3, method = "euclidean")

# Promedio (average)
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3,method = "average")

plot(hclust.average.caso3)

# Ward (ward.D2)
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3,method = "ward.D2")

plot(hclust.ward.caso3)

# Vecino mas lejano (complete)
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3,method = "complete")

plot(hclust.complete.caso3)

# Vecino mas cercano (single)
hclust.single.caso3 <- hclust(matriz.dis.euclid.caso3,method = "single")

plot(hclust.single.caso3)

# Centroide (centroid)
hclust.centroid.caso3 <- hclust(matriz.dis.euclid.caso3,method = "centroid")

plot(hclust.centroid.caso3)

Determinación del número de conglomerados

library(NbClust)

# Ward (ward.D2)
res.wardD2 <- NbClust(matriz_datos_3_3,distance = "euclidean",min.nc = 2,max.nc = 15, method = "ward.D2", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
# Vecino mas lejano (complete)
res.complete <- NbClust(matriz_datos_3_3,distance = "euclidean",min.nc = 2,max.nc = 15, method = "complete", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Centroides resultantes del método jerárquico

grupo.ward <- cutree(hclust.ward.caso3,k = 2,h = NULL)
datos.caso3.grupos <- cbind(matriz_datos_3_3,grupo.ward)

round(aggregate(datos.caso3.grupos,list(grupo.ward),mean),2)
##   Group.1 automovil tvcolor video microond lavavaji telefono grupo.ward
## 1       1     66.87   96.82 57.68    25.42    11.81    80.71          1
## 2       2     70.70   98.53 63.47    44.70    22.43    90.23          2

Análisis de conglomerados no jerárquicos

datos.caso3.grupos.kmeans <- datos.caso3.grupos[, -ncol(datos.caso3.grupos)]

c1 <- c(70.52,  98.29,  65.35,  35.70,  17.03,  85.87)
c2 <- c(63.40,  95.58,  48.12,  24.15,  11.98,  79.92)
solucion_kmeans <- kmeans(datos.caso3.grupos.kmeans, rbind(c1,c2)) %>% print()
## K-means clustering with 2 clusters of sizes 11, 7
## 
## Cluster means:
##   automovil  tvcolor    video microond lavavaji telefono
## 1  70.65455 98.25455 65.90000 37.17273 17.48182 86.27273
## 2  64.20000 96.02857 49.71429 23.48571 12.00000 80.12857
## 
## Clustering vector:
##  [1] 1 1 1 2 1 1 2 2 2 1 1 2 2 1 2 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 2764.2745  988.5257
##  (between_SS / total_SS =  39.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Prueba T

# automovil
t_test_automovil <- t.test(automovil~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# tvcolor
t_test_tvcolor <- t.test(tvcolor~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# video
t_test_video <- t.test(video~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# microondas
t_test_microond <- t.test(microond~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# lavavajillas
t_test_lavavaji <- t.test(lavavaji~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# telefono
t_test_telefono <- t.test(telefono~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# tabla de resultados
resultados_t_test <- data.frame(
  Variable = c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono"),
  Grupo_1 = c(
    t_test_automovil[["estimate"]][["mean in group 1"]],
    t_test_tvcolor[["estimate"]][["mean in group 1"]],
    t_test_video[["estimate"]][["mean in group 1"]],
    t_test_microond[["estimate"]][["mean in group 1"]],
    t_test_lavavaji[["estimate"]][["mean in group 1"]],
    t_test_telefono[["estimate"]][["mean in group 1"]]
  ),
  Grupo_2 = c(
    t_test_automovil[["estimate"]][["mean in group 2"]],
    t_test_tvcolor[["estimate"]][["mean in group 2"]],
    t_test_video[["estimate"]][["mean in group 2"]],
    t_test_microond[["estimate"]][["mean in group 2"]],
    t_test_lavavaji[["estimate"]][["mean in group 2"]],
    t_test_telefono[["estimate"]][["mean in group 2"]]
  ),
  Prueba_t = c(
    t_test_automovil[["statistic"]][["t"]],
    t_test_tvcolor[["statistic"]][["t"]],
    t_test_video[["statistic"]][["t"]],
    t_test_microond[["statistic"]][["t"]],
    t_test_lavavaji[["statistic"]][["t"]],
    t_test_telefono[["statistic"]][["t"]]
  ),
    P_value = c(
    t_test_automovil[["p.value"]],
    t_test_tvcolor[["p.value"]],
    t_test_video[["p.value"]],
    t_test_microond[["p.value"]],
    t_test_lavavaji[["p.value"]],
    t_test_telefono[["p.value"]]
    )
)

resultados_t_test %>% kable(caption = "Resultados T test") %>%  
  kable_classic(html_font = "Times New Roman", font_size = 14)
Resultados T test
Variable Grupo_1 Grupo_2 Prueba_t P_value
automovil 70.65455 64.20000 4.309306 0.0006522
tvcolor 98.25455 96.02857 2.300489 0.0562670
video 65.90000 49.71429 4.808811 0.0001987
microond 37.17273 23.48571 3.412048 0.0037381
lavavaji 17.48182 12.00000 2.302470 0.0380856
telefono 86.27273 80.12857 1.744970 0.1068308

Visualización de los resultados del análisis de conglomerados

library(ggplot2)

mds_result <- cmdscale(matriz.dis.euclid.caso3, k = 2)
coordenadas_mds <- data.frame(Dim1 = mds_result[,1], Dim2 = mds_result[,2])

ciudades <- c("Espana", "Andalucia", "Aragon", "Asturias", "Baleares", "Canarias", "Cantabria", 
              "CyL", "CLM", "Cataluna", "Valencia", 
              "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "P.Vasco", "Rioja")

coordenadas_mds$Ciudad <- ciudades
coordenadas_mds$Cluster <- factor(solucion_kmeans[["cluster"]])

ggplot(coordenadas_mds, aes(x = Dim1, y = Dim2, color = Cluster)) +
  geom_point(size = 3) +
  geom_text(aes(label = Ciudad), vjust = -0.5, hjust = 0.5, size = 3.5) +  
  labs(x = "Dimension 1", y = "Dimension 2", 
       title = "Visualizacion MDS") +
  theme_minimal() +
  theme(legend.position = "bottom")