A33 Análisis de conglomerados

PRESENTADO POR:

INTEGRANTES	CARNET
LOPEZ RIVAS, ROXANA BEATRIZ	LR12002
CORTEZ CARRILLO, KARLA XIOMARA	CC19020
LEON REYES, CESIA YASMIN	LR21033
ZOMETA MORAN, KIMBERLY YAMILETH	ZM21018

Análisis de conglomerados

Caso 3.3: Diseño de un plan de incentivos para vendedores

Cuadro 3.22: Equipamiento de los hogares en distintas comunidades autónomas

library(dplyr)
library(kableExtra)

Cuadro_3_22 <- data.frame(
    CC.AA. = c("Espana", "Andalucia", "Aragon", "Asturias", "Baleares", "Canarias", "Cantabria", "Castilla y Leon", "Castilla-La Mancha", "Cataluna", "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "Pais Vasco", "La Rioja"),
    automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
    tvcolor = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
    video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
    microond = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
    lavavaji = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.80, 11.2, 14.0, 7.10, 19.8, 12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
    telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
    )

Cuadro_3_22 %>% kable(caption = "Cuadro 3.22: Equipamiento de los hogares en distintas comunidades autonomas") %>%  
  kable_classic(html_font = "Times New Roman", font_size = 14) %>% 
  add_footnote(label="Panel de Hogares de la Union Europea. INE.",
               notation="symbol")

Cuadro 3.22: Equipamiento de los hogares en distintas comunidades autonomas
CC.AA.	automovil	tvcolor	video	microond	lavavaji	telefono
Espana	69.0	97.6	62.4	32.3	17.0	85.2
Andalucia	66.7	98.0	82.7	24.1	12.7	74.7
Aragon	67.2	97.5	56.8	43.4	20.6	88.4
Asturias	63.7	95.2	52.1	24.4	13.3	88.1
Baleares	71.9	98.8	62.4	29.8	10.1	87.9
Canarias	72.7	96.8	68.4	27.9	5.8	75.4
Cantabria	63.4	94.9	48.9	36.5	11.2	80.5
Castilla y Leon	65.8	97.1	47.7	28.1	14.0	85.0
Castilla-La Mancha	61.5	97.3	53.6	21.7	7.1	72.9
Cataluna	70.4	98.1	71.1	36.8	19.8	92.2
Com. Valenciana	72.7	98.4	68.2	26.6	12.1	84.4
Extremadura	60.5	97.7	43.7	20.7	11.7	67.1
Galicia	65.5	91.3	42.7	13.5	14.6	85.9
Madrid	74.0	99.4	76.3	53.9	32.3	95.7
Murcia	69.0	98.7	59.3	19.5	12.1	81.4
Navarra	76.4	99.3	60.6	44.0	20.6	87.4
Pais Vasco	71.3	98.3	61.6	45.7	23.7	94.3
La Rioja	64.9	98.6	54.4	44.4	17.6	83.4
^* Panel de Hogares de la Union Europea. INE.

#Matriz de datos
matriz_datos_3_3 <- as.matrix(Cuadro_3_22[, c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono")])

Outliers metiante la distancia de Mahalanobis

mean <- colMeans(matriz_datos_3_3)
Sx <- cov(matriz_datos_3_3)
D2 <- mahalanobis(matriz_datos_3_3,mean,Sx,inverted = FALSE)
sig <- pchisq(D2,df = 6,lower.tail = FALSE)

resultados_outliers <- data.frame(CC.AA. = c("Espana", "Andalucia", "Aragon", "Asturias", "Baleares", "Canarias", "Cantabria", "Castilla y Leon", "Castilla-La Mancha", "Cataluna", "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "Pais Vasco", "La Rioja"),
                                  D2 = D2, Significatividades = sig)

resultados_outliers %>% kable(caption = "Resultados de la deteccion de outliers") %>%  
  kable_classic(html_font = "Times New Roman", font_size = 14)

Resultados de la deteccion de outliers
CC.AA.	D2	Significatividades
Espana	0.1957042	0.9998549
Andalucia	10.5207676	0.1043658
Aragon	1.9097471	0.9278091
Asturias	4.4628800	0.6142959
Baleares	5.7026801	0.4573061
Canarias	9.5791345	0.1435313
Cantabria	7.2910646	0.2947664
Castilla y Leon	2.2064805	0.8997629
Castilla-La Mancha	3.5390966	0.7387610
Cataluna	2.9473889	0.8154205
Com. Valenciana	2.6469747	0.8516698
Extremadura	10.4333455	0.1075495
Galicia	13.2415547	0.0393564
Madrid	8.3056697	0.2165535
Murcia	4.8751556	0.5599224
Navarra	7.6517494	0.2647430
Pais Vasco	2.3173225	0.8883259
La Rioja	4.1732837	0.6532394

Análisis de conglomerados jerárquicos

matriz.dis.euclid.caso3 <- dist(matriz_datos_3_3, method = "euclidean")

# Promedio (average)
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3,method = "average")

plot(hclust.average.caso3)

# Ward (ward.D2)
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3,method = "ward.D2")

plot(hclust.ward.caso3)

# Vecino mas lejano (complete)
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3,method = "complete")

plot(hclust.complete.caso3)

# Vecino mas cercano (single)
hclust.single.caso3 <- hclust(matriz.dis.euclid.caso3,method = "single")

plot(hclust.single.caso3)

# Centroide (centroid)
hclust.centroid.caso3 <- hclust(matriz.dis.euclid.caso3,method = "centroid")

plot(hclust.centroid.caso3)

Determinación del número de conglomerados

library(NbClust)

# Ward (ward.D2)
res.wardD2 <- NbClust(matriz_datos_3_3,distance = "euclidean",min.nc = 2,max.nc = 15, method = "ward.D2", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

# Vecino mas lejano (complete)
res.complete <- NbClust(matriz_datos_3_3,distance = "euclidean",min.nc = 2,max.nc = 15, method = "complete", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Centroides resultantes del método jerárquico

grupo.ward <- cutree(hclust.ward.caso3,k = 2,h = NULL)
datos.caso3.grupos <- cbind(matriz_datos_3_3,grupo.ward)

round(aggregate(datos.caso3.grupos,list(grupo.ward),mean),2)

##   Group.1 automovil tvcolor video microond lavavaji telefono grupo.ward
## 1       1     66.87   96.82 57.68    25.42    11.81    80.71          1
## 2       2     70.70   98.53 63.47    44.70    22.43    90.23          2

Análisis de conglomerados no jerárquicos

datos.caso3.grupos.kmeans <- datos.caso3.grupos[, -ncol(datos.caso3.grupos)]

c1 <- c(70.52,  98.29,  65.35,  35.70,  17.03,  85.87)
c2 <- c(63.40,  95.58,  48.12,  24.15,  11.98,  79.92)
solucion_kmeans <- kmeans(datos.caso3.grupos.kmeans, rbind(c1,c2)) %>% print()

## K-means clustering with 2 clusters of sizes 11, 7
## 
## Cluster means:
##   automovil  tvcolor    video microond lavavaji telefono
## 1  70.65455 98.25455 65.90000 37.17273 17.48182 86.27273
## 2  64.20000 96.02857 49.71429 23.48571 12.00000 80.12857
## 
## Clustering vector:
##  [1] 1 1 1 2 1 1 2 2 2 1 1 2 2 1 2 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 2764.2745  988.5257
##  (between_SS / total_SS =  39.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Prueba T

# automovil
t_test_automovil <- t.test(automovil~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# tvcolor
t_test_tvcolor <- t.test(tvcolor~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# video
t_test_video <- t.test(video~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# microondas
t_test_microond <- t.test(microond~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# lavavajillas
t_test_lavavaji <- t.test(lavavaji~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# telefono
t_test_telefono <- t.test(telefono~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# tabla de resultados
resultados_t_test <- data.frame(
  Variable = c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono"),
  Grupo_1 = c(
    t_test_automovil[["estimate"]][["mean in group 1"]],
    t_test_tvcolor[["estimate"]][["mean in group 1"]],
    t_test_video[["estimate"]][["mean in group 1"]],
    t_test_microond[["estimate"]][["mean in group 1"]],
    t_test_lavavaji[["estimate"]][["mean in group 1"]],
    t_test_telefono[["estimate"]][["mean in group 1"]]
  ),
  Grupo_2 = c(
    t_test_automovil[["estimate"]][["mean in group 2"]],
    t_test_tvcolor[["estimate"]][["mean in group 2"]],
    t_test_video[["estimate"]][["mean in group 2"]],
    t_test_microond[["estimate"]][["mean in group 2"]],
    t_test_lavavaji[["estimate"]][["mean in group 2"]],
    t_test_telefono[["estimate"]][["mean in group 2"]]
  ),
  Prueba_t = c(
    t_test_automovil[["statistic"]][["t"]],
    t_test_tvcolor[["statistic"]][["t"]],
    t_test_video[["statistic"]][["t"]],
    t_test_microond[["statistic"]][["t"]],
    t_test_lavavaji[["statistic"]][["t"]],
    t_test_telefono[["statistic"]][["t"]]
  ),
    P_value = c(
    t_test_automovil[["p.value"]],
    t_test_tvcolor[["p.value"]],
    t_test_video[["p.value"]],
    t_test_microond[["p.value"]],
    t_test_lavavaji[["p.value"]],
    t_test_telefono[["p.value"]]
    )
)

resultados_t_test %>% kable(caption = "Resultados T test") %>%  
  kable_classic(html_font = "Times New Roman", font_size = 14)

Resultados T test
Variable	Grupo_1	Grupo_2	Prueba_t	P_value
automovil	70.65455	64.20000	4.309306	0.0006522
tvcolor	98.25455	96.02857	2.300489	0.0562670
video	65.90000	49.71429	4.808811	0.0001987
microond	37.17273	23.48571	3.412048	0.0037381
lavavaji	17.48182	12.00000	2.302470	0.0380856
telefono	86.27273	80.12857	1.744970	0.1068308

Visualización de los resultados del análisis de conglomerados

library(ggplot2)

mds_result <- cmdscale(matriz.dis.euclid.caso3, k = 2)
coordenadas_mds <- data.frame(Dim1 = mds_result[,1], Dim2 = mds_result[,2])

ciudades <- c("Espana", "Andalucia", "Aragon", "Asturias", "Baleares", "Canarias", "Cantabria", 
              "CyL", "CLM", "Cataluna", "Valencia", 
              "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "P.Vasco", "Rioja")

coordenadas_mds$Ciudad <- ciudades
coordenadas_mds$Cluster <- factor(solucion_kmeans[["cluster"]])

ggplot(coordenadas_mds, aes(x = Dim1, y = Dim2, color = Cluster)) +
  geom_point(size = 3) +
  geom_text(aes(label = Ciudad), vjust = -0.5, hjust = 0.5, size = 3.5) +  
  labs(x = "Dimension 1", y = "Dimension 2", 
       title = "Visualizacion MDS") +
  theme_minimal() +
  theme(legend.position = "bottom")