A32_Alpicaciones_de_los_Conglomerados

Desarrollen el ejemplo 3.7, de la página 114, reproduce todas las salidas.

#Diseño de un plan de insentivos para los vendedores

Carga de Datos

library(kableExtra)
# Crear un data.frame con los datos del cuadro 3.22
Equipamiento <- data.frame(
   CC.AA = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", "Cantabria", 
             "Castilla y León", "Castilla-La Mancha", "Cataluña", "Com. Valenciana", "Extremadura", 
             "Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja"),
   
    Automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 72.7, 60.5, 65.5, 
                  74.0, 69.0, 76.4, 71.3, 64.9),
   
   Tv_color = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 98.4, 97.7, 
                 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
   
   Video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 68.2, 43.7, 
              42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
   Microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 26.6, 20.7, 
                   13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
   Lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.80, 11.2, 14.0, 7.10, 19.8, 12.1, 11.7, 
                     14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
   Telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 84.4, 67.1, 85.9, 
                 95.7, 81.4, 87.4, 94.3, 83.4)
)
#Visulizar la tabla
kable(Equipamiento)

CC.AA	Automovil	Tv_color	Video	Microondas	Lavavajillas	Telefono
España	69.0	97.6	62.4	32.3	17.0	85.2
Andalucía	66.7	98.0	82.7	24.1	12.7	74.7
Aragón	67.2	97.5	56.8	43.4	20.6	88.4
Asturias	63.7	95.2	52.1	24.4	13.3	88.1
Baleares	71.9	98.8	62.4	29.8	10.1	87.9
Canarias	72.7	96.8	68.4	27.9	5.8	75.4
Cantabria	63.4	94.9	48.9	36.5	11.2	80.5
Castilla y León	65.8	97.1	47.7	28.1	14.0	85.0
Castilla-La Mancha	61.5	97.3	53.6	21.7	7.1	72.9
Cataluña	70.4	98.1	71.1	36.8	19.8	92.2
Com. Valenciana	72.7	98.4	68.2	26.6	12.1	84.4
Extremadura	60.5	97.7	43.7	20.7	11.7	67.1
Galicia	65.5	91.3	42.7	13.5	14.6	85.9
Madrid	74.0	99.4	76.3	53.9	32.3	95.7
Murcia	69.0	98.7	59.3	19.5	12.1	81.4
Navarra	76.4	99.3	60.6	44.0	20.6	87.4
País Vasco	71.3	98.3	61.6	45.7	23.7	94.3
La Rioja	64.9	98.6	54.4	44.4	17.6	83.4

1. Analisis de la existencia de outliers.

Detección de los valores atípicos mediante la distacia de Mahalanobis

library(dplyr)
library(knitr)
#Matriz de datos
matriz_equipamiento <- as.matrix(Equipamiento[, c("Automovil", "Tv_color", "Video", "Microondas", "Lavavajillas", "Telefono")])
#Calculo del vector de medias de las variables
mean <- colMeans(matriz_equipamiento)
#Calculo de la matriz de covarianzas
Sx <- cov(matriz_equipamiento)
#Calculo de la D2 de Mahalanobis
D2 <- mahalanobis(matriz_equipamiento, mean,Sx,inverted = FALSE)
#Muestra la significatividad
P_value <- pchisq(D2, df=6, lower.tail = FALSE)
#Calculo del valor critico
qchisq(.99, df=6)

## [1] 16.81189

#Creacion de dataframe final
Nombres_CC.AA <- Equipamiento$CC.AA
Resultado_df <- data.frame("CC.AA"= Nombres_CC.AA,"D2"= D2,
                           "P_value" = P_value,check.names = FALSE)
#Tabla de resultados
knitr::kable(Resultado_df,caption = "Resultados de la deteccion de Outliers", digits = 2,col.names = c("CC.AA","D2","P_value"))

Resultados de la deteccion de Outliers
CC.AA	D2	P_value
España	0.20	1.00
Andalucía	10.52	0.10
Aragón	1.91	0.93
Asturias	4.46	0.61
Baleares	5.70	0.46
Canarias	9.58	0.14
Cantabria	7.29	0.29
Castilla y León	2.21	0.90
Castilla-La Mancha	3.54	0.74
Cataluña	2.95	0.82
Com. Valenciana	2.65	0.85
Extremadura	10.43	0.11
Galicia	13.24	0.04
Madrid	8.31	0.22
Murcia	4.88	0.56
Navarra	7.65	0.26
País Vasco	2.32	0.89
La Rioja	4.17	0.65

2. Realizacion de un analisis de Conglomerados Jerarquicos.

Metodo Centroide

library(dplyr)
#Calculo de la distancia euclidea
matriz.dis.euclid <- dist(matriz_equipamiento,
                          method = "euclidean",diag = TRUE)
#Efectuamos el cluster con metodo centroide
hclust.centroide <- hclust(matriz.dis.euclid,method = "centroid")
plot(hclust.centroide,labels = Equipamiento$CC.AA)

Metodo del vecino mas cercano(Single)

hclust.single <- hclust(matriz.dis.euclid,method = "single")
plot(hclust.single,labels = Equipamiento$CC.AA)

Metodo del vecion mas lejano(Complete)

hclust.complete <- hclust(matriz.dis.euclid,method = "complete")
plot(hclust.complete,labels = Equipamiento$CC.AA)

Metodo de la vinculacion promedio(Average)

hclust.average <- hclust(matriz.dis.euclid,method = "average")
plot(hclust.average,labels = Equipamiento$CC.AA)

Metodo de Ward

hclust.ward <- hclust(matriz.dis.euclid,method = "ward.D2")
plot(hclust.ward,labels = Equipamiento$CC.AA)

Determinacion del numero de Conglomerados solo para las 3 tecnicas: Ward,Complete y Average.

Para Ward

library(NbClust)
Datos.NbClust  <- matriz_equipamiento
res.wardD2 <- NbClust(Datos.NbClust,distance= "euclidean",min.nc = 2,
                      max.nc = 15, method = "ward.D2", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Para Complete

res.complete <- NbClust(Datos.NbClust,distance= "euclidean",min.nc = 2,
                      max.nc = 15, method = "complete", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Para Average

res.Average<- NbClust(Datos.NbClust,distance= "euclidean",min.nc = 2,
                      max.nc = 15, method = "average", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 5 proposed 4 as the best number of clusters 
## * 3 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  15 
##  
##  
## *******************************************************************

Centroides resultantes del metodo jerarquico

grupo.ward <- cutree(hclust.ward,k = 2,h = NULL)
datos.grupos <- cbind(matriz_equipamiento,grupo.ward)
datos.grupos$id <- NULL
round(aggregate(matriz_equipamiento,list(grupo.ward),mean),2)

##   Group.1 Automovil Tv_color Video Microondas Lavavajillas Telefono
## 1       1     66.87    96.82 57.68      25.42        11.81    80.71
## 2       2     70.70    98.53 63.47      44.70        22.43    90.23

2. Realizacion de un analisis de Conglomerados No Jerarquicos.

c1 <- c(66.87,96.82,57.68,25.42,11.81,80.71)
c2 <- c(70.70,98.53,63.47,44.70,22.43,90.23)
solucion <- kmeans(matriz_equipamiento, rbind(c1,c2))
print(solucion)

## K-means clustering with 2 clusters of sizes 12, 6
## 
## Cluster means:
##   Automovil Tv_color    Video Microondas Lavavajillas Telefono
## 1  66.86667 96.81667 57.67500     25.425     11.80833 80.70833
## 2  70.70000 98.53333 63.46667     44.700     22.43333 90.23333
## 
## Clustering vector:
##  [1] 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 2810.6467  848.3533
##  (between_SS / total_SS =  40.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Prueba T(para cada variable dependiente)

library(dplyr)
library(kableExtra)
solucion.cluster <- solucion$cluster
#Automovil
t_test_Automovil <- t.test(Automovil~solucion.cluster,data = matriz_equipamiento)
#tv_color
t_test_Tvcolor <- t.test(Tv_color~solucion.cluster,data = matriz_equipamiento)
#Video
t_test_Video <- t.test(Video~solucion.cluster,data = matriz_equipamiento)
#Microondas
t_test_Microondas <- t.test(Microondas~solucion.cluster,data = matriz_equipamiento)
#Lavavajillas
t_test_Lavavajilla <- t.test(Lavavajillas~solucion.cluster,data = matriz_equipamiento)
#Telefono
t_test_Telefono <- t.test(Telefono~solucion.cluster,data = matriz_equipamiento)

Resultados.ttest <- data.frame(
  Variable = c("Automovil", "TVcolor", "Video", "Microondas", "Lavavajillas", "Telefono"),
  Grupo1=c(t_test_Automovil$estimate[1],t_test_Tvcolor$estimate[1],t_test_Video$estimate[1],t_test_Microondas$estimate[1],t_test_Lavavajilla$estimate[1],t_test_Telefono$estimate[1]),
  Grupo2=c(t_test_Automovil$estimate[2],t_test_Tvcolor$estimate[2],t_test_Video$estimate[2],t_test_Microondas$estimate[2],t_test_Lavavajilla$estimate[2],t_test_Telefono$estimate[2]),
   # Aplica la función abs() para asegurarte de que las estadísticas t sean siempre positivas
 Pruebas_t = c(abs(t_test_Automovil$statistic), abs(t_test_Tvcolor$statistic), abs(t_test_Video$statistic), abs(t_test_Microondas$statistic), abs(t_test_Lavavajilla$statistic), abs(t_test_Telefono$statistic)))

head(Resultados.ttest)%>% kable(caption = "Significatividad de las diferencias entre los perfiles de los conglomerados", align = "c",digits = 2) %>%  kable_classic(html_font = "Times New Roman", font_size = 14) %>%  kable_styling()

Significatividad de las diferencias entre los perfiles de los conglomerados
Variable	Grupo1	Grupo2	Pruebas_t
Automovil	66.87	70.70	1.81
TVcolor	96.82	98.53	2.52
Video	57.68	63.47	1.19
Microondas	25.42	44.70	6.73
Lavavajillas	11.81	22.43	4.61
Telefono	80.71	90.23	3.51

Visualización de los resultados de un análisis de conglomerados

library(ggplot2)

mds_result <- cmdscale(matriz.dis.euclid, k = 2)
coordenadas_mds <- data.frame(Componente_principal_1 = mds_result[,1], Componente_principal_2 = mds_result[,2])

ciudades <- c("España", "Andalucia", "Aragon", "Asturias", "Balerares", "Canarias", "Cantabria", 
              "CyL", "CLM", "Cataluña", "Valenciana", 
              "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "P.Vasco", "Rioja")

coordenadas_mds$Ciudad <- ciudades
coordenadas_mds$Cluster <- factor(solucion.cluster)

ggplot(coordenadas_mds, aes(x = Componente_principal_1, y = Componente_principal_2, color = Cluster)) +
  geom_point(size = 3) +
  geom_text(aes(label = Ciudad), vjust = -0.5, hjust = 0.5, size = 3.5) +  
  labs(x = "Componente_principal_1", y = "Componente_principal_2", 
       title = "Visualizacion MDS") +
  theme_minimal() +
  theme(legend.position = "bottom")