A32_Alpicaciones_de_los_Conglomerados

Desarrollen el ejemplo 3.7, de la página 114, reproduce todas las salidas.

#Diseño de un plan de insentivos para los vendedores

Carga de Datos

library(kableExtra)
# Crear un data.frame con los datos del cuadro 3.22
Equipamiento <- data.frame(
   CC.AA = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", "Cantabria", 
             "Castilla y León", "Castilla-La Mancha", "Cataluña", "Com. Valenciana", "Extremadura", 
             "Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja"),
   
    Automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 72.7, 60.5, 65.5, 
                  74.0, 69.0, 76.4, 71.3, 64.9),
   
   Tv_color = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 98.4, 97.7, 
                 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
   
   Video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 68.2, 43.7, 
              42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
   Microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 26.6, 20.7, 
                   13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
   Lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.80, 11.2, 14.0, 7.10, 19.8, 12.1, 11.7, 
                     14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
   Telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 84.4, 67.1, 85.9, 
                 95.7, 81.4, 87.4, 94.3, 83.4)
)
#Visulizar la tabla
kable(Equipamiento)
CC.AA Automovil Tv_color Video Microondas Lavavajillas Telefono
España 69.0 97.6 62.4 32.3 17.0 85.2
Andalucía 66.7 98.0 82.7 24.1 12.7 74.7
Aragón 67.2 97.5 56.8 43.4 20.6 88.4
Asturias 63.7 95.2 52.1 24.4 13.3 88.1
Baleares 71.9 98.8 62.4 29.8 10.1 87.9
Canarias 72.7 96.8 68.4 27.9 5.8 75.4
Cantabria 63.4 94.9 48.9 36.5 11.2 80.5
Castilla y León 65.8 97.1 47.7 28.1 14.0 85.0
Castilla-La Mancha 61.5 97.3 53.6 21.7 7.1 72.9
Cataluña 70.4 98.1 71.1 36.8 19.8 92.2
Com. Valenciana 72.7 98.4 68.2 26.6 12.1 84.4
Extremadura 60.5 97.7 43.7 20.7 11.7 67.1
Galicia 65.5 91.3 42.7 13.5 14.6 85.9
Madrid 74.0 99.4 76.3 53.9 32.3 95.7
Murcia 69.0 98.7 59.3 19.5 12.1 81.4
Navarra 76.4 99.3 60.6 44.0 20.6 87.4
País Vasco 71.3 98.3 61.6 45.7 23.7 94.3
La Rioja 64.9 98.6 54.4 44.4 17.6 83.4

1. Analisis de la existencia de outliers.

Detección de los valores atípicos mediante la distacia de Mahalanobis

library(dplyr)
library(knitr)
#Matriz de datos
matriz_equipamiento <- as.matrix(Equipamiento[, c("Automovil", "Tv_color", "Video", "Microondas", "Lavavajillas", "Telefono")])
#Calculo del vector de medias de las variables
mean <- colMeans(matriz_equipamiento)
#Calculo de la matriz de covarianzas
Sx <- cov(matriz_equipamiento)
#Calculo de la D2 de Mahalanobis
D2 <- mahalanobis(matriz_equipamiento, mean,Sx,inverted = FALSE)
#Muestra la significatividad
P_value <- pchisq(D2, df=6, lower.tail = FALSE)
#Calculo del valor critico
qchisq(.99, df=6)
## [1] 16.81189
#Creacion de dataframe final
Nombres_CC.AA <- Equipamiento$CC.AA
Resultado_df <- data.frame("CC.AA"= Nombres_CC.AA,"D2"= D2,
                           "P_value" = P_value,check.names = FALSE)
#Tabla de resultados
knitr::kable(Resultado_df,caption = "Resultados de la deteccion de Outliers", digits = 2,col.names = c("CC.AA","D2","P_value"))
Resultados de la deteccion de Outliers
CC.AA D2 P_value
España 0.20 1.00
Andalucía 10.52 0.10
Aragón 1.91 0.93
Asturias 4.46 0.61
Baleares 5.70 0.46
Canarias 9.58 0.14
Cantabria 7.29 0.29
Castilla y León 2.21 0.90
Castilla-La Mancha 3.54 0.74
Cataluña 2.95 0.82
Com. Valenciana 2.65 0.85
Extremadura 10.43 0.11
Galicia 13.24 0.04
Madrid 8.31 0.22
Murcia 4.88 0.56
Navarra 7.65 0.26
País Vasco 2.32 0.89
La Rioja 4.17 0.65

2. Realizacion de un analisis de Conglomerados Jerarquicos.

Metodo Centroide

library(dplyr)
#Calculo de la distancia euclidea
matriz.dis.euclid <- dist(matriz_equipamiento,
                          method = "euclidean",diag = TRUE)
#Efectuamos el cluster con metodo centroide
hclust.centroide <- hclust(matriz.dis.euclid,method = "centroid")
plot(hclust.centroide,labels = Equipamiento$CC.AA)

Metodo del vecino mas cercano(Single)

hclust.single <- hclust(matriz.dis.euclid,method = "single")
plot(hclust.single,labels = Equipamiento$CC.AA)

Metodo del vecion mas lejano(Complete)

hclust.complete <- hclust(matriz.dis.euclid,method = "complete")
plot(hclust.complete,labels = Equipamiento$CC.AA)

Metodo de la vinculacion promedio(Average)

hclust.average <- hclust(matriz.dis.euclid,method = "average")
plot(hclust.average,labels = Equipamiento$CC.AA)

Metodo de Ward

hclust.ward <- hclust(matriz.dis.euclid,method = "ward.D2")
plot(hclust.ward,labels = Equipamiento$CC.AA)

Determinacion del numero de Conglomerados solo para las 3 tecnicas: Ward,Complete y Average.

Para Ward

library(NbClust)
Datos.NbClust  <- matriz_equipamiento
res.wardD2 <- NbClust(Datos.NbClust,distance= "euclidean",min.nc = 2,
                      max.nc = 15, method = "ward.D2", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Para Complete

res.complete <- NbClust(Datos.NbClust,distance= "euclidean",min.nc = 2,
                      max.nc = 15, method = "complete", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Para Average

res.Average<- NbClust(Datos.NbClust,distance= "euclidean",min.nc = 2,
                      max.nc = 15, method = "average", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 5 proposed 4 as the best number of clusters 
## * 3 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  15 
##  
##  
## *******************************************************************

Centroides resultantes del metodo jerarquico

grupo.ward <- cutree(hclust.ward,k = 2,h = NULL)
datos.grupos <- cbind(matriz_equipamiento,grupo.ward)
datos.grupos$id <- NULL
round(aggregate(matriz_equipamiento,list(grupo.ward),mean),2)
##   Group.1 Automovil Tv_color Video Microondas Lavavajillas Telefono
## 1       1     66.87    96.82 57.68      25.42        11.81    80.71
## 2       2     70.70    98.53 63.47      44.70        22.43    90.23

2. Realizacion de un analisis de Conglomerados No Jerarquicos.

c1 <- c(66.87,96.82,57.68,25.42,11.81,80.71)
c2 <- c(70.70,98.53,63.47,44.70,22.43,90.23)
solucion <- kmeans(matriz_equipamiento, rbind(c1,c2))
print(solucion)
## K-means clustering with 2 clusters of sizes 12, 6
## 
## Cluster means:
##   Automovil Tv_color    Video Microondas Lavavajillas Telefono
## 1  66.86667 96.81667 57.67500     25.425     11.80833 80.70833
## 2  70.70000 98.53333 63.46667     44.700     22.43333 90.23333
## 
## Clustering vector:
##  [1] 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 2810.6467  848.3533
##  (between_SS / total_SS =  40.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Prueba T(para cada variable dependiente)

library(dplyr)
library(kableExtra)
solucion.cluster <- solucion$cluster
#Automovil
t_test_Automovil <- t.test(Automovil~solucion.cluster,data = matriz_equipamiento)
#tv_color
t_test_Tvcolor <- t.test(Tv_color~solucion.cluster,data = matriz_equipamiento)
#Video
t_test_Video <- t.test(Video~solucion.cluster,data = matriz_equipamiento)
#Microondas
t_test_Microondas <- t.test(Microondas~solucion.cluster,data = matriz_equipamiento)
#Lavavajillas
t_test_Lavavajilla <- t.test(Lavavajillas~solucion.cluster,data = matriz_equipamiento)
#Telefono
t_test_Telefono <- t.test(Telefono~solucion.cluster,data = matriz_equipamiento)

Resultados.ttest <- data.frame(
  Variable = c("Automovil", "TVcolor", "Video", "Microondas", "Lavavajillas", "Telefono"),
  Grupo1=c(t_test_Automovil$estimate[1],t_test_Tvcolor$estimate[1],t_test_Video$estimate[1],t_test_Microondas$estimate[1],t_test_Lavavajilla$estimate[1],t_test_Telefono$estimate[1]),
  Grupo2=c(t_test_Automovil$estimate[2],t_test_Tvcolor$estimate[2],t_test_Video$estimate[2],t_test_Microondas$estimate[2],t_test_Lavavajilla$estimate[2],t_test_Telefono$estimate[2]),
   # Aplica la función abs() para asegurarte de que las estadísticas t sean siempre positivas
 Pruebas_t = c(abs(t_test_Automovil$statistic), abs(t_test_Tvcolor$statistic), abs(t_test_Video$statistic), abs(t_test_Microondas$statistic), abs(t_test_Lavavajilla$statistic), abs(t_test_Telefono$statistic)))

head(Resultados.ttest)%>% kable(caption = "Significatividad de las diferencias entre los perfiles de los conglomerados", align = "c",digits = 2) %>%  kable_classic(html_font = "Times New Roman", font_size = 14) %>%  kable_styling()
Significatividad de las diferencias entre los perfiles de los conglomerados
Variable Grupo1 Grupo2 Pruebas_t
Automovil 66.87 70.70 1.81
TVcolor 96.82 98.53 2.52
Video 57.68 63.47 1.19
Microondas 25.42 44.70 6.73
Lavavajillas 11.81 22.43 4.61
Telefono 80.71 90.23 3.51

Visualización de los resultados de un análisis de conglomerados

library(ggplot2)

mds_result <- cmdscale(matriz.dis.euclid, k = 2)
coordenadas_mds <- data.frame(Componente_principal_1 = mds_result[,1], Componente_principal_2 = mds_result[,2])

ciudades <- c("España", "Andalucia", "Aragon", "Asturias", "Balerares", "Canarias", "Cantabria", 
              "CyL", "CLM", "Cataluña", "Valenciana", 
              "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "P.Vasco", "Rioja")

coordenadas_mds$Ciudad <- ciudades
coordenadas_mds$Cluster <- factor(solucion.cluster)

ggplot(coordenadas_mds, aes(x = Componente_principal_1, y = Componente_principal_2, color = Cluster)) +
  geom_point(size = 3) +
  geom_text(aes(label = Ciudad), vjust = -0.5, hjust = 0.5, size = 3.5) +  
  labs(x = "Componente_principal_1", y = "Componente_principal_2", 
       title = "Visualizacion MDS") +
  theme_minimal() +
  theme(legend.position = "bottom")