A32_Alpicaciones_de_los_Conglomerados
Desarrollen el ejemplo 3.7, de la página 114, reproduce todas las salidas.
#Diseño de un plan de insentivos para los vendedores
Carga de Datos
library(kableExtra)
# Crear un data.frame con los datos del cuadro 3.22
Equipamiento <- data.frame(
CC.AA = c("España", "AndalucÃa", "Aragón", "Asturias", "Baleares", "Canarias", "Cantabria",
"Castilla y León", "Castilla-La Mancha", "Cataluña", "Com. Valenciana", "Extremadura",
"Galicia", "Madrid", "Murcia", "Navarra", "PaÃs Vasco", "La Rioja"),
Automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 72.7, 60.5, 65.5,
74.0, 69.0, 76.4, 71.3, 64.9),
Tv_color = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 98.4, 97.7,
91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
Video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 68.2, 43.7,
42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
Microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 26.6, 20.7,
13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
Lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.80, 11.2, 14.0, 7.10, 19.8, 12.1, 11.7,
14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
Telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 84.4, 67.1, 85.9,
95.7, 81.4, 87.4, 94.3, 83.4)
)
#Visulizar la tabla
kable(Equipamiento)| CC.AA | Automovil | Tv_color | Video | Microondas | Lavavajillas | Telefono |
|---|---|---|---|---|---|---|
| España | 69.0 | 97.6 | 62.4 | 32.3 | 17.0 | 85.2 |
| AndalucÃa | 66.7 | 98.0 | 82.7 | 24.1 | 12.7 | 74.7 |
| Aragón | 67.2 | 97.5 | 56.8 | 43.4 | 20.6 | 88.4 |
| Asturias | 63.7 | 95.2 | 52.1 | 24.4 | 13.3 | 88.1 |
| Baleares | 71.9 | 98.8 | 62.4 | 29.8 | 10.1 | 87.9 |
| Canarias | 72.7 | 96.8 | 68.4 | 27.9 | 5.8 | 75.4 |
| Cantabria | 63.4 | 94.9 | 48.9 | 36.5 | 11.2 | 80.5 |
| Castilla y León | 65.8 | 97.1 | 47.7 | 28.1 | 14.0 | 85.0 |
| Castilla-La Mancha | 61.5 | 97.3 | 53.6 | 21.7 | 7.1 | 72.9 |
| Cataluña | 70.4 | 98.1 | 71.1 | 36.8 | 19.8 | 92.2 |
| Com. Valenciana | 72.7 | 98.4 | 68.2 | 26.6 | 12.1 | 84.4 |
| Extremadura | 60.5 | 97.7 | 43.7 | 20.7 | 11.7 | 67.1 |
| Galicia | 65.5 | 91.3 | 42.7 | 13.5 | 14.6 | 85.9 |
| Madrid | 74.0 | 99.4 | 76.3 | 53.9 | 32.3 | 95.7 |
| Murcia | 69.0 | 98.7 | 59.3 | 19.5 | 12.1 | 81.4 |
| Navarra | 76.4 | 99.3 | 60.6 | 44.0 | 20.6 | 87.4 |
| PaÃs Vasco | 71.3 | 98.3 | 61.6 | 45.7 | 23.7 | 94.3 |
| La Rioja | 64.9 | 98.6 | 54.4 | 44.4 | 17.6 | 83.4 |
1. Analisis de la existencia de outliers.
Detección de los valores atÃpicos mediante la distacia de Mahalanobis
library(dplyr)
library(knitr)
#Matriz de datos
matriz_equipamiento <- as.matrix(Equipamiento[, c("Automovil", "Tv_color", "Video", "Microondas", "Lavavajillas", "Telefono")])
#Calculo del vector de medias de las variables
mean <- colMeans(matriz_equipamiento)
#Calculo de la matriz de covarianzas
Sx <- cov(matriz_equipamiento)
#Calculo de la D2 de Mahalanobis
D2 <- mahalanobis(matriz_equipamiento, mean,Sx,inverted = FALSE)
#Muestra la significatividad
P_value <- pchisq(D2, df=6, lower.tail = FALSE)
#Calculo del valor critico
qchisq(.99, df=6)## [1] 16.81189
#Creacion de dataframe final
Nombres_CC.AA <- Equipamiento$CC.AA
Resultado_df <- data.frame("CC.AA"= Nombres_CC.AA,"D2"= D2,
"P_value" = P_value,check.names = FALSE)
#Tabla de resultados
knitr::kable(Resultado_df,caption = "Resultados de la deteccion de Outliers", digits = 2,col.names = c("CC.AA","D2","P_value"))| CC.AA | D2 | P_value |
|---|---|---|
| España | 0.20 | 1.00 |
| AndalucÃa | 10.52 | 0.10 |
| Aragón | 1.91 | 0.93 |
| Asturias | 4.46 | 0.61 |
| Baleares | 5.70 | 0.46 |
| Canarias | 9.58 | 0.14 |
| Cantabria | 7.29 | 0.29 |
| Castilla y León | 2.21 | 0.90 |
| Castilla-La Mancha | 3.54 | 0.74 |
| Cataluña | 2.95 | 0.82 |
| Com. Valenciana | 2.65 | 0.85 |
| Extremadura | 10.43 | 0.11 |
| Galicia | 13.24 | 0.04 |
| Madrid | 8.31 | 0.22 |
| Murcia | 4.88 | 0.56 |
| Navarra | 7.65 | 0.26 |
| PaÃs Vasco | 2.32 | 0.89 |
| La Rioja | 4.17 | 0.65 |
2. Realizacion de un analisis de Conglomerados Jerarquicos.
Metodo Centroide
library(dplyr)
#Calculo de la distancia euclidea
matriz.dis.euclid <- dist(matriz_equipamiento,
method = "euclidean",diag = TRUE)
#Efectuamos el cluster con metodo centroide
hclust.centroide <- hclust(matriz.dis.euclid,method = "centroid")
plot(hclust.centroide,labels = Equipamiento$CC.AA)Metodo del vecino mas cercano(Single)
hclust.single <- hclust(matriz.dis.euclid,method = "single")
plot(hclust.single,labels = Equipamiento$CC.AA)Metodo del vecion mas lejano(Complete)
hclust.complete <- hclust(matriz.dis.euclid,method = "complete")
plot(hclust.complete,labels = Equipamiento$CC.AA)Metodo de la vinculacion promedio(Average)
hclust.average <- hclust(matriz.dis.euclid,method = "average")
plot(hclust.average,labels = Equipamiento$CC.AA)Metodo de Ward
hclust.ward <- hclust(matriz.dis.euclid,method = "ward.D2")
plot(hclust.ward,labels = Equipamiento$CC.AA)Determinacion del numero de Conglomerados solo para las 3 tecnicas: Ward,Complete y Average.
Para Ward
library(NbClust)
Datos.NbClust <- matriz_equipamiento
res.wardD2 <- NbClust(Datos.NbClust,distance= "euclidean",min.nc = 2,
max.nc = 15, method = "ward.D2", index = "alllong")## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
Para Complete
res.complete <- NbClust(Datos.NbClust,distance= "euclidean",min.nc = 2,
max.nc = 15, method = "complete", index = "alllong")## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
Para Average
res.Average<- NbClust(Datos.NbClust,distance= "euclidean",min.nc = 2,
max.nc = 15, method = "average", index = "alllong")## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 5 proposed 4 as the best number of clusters
## * 3 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 15
##
##
## *******************************************************************
Centroides resultantes del metodo jerarquico
grupo.ward <- cutree(hclust.ward,k = 2,h = NULL)
datos.grupos <- cbind(matriz_equipamiento,grupo.ward)
datos.grupos$id <- NULL
round(aggregate(matriz_equipamiento,list(grupo.ward),mean),2)## Group.1 Automovil Tv_color Video Microondas Lavavajillas Telefono
## 1 1 66.87 96.82 57.68 25.42 11.81 80.71
## 2 2 70.70 98.53 63.47 44.70 22.43 90.23
2. Realizacion de un analisis de Conglomerados No Jerarquicos.
c1 <- c(66.87,96.82,57.68,25.42,11.81,80.71)
c2 <- c(70.70,98.53,63.47,44.70,22.43,90.23)
solucion <- kmeans(matriz_equipamiento, rbind(c1,c2))
print(solucion)## K-means clustering with 2 clusters of sizes 12, 6
##
## Cluster means:
## Automovil Tv_color Video Microondas Lavavajillas Telefono
## 1 66.86667 96.81667 57.67500 25.425 11.80833 80.70833
## 2 70.70000 98.53333 63.46667 44.700 22.43333 90.23333
##
## Clustering vector:
## [1] 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 2810.6467 848.3533
## (between_SS / total_SS = 40.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Prueba T(para cada variable dependiente)
library(dplyr)
library(kableExtra)
solucion.cluster <- solucion$cluster
#Automovil
t_test_Automovil <- t.test(Automovil~solucion.cluster,data = matriz_equipamiento)
#tv_color
t_test_Tvcolor <- t.test(Tv_color~solucion.cluster,data = matriz_equipamiento)
#Video
t_test_Video <- t.test(Video~solucion.cluster,data = matriz_equipamiento)
#Microondas
t_test_Microondas <- t.test(Microondas~solucion.cluster,data = matriz_equipamiento)
#Lavavajillas
t_test_Lavavajilla <- t.test(Lavavajillas~solucion.cluster,data = matriz_equipamiento)
#Telefono
t_test_Telefono <- t.test(Telefono~solucion.cluster,data = matriz_equipamiento)
Resultados.ttest <- data.frame(
Variable = c("Automovil", "TVcolor", "Video", "Microondas", "Lavavajillas", "Telefono"),
Grupo1=c(t_test_Automovil$estimate[1],t_test_Tvcolor$estimate[1],t_test_Video$estimate[1],t_test_Microondas$estimate[1],t_test_Lavavajilla$estimate[1],t_test_Telefono$estimate[1]),
Grupo2=c(t_test_Automovil$estimate[2],t_test_Tvcolor$estimate[2],t_test_Video$estimate[2],t_test_Microondas$estimate[2],t_test_Lavavajilla$estimate[2],t_test_Telefono$estimate[2]),
# Aplica la función abs() para asegurarte de que las estadÃsticas t sean siempre positivas
Pruebas_t = c(abs(t_test_Automovil$statistic), abs(t_test_Tvcolor$statistic), abs(t_test_Video$statistic), abs(t_test_Microondas$statistic), abs(t_test_Lavavajilla$statistic), abs(t_test_Telefono$statistic)))
head(Resultados.ttest)%>% kable(caption = "Significatividad de las diferencias entre los perfiles de los conglomerados", align = "c",digits = 2) %>% kable_classic(html_font = "Times New Roman", font_size = 14) %>% kable_styling()| Variable | Grupo1 | Grupo2 | Pruebas_t |
|---|---|---|---|
| Automovil | 66.87 | 70.70 | 1.81 |
| TVcolor | 96.82 | 98.53 | 2.52 |
| Video | 57.68 | 63.47 | 1.19 |
| Microondas | 25.42 | 44.70 | 6.73 |
| Lavavajillas | 11.81 | 22.43 | 4.61 |
| Telefono | 80.71 | 90.23 | 3.51 |
Visualización de los resultados de un análisis de conglomerados
library(ggplot2)
mds_result <- cmdscale(matriz.dis.euclid, k = 2)
coordenadas_mds <- data.frame(Componente_principal_1 = mds_result[,1], Componente_principal_2 = mds_result[,2])
ciudades <- c("España", "Andalucia", "Aragon", "Asturias", "Balerares", "Canarias", "Cantabria",
"CyL", "CLM", "Cataluña", "Valenciana",
"Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "P.Vasco", "Rioja")
coordenadas_mds$Ciudad <- ciudades
coordenadas_mds$Cluster <- factor(solucion.cluster)
ggplot(coordenadas_mds, aes(x = Componente_principal_1, y = Componente_principal_2, color = Cluster)) +
geom_point(size = 3) +
geom_text(aes(label = Ciudad), vjust = -0.5, hjust = 0.5, size = 3.5) +
labs(x = "Componente_principal_1", y = "Componente_principal_2",
title = "Visualizacion MDS") +
theme_minimal() +
theme(legend.position = "bottom")