A33 - Application: Cluster Analysis [Group Work]
| MEMBERS | STUDENT ID |
|---|---|
| Sandra Maribel Aparicio Fuentes | AF22025 |
| Nubia Linette Beltrán Hernández | BH21010 |
| Génesis Melissa Siguenza Rivas | SR22041 |
Data
library(kableExtra)
library(readxl)
# Read the dataset, declaring the first column (region name) as text and the rest as numeric
A33 <- read_excel("C:/Users/MINEDUCYT/Downloads/A33/A33.xlsx",
                  col_types = c("text", "numeric", "numeric", "numeric",
                                "numeric", "numeric", "numeric"))
A33 %>% kable(caption = "DATA", align = "c", digits = 2) %>%
  kable_classic(html_font = "Times New Roman", font_size = 14) %>%
  kable_styling()

| CC.AA. | Automovil | TVcolor | Video | Microondas | Lavavajillas | Telefono |
|---|---|---|---|---|---|---|
| España | 69.0 | 97.6 | 62.4 | 32.3 | 17.0 | 85.2 |
| Andalucía | 66.7 | 98.0 | 82.7 | 24.1 | 12.7 | 74.7 |
| Aragón | 67.2 | 97.5 | 56.8 | 43.4 | 20.6 | 88.4 |
| Asturias | 63.7 | 95.2 | 52.1 | 24.4 | 13.3 | 88.1 |
| Baleares | 71.9 | 98.8 | 62.4 | 29.8 | 10.1 | 87.9 |
| Canarias | 72.7 | 96.8 | 68.4 | 27.9 | 5.8 | 75.4 |
| Cantabria | 63.4 | 94.9 | 48.9 | 36.5 | 11.2 | 80.5 |
| Castilla y León | 65.8 | 97.1 | 47.7 | 28.1 | 14.0 | 85.0 |
| Cast.-La Mancha | 61.5 | 97.3 | 53.6 | 21.7 | 7.1 | 72.9 |
| Cataluña | 70.4 | 98.1 | 71.1 | 36.8 | 19.8 | 92.2 |
| Com. Valenciana | 72.7 | 98.4 | 68.2 | 26.6 | 12.1 | 84.4 |
| Extremadura | 60.5 | 97.7 | 43.7 | 20.7 | 11.7 | 67.1 |
| Galicia | 65.5 | 91.3 | 42.7 | 13.5 | 14.6 | 85.9 |
| Madrid | 74.0 | 99.4 | 76.3 | 53.9 | 32.3 | 95.7 |
| Murcia | 69.0 | 98.7 | 59.3 | 19.5 | 12.1 | 81.4 |
| Navarra | 76.4 | 99.3 | 60.6 | 44.0 | 20.6 | 87.4 |
| País Vasco | 71.3 | 98.3 | 61.6 | 45.7 | 23.7 | 94.3 |
| La Rioja | 64.9 | 98.6 | 54.4 | 44.4 | 17.6 | 83.4 |
Outlier detection
We analyze the presence of outliers, since they can seriously distort the detection of the number of groups. Each observation's squared Mahalanobis distance D2 to the vector of column means is compared against a chi-squared distribution with degrees of freedom equal to the number of variables (6); small p-values flag potential outliers.
# Numeric matrix (drop the region name column)
datas <- data.matrix(A33[, -1])
# Column means and covariance matrix
medias <- colMeans(datas)
Sx <- cov(datas)
# Squared Mahalanobis distance of each region to the centroid
D2 <- mahalanobis(datas, medias, Sx)
# Upper-tail chi-squared p-values, df = number of variables
p_value <- pchisq(D2, df = ncol(datas), lower.tail = FALSE)
tabla_mahalanobis <- data.frame(
CC.AA. = A33$CC.AA.,
D2 = D2,
p_value = p_value
)
tabla_mahalanobis %>% kable(caption = "D2 AND P-VALUE", align = "c", digits = 2) %>%
  kable_classic(html_font = "Times New Roman", font_size = 14) %>%
  kable_styling()

| CC.AA. | D2 | p_value |
|---|---|---|
| España | 0.20 | 1.00 |
| Andalucía | 10.52 | 0.10 |
| Aragón | 1.91 | 0.93 |
| Asturias | 4.46 | 0.61 |
| Baleares | 5.70 | 0.46 |
| Canarias | 9.58 | 0.14 |
| Cantabria | 7.29 | 0.29 |
| Castilla y León | 2.21 | 0.90 |
| Cast.-La Mancha | 3.54 | 0.74 |
| Cataluña | 2.95 | 0.82 |
| Com. Valenciana | 2.65 | 0.85 |
| Extremadura | 10.43 | 0.11 |
| Galicia | 13.24 | 0.04 |
| Madrid | 8.31 | 0.22 |
| Murcia | 4.88 | 0.56 |
| Navarra | 7.65 | 0.26 |
| País Vasco | 2.32 | 0.89 |
| La Rioja | 4.17 | 0.65 |
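As a quick filter, the regions whose p-value falls below the conventional 0.05 threshold can be listed; a minimal sketch (the 0.05 cut-off is our choice):
# List potential outliers at the 5% level
subset(tabla_mahalanobis, p_value < 0.05)
Only Galicia falls below 0.05, and only marginally, so all 18 observations are retained.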
Hierarchical cluster analysis
We perform a hierarchical cluster analysis, evaluating the solutions of several agglomeration methods, applying the stated criteria to identify the appropriate number of groups, and obtaining the centroids that will serve as the starting point for the next step.
# Euclidean distance matrix over the six equipment variables
matriz.dis.euclid.caso3 <- dist(A33[, c("Automovil", "TVcolor", "Video", "Microondas", "Lavavajillas", "Telefono")], method = "euclidean", diag = TRUE)
# Average linkage
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")
data.frame(hclust.average.caso3[2:1])
## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 9.339233 -1 3
## 5 9.387225 -16 -17
## 6 12.380548 1 5
## 7 12.448695 -9 -12
## 8 12.659790 -7 2
## 9 13.331971 -15 4
## 10 14.684330 -6 9
## 11 16.616569 -10 6
## 12 18.923865 -13 8
## 13 20.140313 7 12
## 14 22.109739 10 11
## 15 26.042489 13 14
## 16 31.452385 -2 15
## 17 39.104937 -14 16
# Ward's method
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "ward.D2")
data.frame(hclust.ward.caso3[2:1])
## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 9.387225 -16 -17
## 5 9.853764 -1 3
## 6 12.448695 -9 -12
## 7 14.238563 -7 2
## 8 15.114011 -15 5
## 9 15.686087 -10 4
## 10 16.592408 -6 8
## 11 19.584858 1 9
## 12 22.402976 -13 7
## 13 27.124565 -2 10
## 14 27.214089 6 12
## 15 30.523870 -14 11
## 16 50.718899 13 14
## 17 70.785686 15 16
# Single linkage
hclust.single.caso3 <- hclust(matriz.dis.euclid.caso3, method = "single")
data.frame(hclust.single.caso3[2:1])
## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 8.426150 -1 3
## 5 9.387225 -16 -17
## 6 9.497368 1 5
## 7 10.520932 -7 2
## 8 11.179445 -6 4
## 9 12.345039 -15 8
## 10 12.448695 -9 -12
## 11 12.449498 -10 9
## 12 12.568612 6 7
## 13 12.748333 11 12
## 14 13.884884 10 13
## 15 15.236469 -13 14
## 16 17.449069 -2 15
## 17 19.176809 -14 16
# Complete linkage
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3, method = "complete")
data.frame(hclust.complete.caso3[2:1])
## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 9.387225 -16 -17
## 5 10.252317 -1 3
## 6 12.448695 -9 -12
## 7 14.598288 -15 5
## 8 14.798649 -7 2
## 9 14.940214 -10 4
## 10 17.063704 -6 7
## 11 21.199764 1 9
## 12 21.923731 -13 6
## 13 25.010598 8 12
## 14 25.558951 -2 10
## 15 31.948239 -14 11
## 16 43.448130 13 14
## 17 59.937134 15 16
# Centroid method
hclust.centroid.caso3 <- hclust(matriz.dis.euclid.caso3, method = "centroid")
data.frame(hclust.centroid.caso3[2:1])
## height merge.1 merge.2
## 1 6.874591 -3 -18
## 2 7.153321 -4 -8
## 3 7.805767 -5 -11
## 4 7.387792 -1 3
## 5 9.387225 -16 -17
## 6 8.315094 1 5
## 7 10.389278 -15 4
## 8 10.529321 -6 7
## 9 10.871460 -7 2
## 10 10.543548 8 9
## 11 11.542734 6 10
## 12 11.284691 -10 11
## 13 12.448695 -9 -12
## 14 16.023631 12 13
## 15 18.607642 -13 14
## 16 21.157165 -2 15
## 17 28.243987 -14 16
The dendrograms show that the methods most similar to one another are Ward, complete, and average linkage.
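This visual impression can be quantified by correlating the cophenetic distance matrices of the trees; a minimal sketch we add here (the helper object coph is ours, not part of the assignment):
# Pairwise correlations between the cophenetic distances of each tree;
# high values mean two methods induce similar hierarchies
coph <- sapply(list(average  = hclust.average.caso3,
                    ward     = hclust.ward.caso3,
                    single   = hclust.single.caso3,
                    complete = hclust.complete.caso3,
                    centroid = hclust.centroid.caso3),
               function(h) as.vector(cophenetic(h)))
round(cor(coph), 2)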
Ward
library(NbClust)
Datos.NbClust <- A33[, c("Automovil", "TVcolor", "Video", "Microondas", "Lavavajillas", "Telefono")]
res.wardD2 <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15, method = "ward.D2", index = "alllong")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
Complete
res.complete <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15, method = "complete", index = "alllong")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
Average
res.average <- NbClust(Datos.NbClust, distance = "euclidean", min.nc = 2, max.nc = 15, method = "average", index = "alllong")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 5 proposed 4 as the best number of clusters
## * 3 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 15
##
##
## *******************************************************************
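The majority-rule counts above can also be extracted programmatically from each NbClust result; a small sketch, assuming the Best.nc component (whose first row holds each index's proposed number of clusters):
# Tally the votes of the indices for each linkage method
table(res.wardD2$Best.nc[1, ])
table(res.complete$Best.nc[1, ])
table(res.average$Best.nc[1, ])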
2 clusters
Centroids
The centroid is the mean of the six variables analyzed in each of the groups obtained.
library(dplyr)
library(kableExtra)
# Cut the Ward tree into 2 groups and attach the membership
grupo.ward <- cutree(hclust.ward.caso3, k = 2, h = NULL)
datos.caso3.grupos <- cbind(A33, grupo.ward)
datos.caso3.grupos$CC.AA. <- NULL
# Group means (centroids), rounded to 2 decimals
datos.caso3 <- round(aggregate(datos.caso3.grupos, list(grupo.ward), mean), 2)
head(datos.caso3) %>% kable(caption = "Groups", align = "c", digits = 2) %>%
  kable_classic(html_font = "Times New Roman", font_size = 14) %>%
  kable_styling()

| Group.1 | Automovil | TVcolor | Video | Microondas | Lavavajillas | Telefono | grupo.ward |
|---|---|---|---|---|---|---|---|
| 1 | 66.87 | 96.82 | 57.68 | 25.42 | 11.81 | 80.71 | 1 |
| 2 | 70.70 | 98.53 | 63.47 | 44.70 | 22.43 | 90.23 | 2 |
Non-hierarchical cluster analysis using the k-means method.
We perform a non-hierarchical cluster analysis with the k-means method to obtain a solution that is optimal in terms of within-segment homogeneity and between-segment heterogeneity.
The non-hierarchical analysis is estimated taking the previous centroids as the initial seeds.
library(stats)
# Initial seeds: the 2-group centroids from the hierarchical (Ward) solution
c1 <- c(66.87, 96.82, 57.68, 25.42, 11.81, 80.71)
c2 <- c(70.70, 98.53, 63.47, 44.70, 22.43, 90.23)
solucion <- kmeans(datas, rbind(c1, c2))
solucion
## K-means clustering with 2 clusters of sizes 12, 6
##
## Cluster means:
## Automovil TVcolor Video Microondas Lavavajillas Telefono
## 1 66.86667 96.81667 57.67500 25.425 11.80833 80.70833
## 2 70.70000 98.53333 63.46667 44.700 22.43333 90.23333
##
## Clustering vector:
## [1] 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 2810.6467 848.3533
## (between_SS / total_SS = 40.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
The hierarchical and non-hierarchical solutions coincide.
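This agreement can be verified with a cross-tabulation of the two membership vectors (a small check we add; it is not part of the original output):
# Rows: Ward 2-group cut; columns: k-means clusters.
# A perfect match concentrates all counts on one diagonal.
table(ward = grupo.ward, kmeans = solucion$cluster)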
Dendrograms
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "ward.D2")
plot(hclust.ward.caso3, labels = A33$CC.AA.)
# Highlight the 2-group partition on the tree
grupos <- cutree(hclust.ward.caso3, k = 2)
rect.hclust(hclust.ward.caso3, k = 2, border = "red")
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3, method = "complete")
plot(hclust.complete.caso3, labels = A33$CC.AA.)
# Highlight the 2-group partition on the tree
grupos <- cutree(hclust.complete.caso3, k = 2)
rect.hclust(hclust.complete.caso3, k = 2, border = "red")
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")
plot(hclust.average.caso3, labels = A33$CC.AA.)
# Highlight the 2-group partition on the tree
grupos <- cutree(hclust.average.caso3, k = 2)
rect.hclust(hclust.average.caso3, k = 2, border = "red")
t-test for each independent variable
To identify which variables differ significantly between the groups.
library(dplyr)
library(kableExtra)
solucion.cluster <- solucion$cluster
# t.test's formula interface requires a data frame, not a matrix
datas.df <- data.frame(datas)
t1 <- t.test(Automovil ~ solucion.cluster, data = datas.df)
t2 <- t.test(TVcolor ~ solucion.cluster, data = datas.df)
t3 <- t.test(Video ~ solucion.cluster, data = datas.df)
t4 <- t.test(Microondas ~ solucion.cluster, data = datas.df)
t5 <- t.test(Lavavajillas ~ solucion.cluster, data = datas.df)
t6 <- t.test(Telefono ~ solucion.cluster, data = datas.df)
# Collect the t statistics and group means in one table
resultados.ttest <- data.frame(
  Variable = c("Automovil", "TVcolor", "Video", "Microondas", "Lavavajillas", "Telefono"),
  t.statistic = c(t1$statistic, t2$statistic, t3$statistic, t4$statistic, t5$statistic, t6$statistic),
  Grupo1 = c(t1$estimate[1], t2$estimate[1], t3$estimate[1], t4$estimate[1], t5$estimate[1], t6$estimate[1]),
  Grupo2 = c(t1$estimate[2], t2$estimate[2], t3$estimate[2], t4$estimate[2], t5$estimate[2], t6$estimate[2])
)
head(resultados.ttest) %>% kable(caption = "Significance of the differences between cluster profiles", align = "c", digits = 2) %>%
  kable_classic(html_font = "Times New Roman", font_size = 14) %>%
  kable_styling()

| Variable | t.statistic | Grupo1 | Grupo2 |
|---|---|---|---|
| Automovil | -1.81 | 66.87 | 70.70 |
| TVcolor | -2.52 | 96.82 | 98.53 |
| Video | -1.19 | 57.68 | 63.47 |
| Microondas | -6.73 | 25.42 | 44.70 |
| Lavavajillas | -4.61 | 11.81 | 22.43 |
| Telefono | -3.51 | 80.71 | 90.23 |
For Automovil, group 1 shows 66.87% ownership versus 70.70% in group 2, but the difference is not significant (t = -1.81, p > 0.05). The remaining variables are read in the same way.
We conclude that group 2 corresponds to autonomous communities whose household equipment levels are significantly higher than those of group 1.
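To make readings such as p > 0.05 explicit, the p-values can be pulled directly from the stored test objects; a minimal sketch (the values themselves are not reproduced here):
# Welch t-test p-values, in the same order as the table above
sapply(list(t1, t2, t3, t4, t5, t6), function(tt) tt$p.value)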
Visualization
library(cluster)
library(factoextra)
mydata <- data.frame(datas, solucion$cluster)
# Rename the cluster column
colnames(mydata)[ncol(mydata)] <- "Cluster"
# Plot the clusters on the first two principal components
clusplot(mydata[, 1:(ncol(mydata) - 1)],
         mydata$Cluster,
         color = TRUE,
         shade = TRUE,
         labels = 2,
         lines = 0,
         main = "Clusters K-means",
         xlab = "Component 1",
         ylab = "Component 2")
# Improved plot with factoextra
fviz_cluster(solucion, data = datas,
palette = c("#E24229", "#29ABE2"),
geom = c("point", "text"),
ggtheme = theme_bw())
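As an additional quality check, not requested in the assignment, the silhouette of the final partition can be inspected with cluster::silhouette and factoextra::fviz_silhouette; a minimal sketch:
# Silhouette widths of the k-means partition over the Euclidean distances
sil <- silhouette(solucion$cluster, dist(datas))
fviz_silhouette(sil)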