Tarea A32: Aplicación de Análisis de Clúster (Conglomerados)
Marcela Guadalupe Hernandez Montoya HM21074
A32 - Aplicacion de Analisis de Cluster.
Indicaciones: Del texto: Joaquin, A. M., & Ezequiel, U. J. (2017). Análisis multivariante aplicado con R. 2a ed. Ediciones Paraninfo, S.A. Desarrollen el ejemplo 3.7, de la página 114, reproduce todas las salidas.
Desarrollo de tabla
library(dplyr)
library(knitr)
library(kableExtra)
data <- tibble::tribble(
~"CC.AA.", ~"Automovil", ~"TV-Color", ~"Video", ~"Microondas", ~"Lavajillas", ~"Telefono",
"Espana", 69.0, 97.6, 62.4, 32.3, 17.0, 85.2,
"Andalucia", 66.7, 98.0, 82.7, 24.1, 12.7, 74.7,
"Aragon", 67.2, 97.5, 56.8, 43.4, 20.6, 88.4,
"Asturias", 63.7, 95.2, 52.1, 24.4, 13.3, 88.1,
"Balerares", 71.9, 98.8, 62.4, 29.8, 10.1, 87.9,
"Canarias", 72.7, 96.8, 68.4, 27.9, 5.8, 75.4,
"Cantabria", 63.4, 94.9, 48.9, 36.5, 11.2, 80.5,
"Castilla y Leon", 65.8, 97.1, 47.7, 28.1, 14.0, 85.0,
"Cast.-La Mancha", 61.5, 97.3, 53.6, 21.7, 7.1, 72.9,
"Cataluna", 70.4, 98.1, 71.1, 36.8, 19.8, 92.2,
"Com.-Valenciana", 72.7, 98.4, 68.2, 26.6, 12.1, 84.4,
"Extremadura", 60.5, 97.7, 43.7, 20.7, 11.7, 67.1,
"Galicia", 65.5, 91.3, 42.7, 13.5, 14.6, 85.9,
"Madrid", 74.0, 99.4, 76.3, 53.9, 32.3, 95.7,
"Murcia", 69.0, 98.7, 59.3, 19.5, 12.1, 81.4,
"Navarra", 76.4, 99.3, 60.6, 44.0, 20.6, 87.4,
"Pais Vasco", 71.3, 98.3, 61.6, 45.7, 23.7, 94.3,
"La Rioja", 64.9, 98.6, 54.4, 44.4, 17.6, 83.4
)
data %>% kbl(caption = "Tabla de datos", format = "html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| CC.AA. | Automovil | TV-Color | Video | Microondas | Lavajillas | Telefono |
|---|---|---|---|---|---|---|
| Espana | 69.0 | 97.6 | 62.4 | 32.3 | 17.0 | 85.2 |
| Andalucia | 66.7 | 98.0 | 82.7 | 24.1 | 12.7 | 74.7 |
| Aragon | 67.2 | 97.5 | 56.8 | 43.4 | 20.6 | 88.4 |
| Asturias | 63.7 | 95.2 | 52.1 | 24.4 | 13.3 | 88.1 |
| Balerares | 71.9 | 98.8 | 62.4 | 29.8 | 10.1 | 87.9 |
| Canarias | 72.7 | 96.8 | 68.4 | 27.9 | 5.8 | 75.4 |
| Cantabria | 63.4 | 94.9 | 48.9 | 36.5 | 11.2 | 80.5 |
| Castilla y Leon | 65.8 | 97.1 | 47.7 | 28.1 | 14.0 | 85.0 |
| Cast.-La Mancha | 61.5 | 97.3 | 53.6 | 21.7 | 7.1 | 72.9 |
| Cataluna | 70.4 | 98.1 | 71.1 | 36.8 | 19.8 | 92.2 |
| Com.-Valenciana | 72.7 | 98.4 | 68.2 | 26.6 | 12.1 | 84.4 |
| Extremadura | 60.5 | 97.7 | 43.7 | 20.7 | 11.7 | 67.1 |
| Galicia | 65.5 | 91.3 | 42.7 | 13.5 | 14.6 | 85.9 |
| Madrid | 74.0 | 99.4 | 76.3 | 53.9 | 32.3 | 95.7 |
| Murcia | 69.0 | 98.7 | 59.3 | 19.5 | 12.1 | 81.4 |
| Navarra | 76.4 | 99.3 | 60.6 | 44.0 | 20.6 | 87.4 |
| Pais Vasco | 71.3 | 98.3 | 61.6 | 45.7 | 23.7 | 94.3 |
| La Rioja | 64.9 | 98.6 | 54.4 | 44.4 | 17.6 | 83.4 |
## Warning: package 'MASS' was built under R version 4.4.3
##
## Adjuntando el paquete: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Warning: package 'ggplot2' was built under R version 4.4.3
data_numeric <- data %>% select_if(is.numeric)
mahalanobis_dist <- mahalanobis(
x = data_numeric,
center = colMeans(data_numeric, na.rm = TRUE),
cov = cov(data_numeric, use = "pairwise.complete.obs")
)
p_value <- 0.001
threshold <- qchisq(1 - p_value, df = ncol(data_numeric))
data <- data %>% mutate(Mahalanobis = mahalanobis_dist,
Outlier = ifelse(Mahalanobis > threshold, "Si", "No"))
ggplot(data, aes(x = seq_along(Mahalanobis), y = Mahalanobis, color = Outlier)) +
geom_point(size = 3) +
geom_hline(yintercept = threshold, linetype = "dashed", color = "purple") +
labs(
title = "Deteccion de Outliers mediante la Distancia de Mahalanobis",
x = "Observaciones",
y = "Distancia de Mahalanobis"
) +
theme_minimal()
## Realizacion de Dendogramas.
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
dist_matrix <- dist(scale(data_numeric))
# Método Ward
hc_ward <- hclust(dist_matrix, method = "ward.D2")
fviz_dend(hc_ward, main = "Metodo Ward", cex = 0.8)## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
### Metodo Complete
hc_complete <- hclust(dist_matrix, method = "complete")
fviz_dend(hc_complete, main = "Metodo Complete", cex = 0.8)hc_average <- hclust(dist_matrix, method = "average")
fviz_dend(hc_average, main = "Metodo Average", cex = 0.8)Generacion de los Indices
library(NbClust)
Datos.NbClust <- data[,c("Automovil", "TV-Color", "Video", "Microondas", "Lavajillas", "Telefono")]
res.wardD2 <- NbClust(Datos.NbClust, distance = "euclidean", min.nc=2, max.nc=15, method = "ward.D2", index = "alllong")## Warning in pf(beale, pp, df2): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in pf(beale, pp, df2): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in pf(beale, pp, df2): Se han producido NaNs
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
Centroides
## Warning: package 'cluster' was built under R version 4.4.3
hc_ward <- hclust(dist_matrix, method = "ward.D2")
grupo.ward <- cutree(hc_ward, k = 2, h = NULL)
datos.caso3.grupos <- cbind(Datos.NbClust, grupo.ward)
datos.caso3.grupos$id <- NULL
round(aggregate(datos.caso3.grupos, list(grupo.ward), mean ), 2)## Group.1 Automovil TV-Color Video Microondas Lavajillas Telefono grupo.ward
## 1 1 70.52 98.29 65.35 35.70 17.03 85.87 1
## 2 2 63.40 95.58 48.12 24.15 11.98 79.92 2
datos.caso3.grupos.kmeans <- data_numeric
c1 <- c(66.87, 96.82, 56.01, 25.43, 11.81, 80.71)
c2 <- c(70.70, 98.53, 63.47, 44.70, 22.43, 90.23)
solucion <- kmeans(datos.caso3.grupos.kmeans, rbind(c1, c2))
# Centroides finales
centroides_fin <- solucion$centers
# Visualización de los resultados
kable(centroides_fin, caption = "Centroides Finales de los Clusters")| Automovil | TV-Color | Video | Microondas | Lavajillas | Telefono |
|---|---|---|---|---|---|
| 66.86667 | 96.81667 | 57.67500 | 25.425 | 11.80833 | 80.70833 |
| 70.70000 | 98.53333 | 63.46667 | 44.700 | 22.43333 | 90.23333 |
Metodo Ward
hc_ward_2 <- hclust(dist_matrix, method = "ward.D2")
fviz_dend(hc_ward_2, k = 2, cex = 0.7,
main = "Dendrograma - Método Ward",
rect = TRUE, rect_fill = FALSE,
rect_border = "purple")
### Metodo de averege
hc_average_2 <- hclust(dist_matrix, method = "average")
fviz_dend(hc_average_2, k = 2, cex = 0.7,
main = "Dendrograma - Método Average",
rect = TRUE, rect_fill = FALSE,
rect_border = "purple")hc_complete_2 <- hclust(dist_matrix, method = "complete")
fviz_dend(hc_complete_2, k = 2, cex = 0.7,
main = "Dendrograma - Método Complete",
rect = TRUE, rect_fill = FALSE,
rect_border = "purple")data_cluster <- as.factor(solucion$cluster)
resultados_t <- data.frame(
Variable = character(),
Grupo_1 = numeric(),
Grupo_2 = numeric(),
Prueba_t = numeric(),
stringsAsFactors = FALSE
)
for (variable in colnames(datos.caso3.grupos.kmeans)) {
grupo1 <- datos.caso3.grupos.kmeans[data_cluster == 1, variable, drop = TRUE]
grupo2 <- datos.caso3.grupos.kmeans[data_cluster == 2, variable, drop = TRUE]
prueba_t <- t.test(grupo1, grupo2)
resultados_t <- rbind(resultados_t, data.frame(
Variable = variable,
Grupo_1_Media = mean(grupo1, na.rm = TRUE),
Grupo_2_Media = mean(grupo2, na.rm = TRUE),
Prueba_t = prueba_t$p.value
))
}
kable(resultados_t, caption = "Resultados de la Prueba t para Cada Variable")| Variable | Grupo_1_Media | Grupo_2_Media | Prueba_t |
|---|---|---|---|
| Automovil | 66.86667 | 70.70000 | 0.1000345 |
| TV-Color | 96.81667 | 98.53333 | 0.0238242 |
| Video | 57.67500 | 63.46667 | 0.2548580 |
| Microondas | 25.42500 | 44.70000 | 0.0000283 |
| Lavajillas | 11.80833 | 22.43333 | 0.0026595 |
| Telefono | 80.70833 | 90.23333 | 0.0034967 |