Tarea A32: Aplicación de Análisis de Clúster (Conglomerados)

Marcela Guadalupe Hernandez Montoya HM21074

A32 - Aplicacion de Analisis de Cluster.

Indicaciones: Del texto: Joaquin, A. M., & Ezequiel, U. J. (2017). Análisis multivariante aplicado con R. 2a ed. Ediciones Paraninfo, S.A. Desarrollen el ejemplo 3.7, de la página 114, reproduce todas las salidas.

Desarrollo de tabla

library(dplyr)
library(knitr)
library(kableExtra)

data <- tibble::tribble(
  ~"CC.AA.", ~"Automovil", ~"TV-Color", ~"Video", ~"Microondas", ~"Lavajillas", ~"Telefono",
  "Espana", 69.0, 97.6, 62.4, 32.3, 17.0, 85.2,
  "Andalucia", 66.7, 98.0, 82.7, 24.1, 12.7, 74.7,
  "Aragon", 67.2, 97.5, 56.8, 43.4, 20.6, 88.4,
  "Asturias", 63.7, 95.2, 52.1, 24.4, 13.3, 88.1,
  "Balerares", 71.9, 98.8, 62.4, 29.8, 10.1, 87.9,
  "Canarias", 72.7, 96.8, 68.4, 27.9, 5.8, 75.4,
  "Cantabria", 63.4, 94.9, 48.9, 36.5, 11.2, 80.5,
  "Castilla y Leon", 65.8, 97.1, 47.7, 28.1, 14.0, 85.0,
  "Cast.-La Mancha", 61.5, 97.3, 53.6, 21.7, 7.1, 72.9,
  "Cataluna", 70.4, 98.1, 71.1, 36.8, 19.8, 92.2,
  "Com.-Valenciana", 72.7, 98.4, 68.2, 26.6, 12.1, 84.4,
  "Extremadura", 60.5, 97.7, 43.7, 20.7, 11.7, 67.1,
  "Galicia", 65.5, 91.3, 42.7, 13.5, 14.6, 85.9,
  "Madrid", 74.0, 99.4, 76.3, 53.9, 32.3, 95.7,
  "Murcia", 69.0, 98.7, 59.3, 19.5, 12.1, 81.4,
  "Navarra", 76.4, 99.3, 60.6, 44.0, 20.6, 87.4,
  "Pais Vasco", 71.3, 98.3, 61.6, 45.7, 23.7, 94.3,
  "La Rioja", 64.9, 98.6, 54.4, 44.4, 17.6, 83.4
)

data %>% kbl(caption = "Tabla de datos", format = "html")  %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Tabla de datos
CC.AA.	Automovil	TV-Color	Video	Microondas	Lavajillas	Telefono
Espana	69.0	97.6	62.4	32.3	17.0	85.2
Andalucia	66.7	98.0	82.7	24.1	12.7	74.7
Aragon	67.2	97.5	56.8	43.4	20.6	88.4
Asturias	63.7	95.2	52.1	24.4	13.3	88.1
Balerares	71.9	98.8	62.4	29.8	10.1	87.9
Canarias	72.7	96.8	68.4	27.9	5.8	75.4
Cantabria	63.4	94.9	48.9	36.5	11.2	80.5
Castilla y Leon	65.8	97.1	47.7	28.1	14.0	85.0
Cast.-La Mancha	61.5	97.3	53.6	21.7	7.1	72.9
Cataluna	70.4	98.1	71.1	36.8	19.8	92.2
Com.-Valenciana	72.7	98.4	68.2	26.6	12.1	84.4
Extremadura	60.5	97.7	43.7	20.7	11.7	67.1
Galicia	65.5	91.3	42.7	13.5	14.6	85.9
Madrid	74.0	99.4	76.3	53.9	32.3	95.7
Murcia	69.0	98.7	59.3	19.5	12.1	81.4
Navarra	76.4	99.3	60.6	44.0	20.6	87.4
Pais Vasco	71.3	98.3	61.6	45.7	23.7	94.3
La Rioja	64.9	98.6	54.4	44.4	17.6	83.4

library(MASS)

## Warning: package 'MASS' was built under R version 4.4.3

## 
## Adjuntando el paquete: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

data_numeric <- data %>% select_if(is.numeric)

mahalanobis_dist <- mahalanobis(
  x = data_numeric,
  center = colMeans(data_numeric, na.rm = TRUE),
  cov = cov(data_numeric, use = "pairwise.complete.obs")
)
p_value <- 0.001
threshold <- qchisq(1 - p_value, df = ncol(data_numeric))

data <- data %>% mutate(Mahalanobis = mahalanobis_dist,
                        Outlier = ifelse(Mahalanobis > threshold, "Si", "No"))

ggplot(data, aes(x = seq_along(Mahalanobis), y = Mahalanobis, color = Outlier)) +
  geom_point(size = 3) +
  geom_hline(yintercept = threshold, linetype = "dashed", color = "purple") +
  labs(
    title = "Deteccion de Outliers mediante la Distancia de Mahalanobis",
    x = "Observaciones",
    y = "Distancia de Mahalanobis"
  ) +
  theme_minimal()

## Realizacion de Dendogramas.

library(factoextra)

## Warning: package 'factoextra' was built under R version 4.4.3

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

dist_matrix <- dist(scale(data_numeric))

# Método Ward
hc_ward <- hclust(dist_matrix, method = "ward.D2")
fviz_dend(hc_ward, main = "Metodo Ward", cex = 0.8)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

### Metodo Complete

hc_complete <- hclust(dist_matrix, method = "complete")
fviz_dend(hc_complete, main = "Metodo Complete", cex = 0.8)

hc_average <- hclust(dist_matrix, method = "average")
fviz_dend(hc_average, main = "Metodo Average", cex = 0.8)

Generacion de los Indices

library(NbClust)
Datos.NbClust <- data[,c("Automovil", "TV-Color", "Video", "Microondas", "Lavajillas", "Telefono")]
res.wardD2 <- NbClust(Datos.NbClust, distance = "euclidean", min.nc=2, max.nc=15, method = "ward.D2", index = "alllong")

## Warning in pf(beale, pp, df2): Se han producido NaNs

## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs

## Warning in pf(beale, pp, df2): Se han producido NaNs

## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs

## Warning in pf(beale, pp, df2): Se han producido NaNs

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Centroides

library(cluster)

## Warning: package 'cluster' was built under R version 4.4.3

hc_ward <- hclust(dist_matrix, method = "ward.D2")
grupo.ward <- cutree(hc_ward, k = 2, h = NULL)
datos.caso3.grupos <- cbind(Datos.NbClust, grupo.ward)
datos.caso3.grupos$id <- NULL

round(aggregate(datos.caso3.grupos, list(grupo.ward), mean ), 2)

##   Group.1 Automovil TV-Color Video Microondas Lavajillas Telefono grupo.ward
## 1       1     70.52    98.29 65.35      35.70      17.03    85.87          1
## 2       2     63.40    95.58 48.12      24.15      11.98    79.92          2

datos.caso3.grupos.kmeans <- data_numeric
c1 <- c(66.87, 96.82, 56.01, 25.43, 11.81, 80.71)
c2 <- c(70.70, 98.53, 63.47, 44.70, 22.43, 90.23)
solucion <- kmeans(datos.caso3.grupos.kmeans, rbind(c1, c2))
# Centroides finales
centroides_fin <- solucion$centers
# Visualización de los resultados
kable(centroides_fin, caption = "Centroides Finales de los Clusters")

Centroides Finales de los Clusters
Automovil	TV-Color	Video	Microondas	Lavajillas	Telefono
66.86667	96.81667	57.67500	25.425	11.80833	80.70833
70.70000	98.53333	63.46667	44.700	22.43333	90.23333

Metodo Ward

hc_ward_2 <- hclust(dist_matrix, method = "ward.D2")
fviz_dend(hc_ward_2, k = 2, cex = 0.7, 
          main = "Dendrograma - Método Ward", 
          rect = TRUE, rect_fill = FALSE, 
          rect_border = "purple")

### Metodo de averege

hc_average_2 <- hclust(dist_matrix, method = "average")
fviz_dend(hc_average_2, k = 2, cex = 0.7, 
          main = "Dendrograma - Método Average", 
          rect = TRUE, rect_fill = FALSE, 
          rect_border = "purple")

hc_complete_2 <- hclust(dist_matrix, method = "complete")
fviz_dend(hc_complete_2, k = 2, cex = 0.7, 
          main = "Dendrograma - Método Complete", 
          rect = TRUE, rect_fill = FALSE, 
          rect_border = "purple")

data_cluster <- as.factor(solucion$cluster)

resultados_t <- data.frame(
  Variable = character(),
  Grupo_1 = numeric(),
  Grupo_2 = numeric(),
  Prueba_t = numeric(),
  stringsAsFactors = FALSE
)

for (variable in colnames(datos.caso3.grupos.kmeans)) {
  grupo1 <- datos.caso3.grupos.kmeans[data_cluster == 1, variable, drop = TRUE]
  grupo2 <- datos.caso3.grupos.kmeans[data_cluster == 2, variable, drop = TRUE]
  prueba_t <- t.test(grupo1, grupo2)
  
  resultados_t <- rbind(resultados_t, data.frame(
    Variable = variable,
    Grupo_1_Media = mean(grupo1, na.rm = TRUE),
    Grupo_2_Media = mean(grupo2, na.rm = TRUE),
    Prueba_t = prueba_t$p.value
  ))
}

kable(resultados_t, caption = "Resultados de la Prueba t para Cada Variable")

Resultados de la Prueba t para Cada Variable
Variable	Grupo_1_Media	Grupo_2_Media	Prueba_t
Automovil	66.86667	70.70000	0.1000345
TV-Color	96.81667	98.53333	0.0238242
Video	57.67500	63.46667	0.2548580
Microondas	25.42500	44.70000	0.0000283
Lavajillas	11.80833	22.43333	0.0026595
Telefono	80.70833	90.23333	0.0034967