Tarea A-32 APLICACION DE CONGLOMERADOS

1. Cargando el dataframe que aparece en el libro

library(gt)
library(dplyr)
# Crear el datafram

DF <- data.frame(
  CCAA = c("EspaƱa", "Andalucia", "Aragon", "Asturias", "Baleares", "Canarias", 
           "Cantabria", "Castilla y León", "Cast.-La Mancha", "Cataluña", 
           "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", 
           "Navarra", "PaĆ­s Vasco", "La Rioja"),
  Automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 
                72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
  TV_color = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 
               98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
  Video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 
            68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
  Microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 
                 26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
  Lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.8, 11.2, 14.0, 7.1, 19.8, 
                   12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
  Telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 
               84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
)

df <- scale(DF[, -1]) %>% as.data.frame()

gt(DF) %>%
  tab_header(
    title = "Equipamiento de los hogares en distintas comunidades autónomas"
  ) %>%
  tab_spanner(
    label = "Porcentaje de hogares que poseen",
    columns = c(Automovil, TV_color, Video, Microondas, Lavavajillas, Telefono)
  ) %>%
  cols_label(
    CCAA = "CC.AA.",
    Automovil = "Automovil",
    TV_color = "TV color",
    Video = "Video",
    Microondas = "Micro-ondas",
    Lavavajillas = "Lava-vajillas",
    Telefono = "TelƩ-fono"
  ) %>%
  fmt_number(columns = 2:7, decimals = 1) %>%
  tab_source_note(
    source_note = "Fuente: Panel de Hogares de la Unión Europea"
  )
Equipamiento de los hogares en distintas comunidades autónomas
CC.AA.
Porcentaje de hogares que poseen
Automovil TV color Video Micro-ondas Lava-vajillas TelƩ-fono
EspaƱa 69.0 97.6 62.4 32.3 17.0 85.2
Andalucia 66.7 98.0 82.7 24.1 12.7 74.7
Aragon 67.2 97.5 56.8 43.4 20.6 88.4
Asturias 63.7 95.2 52.1 24.4 13.3 88.1
Baleares 71.9 98.8 62.4 29.8 10.1 87.9
Canarias 72.7 96.8 68.4 27.9 5.8 75.4
Cantabria 63.4 94.9 48.9 36.5 11.2 80.5
Castilla y León 65.8 97.1 47.7 28.1 14.0 85.0
Cast.-La Mancha 61.5 97.3 53.6 21.7 7.1 72.9
CataluƱa 70.4 98.1 71.1 36.8 19.8 92.2
Com. Valenciana 72.7 98.4 68.2 26.6 12.1 84.4
Extremadura 60.5 97.7 43.7 20.7 11.7 67.1
Galicia 65.5 91.3 42.7 13.5 14.6 85.9
Madrid 74.0 99.4 76.3 53.9 32.3 95.7
Murcia 69.0 98.7 59.3 19.5 12.1 81.4
Navarra 76.4 99.3 60.6 44.0 20.6 87.4
PaĆ­s Vasco 71.3 98.3 61.6 45.7 23.7 94.3
La Rioja 64.9 98.6 54.4 44.4 17.6 83.4
Fuente: Panel de Hogares de la Unión Europea

2. Realizando el analisis de deteccion de outliers para el dataframe.

library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Realizando la medicion de distancias usando mhalonobis, para deteccion de datos atipicos
mean <- colMeans(df) %>% print()
##     Automovil      TV_color         Video    Microondas  Lavavajillas 
##  7.864080e-17  1.224329e-15  1.353084e-16 -1.842662e-16  3.469447e-17 
##      Telefono 
## -5.389208e-16
Sx <- cov(df) %>% print()
##              Automovil  TV_color     Video Microondas Lavavajillas  Telefono
## Automovil    1.0000000 0.4844213 0.6369319  0.4806715    0.4251273 0.5687918
## TV_color     0.4844213 1.0000000 0.5681779  0.5184366    0.3176238 0.1232964
## Video        0.6369319 0.5681779 1.0000000  0.3451750    0.2892703 0.2352951
## Microondas   0.4806715 0.5184366 0.3451750  1.0000000    0.7799216 0.6167669
## Lavavajillas 0.4251273 0.3176238 0.2892703  0.7799216    1.0000000 0.7484758
## Telefono     0.5687918 0.1232964 0.2352951  0.6167669    0.7484758 1.0000000
D2 <- mahalanobis(df, mean, Sx, inverted = FALSE) %>% print()
##  [1]  0.1957042 10.5207676  1.9097471  4.4628800  5.7026801  9.5791345
##  [7]  7.2910646  2.2064805  3.5390966  2.9473889  2.6469747 10.4333455
## [13] 13.2415547  8.3056697  4.8751556  7.6517494  2.3173225  4.1732837
gl <- ncol(df) %>% print()
## [1] 6
print(D2)
##  [1]  0.1957042 10.5207676  1.9097471  4.4628800  5.7026801  9.5791345
##  [7]  7.2910646  2.2064805  3.5390966  2.9473889  2.6469747 10.4333455
## [13] 13.2415547  8.3056697  4.8751556  7.6517494  2.3173225  4.1732837
valor <-pchisq(D2, df=6, lower.tail = FALSE) %>% print
##  [1] 0.99985487 0.10436583 0.92780908 0.61429592 0.45730614 0.14353133
##  [7] 0.29476644 0.89976287 0.73876101 0.81542054 0.85166984 0.10754949
## [13] 0.03935644 0.21655352 0.55992235 0.26474304 0.88832587 0.65323937
# Identificar valores atipicos (α = 0.05)
es_outlier <- D2 > qchisq(0.95, df = gl) %>% print()
## [1] 12.59159
# Cuadro de resultados con las comunidades autonomas
resultados_completos <- data.frame(
  comunidades = DF$CCAA,
  D_cuadrado = D2,
  Valor_P = valor,
  Es_Outlier = ifelse(es_outlier, "SĆ­", "No")
) 


kable(resultados_completos, 
      caption = "AnÔlisis de Distancias de Mahalanobis por Comunidad Autónoma",
      digits = c(0, 3, 4, 0), 
      col.names = c("CC.AA.", "D²", "Valor P", "Outlier")) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(which(resultados_completos$Es_Outlier == "SĆ­"), 
           background = "#FFE6E6", bold = TRUE) %>%
  add_header_above(c(" " = 1, "Distancia de Mahalanobis" = 3))
AnÔlisis de Distancias de Mahalanobis por Comunidad Autónoma
Distancia de Mahalanobis
CC.AA. D² Valor P Outlier
EspaƱa 0.196 0.9999 No
Andalucia 10.521 0.1044 No
Aragon 1.910 0.9278 No
Asturias 4.463 0.6143 No
Baleares 5.703 0.4573 No
Canarias 9.579 0.1435 No
Cantabria 7.291 0.2948 No
Castilla y León 2.206 0.8998 No
Cast.-La Mancha 3.539 0.7388 No
CataluƱa 2.947 0.8154 No
Com. Valenciana 2.647 0.8517 No
Extremadura 10.433 0.1075 No
Galicia 13.242 0.0394 SĆ­
Madrid 8.306 0.2166 No
Murcia 4.875 0.5599 No
Navarra 7.652 0.2647 No
PaĆ­s Vasco 2.317 0.8883 No
La Rioja 4.173 0.6532 No

3. Realizando dendogramas de todos los disintos metodos de conglomeracion.

3.1 Metodo ā€œeuclideanā€

# Distancias entre las observaciones

matriz.dis.euclid.caso3 <- dist(DF[, c("Automovil", "TV_color", "Video", "Microondas", "Lavavajillas", "Telefono")], method = "euclidean", diag = TRUE)
# metodo "ward"
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "ward.D2")

# GrƔfico
ward <- plot(hclust.ward.caso3, labels = DF$CCAA)

3.2 Metodo ā€œaverageā€

# Metodo "average"
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")

# Grafica
average <- plot(hclust.average.caso3, labels = DF$CCAA)

3.3 Metodo ā€œcompleteā€

# mƩtodo "complete"
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3, method = "complete")

# GrƔfica 
complete <- plot(hclust.complete.caso3, labels = DF$CCAA)

3.4 Metodo ā€œsingleā€

# mƩtodo "single"
hclust.single.caso3 <- hclust(matriz.dis.euclid.caso3, method = "single")

# GrƔfica 
single <- plot(hclust.single.caso3, labels = DF$CCAA)

3.5 metodo ā€œcentroidā€

# mƩtodo centroid
hclust.centroid.caso3 <- hclust(matriz.dis.euclid.caso3, method = "centroid")

# GrƔfica 
plot(hclust.centroid.caso3, labels = DF$CCAA)

Se realiza el proceso para calcular el numero de cluster utilizando los metodos cuyos dedogramas no estwn distosionados y menos dudas generen; en este caso para ā€œwardā€ y ā€œcompleteā€

4. Generando los indices para los distintos metodos

4.1 usando ā€œWard.D2ā€

library(NbClust)
Datos.NbClust <- DF[, c("Automovil", "TV_color", "Video", "Microondas", "Lavavajillas", "Telefono")]

res.wardD2 <- NbClust (Datos.NbClust, distance = "euclidean" , min.nc=2, max.nc = 15, method = "ward.D2", index = "alllong" )
## Warning in pf(beale, pp, df2): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in pf(beale, pp, df2): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

4.2 usando ā€œCompleteā€

res.complete <- NbClust (Datos.NbClust, distance = "euclidean" , min.nc=2, max.nc = 15, method = "complete" , index = "alllong")
## Warning in pf(beale, pp, df2): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in pf(beale, pp, df2): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

4.3 Visualizando los graficos diviendolo em los grupos encontrados

plot(hclust.ward.caso3, labels = DF$CCAA)

# Dividir las observaciones en 2 grupos basƔndose en el dendrograma
# `cutree()` asigna cada observación a uno de los 2 grupos formados al cortar el Dendrograma
groups <- cutree(hclust.ward.caso3, k = 3)

# Dibujar rectƔngulos en el dendrograma alrededor de los dos grupos formados
# `rect.hclust()` destaca visualmente los clĆŗsteres en el dendrograma
# `k = 2` indica que se deben dibujar rectƔngulos alrededor de 2 grupos
# `border = "red"` especifica que los rectƔngulos deben tener un borde rojo
rect.hclust(hclust.ward.caso3, k = 3, border = "green")

#Realizar agrupamiento jerƔrquico utilizando el mƩtodo de enlace completo ("complete linkage")
# `matriz.dis.euclid.caso3` es la matriz de distancias euclidianas entre las observaciones
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3, method = "complete")

# Graficar el dendrograma resultante del agrupamiento jerƔrquico
# Las etiquetas (`labels`) corresponden a la columna `ce.AA.` del conjunto de datos original
plot(hclust.complete.caso3, labels = DF$CCAA)

# Dividir las observaciones en 2 grupos basƔndose en el dendrograma
# `cutree()` asigna cada observación a uno de los 2 grupos formados al cortar el dendrograma
groups <- cutree(hclust.complete.caso3, k = 3)

# Dibujar rectƔngulos en el dendrograma alrededor de los dos grupos formados
# `rect.hclust()` añade rectÔngulos para resaltar visualmente los clústeres
# `k = 2` indica que se deben formar 2 grupos
# `border = "red"` especifica que el borde de los rectƔngulos debe ser rojo
rect.hclust(hclust.complete.caso3, k = 3, border = "green")

5. obteniendo los centroides para el metodo ward.d2

grupo.ward <- cutree(hclust.ward.caso3, k = 3, h = NULL)

# Combinar
datos.caso3.grupos <- cbind(DF, grupo.ward)

datos.caso3.grupos$CCAA <- NULL
round(aggregate(datos.caso3.grupos, list(grupo.ward), mean), 2) ->datos.caso3
print(datos.caso3)
##   Group.1 Automovil TV_color Video Microondas Lavavajillas Telefono grupo.ward
## 1       1     70.33    98.05 67.23      26.70        11.63    81.50          1
## 2       2     70.70    98.53 63.47      44.70        22.43    90.23          2
## 3       3     63.40    95.58 48.12      24.15        11.98    79.92          3

Obtuvimos la media de las seies variables analizados en cada uno de los tres conglomerdos que el metodo WARD.D2 obtuvo.

6. Usando los centroides utilizando metodos no jerarquicos

6.1 obteniendo los centroides en vectores y ejecutando K-means

library(dplyr)
datos_means <- DF[,-1] 
c1 <- filter(datos.caso3, Group.1 == 1)%>% dplyr::select(Automovil,TV_color,Video, Microondas,Lavavajillas,Telefono)

c2 <- filter(datos.caso3, Group.1 == 2) %>% dplyr::select(Automovil,TV_color,Video, Microondas, Lavavajillas, Telefono)

c3 <- filter(datos.caso3, Group.1 == 3) %>% dplyr::select (Automovil, TV_color, Video, Microondas, Lavavajillas, Telefono)
km <- kmeans(datos_means, centers = rbind(c1, c2, c3)) |> print( )
## K-means clustering with 3 clusters of sizes 6, 6, 6
## 
## Cluster means:
##   Automovil TV_color    Video Microondas Lavavajillas Telefono
## 1  70.33333 98.05000 67.23333      26.70     11.63333 81.50000
## 2  70.70000 98.53333 63.46667      44.70     22.43333 90.23333
## 3  63.40000 95.58333 48.11667      24.15     11.98333 79.91667
## 
## Clustering vector:
##  [1] 1 1 2 3 1 1 3 3 3 2 1 3 3 2 1 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 698.7550 848.3533 825.6883
##  (between_SS / total_SS =  61.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

7. Visualizamos el agrupamiento utilizando la respuesta de kmeans (descomponiendo en componentes principales)

pca <- prcomp(df, scale. = TRUE)

nombres <- (DF$CCAA)
pca_data <- data.frame(
  PC1 = pca$x[, 1],
  PC2 = pca$x[, 2],
  Cluster = as.factor(km$cluster)
)

library(ggplot2)
library(ggrepel) 

ggplot(pca_data, aes(x = PC1, y = PC2, color = Cluster, label = nombres)) +
  geom_point(size = 3) +
  geom_text_repel(size = 3, segment.color =  NA) +
  labs(title = "Clusters de k-means usando PCA",
       x = "Componente Principal 1",
       y = "Componente Principal 2") +
  theme_minimal()

8. realizando Prueba ANOVA de un factor con cada cluster como factor y cada componente principal como variable dependiente.

anova_pc1 <- aov(PC1 ~ Cluster, data = pca_data)
summary(anova_pc1)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## Cluster      2  44.54  22.269   24.96 1.69e-05 ***
## Residuals   15  13.38   0.892                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_pc2 <- aov(PC2 ~ Cluster, data = pca_data)
summary(anova_pc2)
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## Cluster      2 11.465   5.733   8.914 0.00281 **
## Residuals   15  9.647   0.643                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(anova_pc1)  
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = PC1 ~ Cluster, data = pca_data)
## 
## $Cluster
##          diff        lwr        upr     p adj
## 2-1 -1.965921 -3.3822917 -0.5495504 0.0068938
## 3-1  1.886901  0.4705307  3.3032720 0.0092154
## 3-2  3.852822  2.4364517  5.2691930 0.0000109
TukeyHSD(anova_pc2)  
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = PC2 ~ Cluster, data = pca_data)
## 
## $Cluster
##          diff        lwr      upr     p adj
## 2-1 1.5852105  0.3825521 2.787869 0.0099169
## 3-1 1.7834042  0.5807458 2.986063 0.0042061
## 3-2 0.1981937 -1.0044647 1.400852 0.9045637

Interpretacion:

Comparación PC1 PC2 Significativo
2 vs 1 0.0069 0.0099 Si para ambos
3 vs 1 0.0092 0.0042 Si para ambos
3 vs 2 <0.0001 0.9046 Solo en PC1