Tarea A-32 APLICACION DE CONGLOMERADOS
1. Cargando el dataframe que aparece en el libro
library(gt)
library(dplyr)
# Crear el datafram
DF <- data.frame(
CCAA = c("EspaƱa", "Andalucia", "Aragon", "Asturias", "Baleares", "Canarias",
"Cantabria", "Castilla y León", "Cast.-La Mancha", "Cataluña",
"Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia",
"Navarra", "PaĆs Vasco", "La Rioja"),
Automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4,
72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
TV_color = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1,
98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
Video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1,
68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
Microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8,
26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
Lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.8, 11.2, 14.0, 7.1, 19.8,
12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
Telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2,
84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
)
df <- scale(DF[, -1]) %>% as.data.frame()
gt(DF) %>%
tab_header(
title = "Equipamiento de los hogares en distintas comunidades autónomas"
) %>%
tab_spanner(
label = "Porcentaje de hogares que poseen",
columns = c(Automovil, TV_color, Video, Microondas, Lavavajillas, Telefono)
) %>%
cols_label(
CCAA = "CC.AA.",
Automovil = "Automovil",
TV_color = "TV color",
Video = "Video",
Microondas = "Micro-ondas",
Lavavajillas = "Lava-vajillas",
Telefono = "TelƩ-fono"
) %>%
fmt_number(columns = 2:7, decimals = 1) %>%
tab_source_note(
source_note = "Fuente: Panel de Hogares de la Unión Europea"
)| Equipamiento de los hogares en distintas comunidades autónomas | ||||||
| CC.AA. |
Porcentaje de hogares que poseen
|
|||||
|---|---|---|---|---|---|---|
| Automovil | TV color | Video | Micro-ondas | Lava-vajillas | TelƩ-fono | |
| EspaƱa | 69.0 | 97.6 | 62.4 | 32.3 | 17.0 | 85.2 |
| Andalucia | 66.7 | 98.0 | 82.7 | 24.1 | 12.7 | 74.7 |
| Aragon | 67.2 | 97.5 | 56.8 | 43.4 | 20.6 | 88.4 |
| Asturias | 63.7 | 95.2 | 52.1 | 24.4 | 13.3 | 88.1 |
| Baleares | 71.9 | 98.8 | 62.4 | 29.8 | 10.1 | 87.9 |
| Canarias | 72.7 | 96.8 | 68.4 | 27.9 | 5.8 | 75.4 |
| Cantabria | 63.4 | 94.9 | 48.9 | 36.5 | 11.2 | 80.5 |
| Castilla y León | 65.8 | 97.1 | 47.7 | 28.1 | 14.0 | 85.0 |
| Cast.-La Mancha | 61.5 | 97.3 | 53.6 | 21.7 | 7.1 | 72.9 |
| CataluƱa | 70.4 | 98.1 | 71.1 | 36.8 | 19.8 | 92.2 |
| Com. Valenciana | 72.7 | 98.4 | 68.2 | 26.6 | 12.1 | 84.4 |
| Extremadura | 60.5 | 97.7 | 43.7 | 20.7 | 11.7 | 67.1 |
| Galicia | 65.5 | 91.3 | 42.7 | 13.5 | 14.6 | 85.9 |
| Madrid | 74.0 | 99.4 | 76.3 | 53.9 | 32.3 | 95.7 |
| Murcia | 69.0 | 98.7 | 59.3 | 19.5 | 12.1 | 81.4 |
| Navarra | 76.4 | 99.3 | 60.6 | 44.0 | 20.6 | 87.4 |
| PaĆs Vasco | 71.3 | 98.3 | 61.6 | 45.7 | 23.7 | 94.3 |
| La Rioja | 64.9 | 98.6 | 54.4 | 44.4 | 17.6 | 83.4 |
| Fuente: Panel de Hogares de la Unión Europea | ||||||
2. Realizando el analisis de deteccion de outliers para el dataframe.
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Realizando la medicion de distancias usando mhalonobis, para deteccion de datos atipicos
mean <- colMeans(df) %>% print()## Automovil TV_color Video Microondas Lavavajillas
## 7.864080e-17 1.224329e-15 1.353084e-16 -1.842662e-16 3.469447e-17
## Telefono
## -5.389208e-16
## Automovil TV_color Video Microondas Lavavajillas Telefono
## Automovil 1.0000000 0.4844213 0.6369319 0.4806715 0.4251273 0.5687918
## TV_color 0.4844213 1.0000000 0.5681779 0.5184366 0.3176238 0.1232964
## Video 0.6369319 0.5681779 1.0000000 0.3451750 0.2892703 0.2352951
## Microondas 0.4806715 0.5184366 0.3451750 1.0000000 0.7799216 0.6167669
## Lavavajillas 0.4251273 0.3176238 0.2892703 0.7799216 1.0000000 0.7484758
## Telefono 0.5687918 0.1232964 0.2352951 0.6167669 0.7484758 1.0000000
## [1] 0.1957042 10.5207676 1.9097471 4.4628800 5.7026801 9.5791345
## [7] 7.2910646 2.2064805 3.5390966 2.9473889 2.6469747 10.4333455
## [13] 13.2415547 8.3056697 4.8751556 7.6517494 2.3173225 4.1732837
## [1] 6
## [1] 0.1957042 10.5207676 1.9097471 4.4628800 5.7026801 9.5791345
## [7] 7.2910646 2.2064805 3.5390966 2.9473889 2.6469747 10.4333455
## [13] 13.2415547 8.3056697 4.8751556 7.6517494 2.3173225 4.1732837
## [1] 0.99985487 0.10436583 0.92780908 0.61429592 0.45730614 0.14353133
## [7] 0.29476644 0.89976287 0.73876101 0.81542054 0.85166984 0.10754949
## [13] 0.03935644 0.21655352 0.55992235 0.26474304 0.88832587 0.65323937
## [1] 12.59159
# Cuadro de resultados con las comunidades autonomas
resultados_completos <- data.frame(
comunidades = DF$CCAA,
D_cuadrado = D2,
Valor_P = valor,
Es_Outlier = ifelse(es_outlier, "SĆ", "No")
)
kable(resultados_completos,
caption = "AnÔlisis de Distancias de Mahalanobis por Comunidad Autónoma",
digits = c(0, 3, 4, 0),
col.names = c("CC.AA.", "D²", "Valor P", "Outlier")) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
row_spec(which(resultados_completos$Es_Outlier == "SĆ"),
background = "#FFE6E6", bold = TRUE) %>%
add_header_above(c(" " = 1, "Distancia de Mahalanobis" = 3))| CC.AA. | D² | Valor P | Outlier |
|---|---|---|---|
| EspaƱa | 0.196 | 0.9999 | No |
| Andalucia | 10.521 | 0.1044 | No |
| Aragon | 1.910 | 0.9278 | No |
| Asturias | 4.463 | 0.6143 | No |
| Baleares | 5.703 | 0.4573 | No |
| Canarias | 9.579 | 0.1435 | No |
| Cantabria | 7.291 | 0.2948 | No |
| Castilla y León | 2.206 | 0.8998 | No |
| Cast.-La Mancha | 3.539 | 0.7388 | No |
| CataluƱa | 2.947 | 0.8154 | No |
| Com. Valenciana | 2.647 | 0.8517 | No |
| Extremadura | 10.433 | 0.1075 | No |
| Galicia | 13.242 | 0.0394 | SĆ |
| Madrid | 8.306 | 0.2166 | No |
| Murcia | 4.875 | 0.5599 | No |
| Navarra | 7.652 | 0.2647 | No |
| PaĆs Vasco | 2.317 | 0.8883 | No |
| La Rioja | 4.173 | 0.6532 | No |
3. Realizando dendogramas de todos los disintos metodos de conglomeracion.
3.1 Metodo āeuclideanā
# Distancias entre las observaciones
matriz.dis.euclid.caso3 <- dist(DF[, c("Automovil", "TV_color", "Video", "Microondas", "Lavavajillas", "Telefono")], method = "euclidean", diag = TRUE)
# metodo "ward"
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3, method = "ward.D2")
# GrƔfico
ward <- plot(hclust.ward.caso3, labels = DF$CCAA)3.2 Metodo āaverageā
# Metodo "average"
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3, method = "average")
# Grafica
average <- plot(hclust.average.caso3, labels = DF$CCAA)3.3 Metodo ācompleteā
# mƩtodo "complete"
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3, method = "complete")
# GrƔfica
complete <- plot(hclust.complete.caso3, labels = DF$CCAA)3.4 Metodo āsingleā
# mƩtodo "single"
hclust.single.caso3 <- hclust(matriz.dis.euclid.caso3, method = "single")
# GrƔfica
single <- plot(hclust.single.caso3, labels = DF$CCAA)3.5 metodo ācentroidā
# mƩtodo centroid
hclust.centroid.caso3 <- hclust(matriz.dis.euclid.caso3, method = "centroid")
# GrƔfica
plot(hclust.centroid.caso3, labels = DF$CCAA)
Se realiza el proceso para calcular el numero de cluster utilizando los
metodos cuyos dedogramas no estwn distosionados y menos dudas generen;
en este caso para āwardā y ācompleteā
4. Generando los indices para los distintos metodos
4.1 usando āWard.D2ā
library(NbClust)
Datos.NbClust <- DF[, c("Automovil", "TV_color", "Video", "Microondas", "Lavavajillas", "Telefono")]
res.wardD2 <- NbClust (Datos.NbClust, distance = "euclidean" , min.nc=2, max.nc = 15, method = "ward.D2", index = "alllong" )## Warning in pf(beale, pp, df2): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in pf(beale, pp, df2): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in pf(beale, pp, df2): NaNs produced
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
4.2 usando āCompleteā
res.complete <- NbClust (Datos.NbClust, distance = "euclidean" , min.nc=2, max.nc = 15, method = "complete" , index = "alllong")## Warning in pf(beale, pp, df2): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in pf(beale, pp, df2): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in pf(beale, pp, df2): NaNs produced
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
4.3 Visualizando los graficos diviendolo em los grupos encontrados
plot(hclust.ward.caso3, labels = DF$CCAA)
# Dividir las observaciones en 2 grupos basƔndose en el dendrograma
# `cutree()` asigna cada observación a uno de los 2 grupos formados al cortar el Dendrograma
groups <- cutree(hclust.ward.caso3, k = 3)
# Dibujar rectƔngulos en el dendrograma alrededor de los dos grupos formados
# `rect.hclust()` destaca visualmente los clĆŗsteres en el dendrograma
# `k = 2` indica que se deben dibujar rectƔngulos alrededor de 2 grupos
# `border = "red"` especifica que los rectƔngulos deben tener un borde rojo
rect.hclust(hclust.ward.caso3, k = 3, border = "green")#Realizar agrupamiento jerƔrquico utilizando el mƩtodo de enlace completo ("complete linkage")
# `matriz.dis.euclid.caso3` es la matriz de distancias euclidianas entre las observaciones
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3, method = "complete")
# Graficar el dendrograma resultante del agrupamiento jerƔrquico
# Las etiquetas (`labels`) corresponden a la columna `ce.AA.` del conjunto de datos original
plot(hclust.complete.caso3, labels = DF$CCAA)
# Dividir las observaciones en 2 grupos basƔndose en el dendrograma
# `cutree()` asigna cada observación a uno de los 2 grupos formados al cortar el dendrograma
groups <- cutree(hclust.complete.caso3, k = 3)
# Dibujar rectƔngulos en el dendrograma alrededor de los dos grupos formados
# `rect.hclust()` añade rectÔngulos para resaltar visualmente los clústeres
# `k = 2` indica que se deben formar 2 grupos
# `border = "red"` especifica que el borde de los rectƔngulos debe ser rojo
rect.hclust(hclust.complete.caso3, k = 3, border = "green")5. obteniendo los centroides para el metodo ward.d2
grupo.ward <- cutree(hclust.ward.caso3, k = 3, h = NULL)
# Combinar
datos.caso3.grupos <- cbind(DF, grupo.ward)
datos.caso3.grupos$CCAA <- NULL
round(aggregate(datos.caso3.grupos, list(grupo.ward), mean), 2) ->datos.caso3
print(datos.caso3)## Group.1 Automovil TV_color Video Microondas Lavavajillas Telefono grupo.ward
## 1 1 70.33 98.05 67.23 26.70 11.63 81.50 1
## 2 2 70.70 98.53 63.47 44.70 22.43 90.23 2
## 3 3 63.40 95.58 48.12 24.15 11.98 79.92 3
Obtuvimos la media de las seies variables analizados en cada uno de los tres conglomerdos que el metodo WARD.D2 obtuvo.
6. Usando los centroides utilizando metodos no jerarquicos
6.1 obteniendo los centroides en vectores y ejecutando K-means
library(dplyr)
datos_means <- DF[,-1]
c1 <- filter(datos.caso3, Group.1 == 1)%>% dplyr::select(Automovil,TV_color,Video, Microondas,Lavavajillas,Telefono)
c2 <- filter(datos.caso3, Group.1 == 2) %>% dplyr::select(Automovil,TV_color,Video, Microondas, Lavavajillas, Telefono)
c3 <- filter(datos.caso3, Group.1 == 3) %>% dplyr::select (Automovil, TV_color, Video, Microondas, Lavavajillas, Telefono)
km <- kmeans(datos_means, centers = rbind(c1, c2, c3)) |> print( )## K-means clustering with 3 clusters of sizes 6, 6, 6
##
## Cluster means:
## Automovil TV_color Video Microondas Lavavajillas Telefono
## 1 70.33333 98.05000 67.23333 26.70 11.63333 81.50000
## 2 70.70000 98.53333 63.46667 44.70 22.43333 90.23333
## 3 63.40000 95.58333 48.11667 24.15 11.98333 79.91667
##
## Clustering vector:
## [1] 1 1 2 3 1 1 3 3 3 2 1 3 3 2 1 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 698.7550 848.3533 825.6883
## (between_SS / total_SS = 61.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
7. Visualizamos el agrupamiento utilizando la respuesta de kmeans (descomponiendo en componentes principales)
pca <- prcomp(df, scale. = TRUE)
nombres <- (DF$CCAA)
pca_data <- data.frame(
PC1 = pca$x[, 1],
PC2 = pca$x[, 2],
Cluster = as.factor(km$cluster)
)
library(ggplot2)
library(ggrepel)
ggplot(pca_data, aes(x = PC1, y = PC2, color = Cluster, label = nombres)) +
geom_point(size = 3) +
geom_text_repel(size = 3, segment.color = NA) +
labs(title = "Clusters de k-means usando PCA",
x = "Componente Principal 1",
y = "Componente Principal 2") +
theme_minimal()8. realizando Prueba ANOVA de un factor con cada cluster como factor y cada componente principal como variable dependiente.
## Df Sum Sq Mean Sq F value Pr(>F)
## Cluster 2 44.54 22.269 24.96 1.69e-05 ***
## Residuals 15 13.38 0.892
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Df Sum Sq Mean Sq F value Pr(>F)
## Cluster 2 11.465 5.733 8.914 0.00281 **
## Residuals 15 9.647 0.643
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = PC1 ~ Cluster, data = pca_data)
##
## $Cluster
## diff lwr upr p adj
## 2-1 -1.965921 -3.3822917 -0.5495504 0.0068938
## 3-1 1.886901 0.4705307 3.3032720 0.0092154
## 3-2 3.852822 2.4364517 5.2691930 0.0000109
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = PC2 ~ Cluster, data = pca_data)
##
## $Cluster
## diff lwr upr p adj
## 2-1 1.5852105 0.3825521 2.787869 0.0099169
## 3-1 1.7834042 0.5807458 2.986063 0.0042061
## 3-2 0.1981937 -1.0044647 1.400852 0.9045637
Interpretacion:
| Comparación | PC1 | PC2 | Significativo |
|---|---|---|---|
| 2 vs 1 | 0.0069 | 0.0099 | Si para ambos |
| 3 vs 1 | 0.0092 | 0.0042 | Si para ambos |
| 3 vs 2 | <0.0001 | 0.9046 | Solo en PC1 |