Determinar si las comunidades autónomas se pueden segmentar en grupos homogéneos respecto al equipamiento de los hogares.
se disponen de los datos del cuadro 3.22 y el objetivo es establecer cuantos grupos de comunidades autonomias con niveles de equipamiento similares pueden establecer y en que radican las diferencias entre esos grupos.
library(knitr)
datos_CC.AA <- data.frame(
CC.AA = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias",
"Cantabria", "Castilla y León", "Castilla-La Mancha", "Cataluña",
"Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia",
"Navarra", "País Vasco", "La Rioja"),
automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4,
72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
tvcolor = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1,
98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1,
68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8,
26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.8, 11.2, 14.0, 7.1, 19.8,
12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2,
84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
)
kable(head(datos_CC.AA), caption = "Cuadro 3.22: Equipamiento de los hogares en distintas comunidades autónomas")| CC.AA | automovil | tvcolor | video | microondas | lavavajillas | telefono |
|---|---|---|---|---|---|---|
| España | 69.0 | 97.6 | 62.4 | 32.3 | 17.0 | 85.2 |
| Andalucía | 66.7 | 98.0 | 82.7 | 24.1 | 12.7 | 74.7 |
| Aragón | 67.2 | 97.5 | 56.8 | 43.4 | 20.6 | 88.4 |
| Asturias | 63.7 | 95.2 | 52.1 | 24.4 | 13.3 | 88.1 |
| Baleares | 71.9 | 98.8 | 62.4 | 29.8 | 10.1 | 87.9 |
| Canarias | 72.7 | 96.8 | 68.4 | 27.9 | 5.8 | 75.4 |
en la medida en que pueden generar importantes distorsiones en la detección del número de grupos.
variables <- datos_CC.AA[, -1]
mahal_dist <- mahalanobis(variables,
colMeans(variables),
cov(variables))
p_values <- pchisq(mahal_dist, df = ncol(variables), lower.tail = FALSE)
outliers_table <- data.frame(
CCAA = datos_CC.AA$CC.AA,
D2 = round(mahal_dist, 2),
p_value = round(p_values, 2)
)
kable(outliers_table, caption = "Detección de _Outliers_")| CCAA | D2 | p_value |
|---|---|---|
| España | 0.20 | 1.00 |
| Andalucía | 10.52 | 0.10 |
| Aragón | 1.91 | 0.93 |
| Asturias | 4.46 | 0.61 |
| Baleares | 5.70 | 0.46 |
| Canarias | 9.58 | 0.14 |
| Cantabria | 7.29 | 0.29 |
| Castilla y León | 2.21 | 0.90 |
| Castilla-La Mancha | 3.54 | 0.74 |
| Cataluña | 2.95 | 0.82 |
| Com. Valenciana | 2.65 | 0.85 |
| Extremadura | 10.43 | 0.11 |
| Galicia | 13.24 | 0.04 |
| Madrid | 8.31 | 0.22 |
| Murcia | 4.88 | 0.56 |
| Navarra | 7.65 | 0.26 |
| País Vasco | 2.32 | 0.89 |
| La Rioja | 4.17 | 0.65 |
outliers_table <- which(p_values < 0.5)
if(length(outliers_table) > 0) {
outliers_table_df <- data.frame(
"comunidad Outliers" =datos_CC.AA [outliers_table, "CC.AA"],
"Valor p" =round(p_values [outliers_table], 4 )
)
kable(outliers_table_df, caption = "Comunidades identificadas (p < 0.05)")
} else {
no_outliers_table_df <- data.frame(
Resultado = "No se detectaron outliers significativos (p < 0.05)"
)
kable(no_outliers_table_df, caption = "Resultado")
}| comunidad.Outliers | Valor.p |
|---|---|
| Andalucía | 0.1044 |
| Baleares | 0.4573 |
| Canarias | 0.1435 |
| Cantabria | 0.2948 |
| Extremadura | 0.1075 |
| Galicia | 0.0394 |
| Madrid | 0.2166 |
| Navarra | 0.2647 |
Evaluando la solucíon de distintos métodos de conglomeracion aplicando los criterios presentados para identificar el numero adecuado de grupos y obtencion de los centroides que han de servir de partida para el siguiente paso.
dist_matrix <- dist(variables, method = "euclidean")
dist_subset <- as.matrix(dist_matrix)[1:5, 1:5]
kable(round(dist_subset, 2), caption = "Matriz de distancias euclídeas (primeras 5 filas y columnas)")| 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|
| 0.00 | 24.77 | 13.45 | 14.98 | 8.43 |
| 24.77 | 0.00 | 35.97 | 33.66 | 25.56 |
| 13.45 | 35.97 | 0.00 | 21.31 | 18.72 |
| 14.98 | 33.66 | 21.31 | 0.00 | 15.02 |
| 8.43 | 25.56 | 18.72 | 15.02 | 0.00 |
par(mfrow = c(2, 3))
hc_ward <- hclust(dist_matrix, method = "ward.D2")
plot(hc_ward, main = "Método Ward",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_ward, k = 2, border = "#CD3333")
hc_single <- hclust(dist_matrix, method = "single")
plot(hc_single, main = "Vecino más cercano",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_single, k = 2, border = "#CD3333")
hc_complete <- hclust(dist_matrix, method = "complete")
plot(hc_complete, main = "Vecino más lejano",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_complete, k = 2, border = "#CD3333")
hc_average <- hclust(dist_matrix, method = "average")
plot(hc_average, main = "Vinculación promedio",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_average, k = 2, border = "#CD3333")
hc_centroid <- hclust(dist_matrix, method = "centroid")
plot(hc_centroid, main = "Centroide",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_centroid, k = 2, border = "#CD3333")
par(mfrow = c(1, 1))library(NbClust)
set.seed(123)
nbclust_result <- NbClust(variables,
distance = "euclidean",
min.nc = 2,
max.nc = 8,
method = "ward.D2",
index = "all")## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 5 proposed 2 as the best number of clusters
## * 10 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 6 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
best_nc_table <- data.frame(
Criterio = names(nbclust_result$Best.nc[1,]),
Número_Clusters = as.numeric(nbclust_result$Best.nc[1,])
)
kable(best_nc_table, caption = "Resultados")| Criterio | Número_Clusters |
|---|---|
| KL | 3 |
| CH | 3 |
| Hartigan | 3 |
| CCC | 2 |
| Scott | 7 |
| Marriot | 3 |
| TrCovW | 3 |
| TraceW | 3 |
| Friedman | 7 |
| Rubin | 3 |
| Cindex | 2 |
| DB | 7 |
| Silhouette | 7 |
| Duda | 2 |
| PseudoT2 | 2 |
| Beale | 3 |
| Ratkowsky | 3 |
| Ball | 3 |
| PtBiserial | 4 |
| Frey | 1 |
| McClain | 2 |
| Dunn | 7 |
| Hubert | 0 |
| SDindex | 7 |
| Dindex | 0 |
| SDbw | 8 |
Mediante el metodo de k-medias para la obtencion de una solucion optima en terminos de homogeneidad intrasegmentos y heterogeneidad intersegmentos.
set.seed(123)
kmeans_result <- kmeans(variables, centers = 2, nstart = 25)
kmeans_summary <- data.frame(
Métrica = c("Número de clusters", "Total within-cluster sum of squares",
"Between-cluster sum of squares", "Total sum of squares"),
Valor = c(2,
round(kmeans_result$tot.withinss, 2),
round(kmeans_result$betweenss, 2),
round(kmeans_result$totss, 2))
)
kable(kmeans_summary, caption = "resultados de K-means")| Métrica | Valor |
|---|---|
| Número de clusters | 2.00 |
| Total within-cluster sum of squares | 3659.00 |
| Between-cluster sum of squares | 2505.31 |
| Total sum of squares | 6164.31 |
datos_CC.AA$cluster_kmeans <- kmeans_result$cluster
cluster1_comunidades <- datos_CC.AA$CC.AA[datos_CC.AA$cluster_kmeans == 1]
cluster2_comunidades <- datos_CC.AA$CC.AA[datos_CC.AA$cluster_kmeans == 2]
max_length <- max(length(cluster1_comunidades), length(cluster2_comunidades))
cluster1_comunidades <- c(cluster1_comunidades, rep("", max_length - length(cluster1_comunidades)))
cluster2_comunidades <- c(cluster2_comunidades, rep("", max_length - length(cluster2_comunidades)))
cluster_assignments <- data.frame(
Cluster_1 = cluster1_comunidades,
Cluster_2 = cluster2_comunidades
)
kable(cluster_assignments, caption = "Asignación de comunidades (K-means)")| Cluster_1 | Cluster_2 |
|---|---|
| España | Aragón |
| Andalucía | Cataluña |
| Asturias | Madrid |
| Baleares | Navarra |
| Canarias | País Vasco |
| Cantabria | La Rioja |
| Castilla y León | |
| Castilla-La Mancha | |
| Com. Valenciana | |
| Extremadura | |
| Galicia | |
| Murcia |