library(knitr)
df_ccaa <- data.frame(
CCAA = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias",
"Cantabria", "Castilla y León", "Castilla-La Mancha", "Cataluña",
"Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia",
"Navarra", "País Vasco", "La Rioja"),
automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4,
72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
tvcolor = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1,
98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1,
68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8,
26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.8, 11.2, 14.0, 7.1, 19.8,
12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2,
84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
)
kable(head(df_ccaa), caption = "Primeras 6 filas de los datos de equipamiento por CCAA")
2. Análisis de Conglomerados Jerárquico
Matriz de Distancias
# Calcular matriz de distancias euclídeas
dist_matrix <- dist(variables, method = "euclidean")
dist_subset <- as.matrix(dist_matrix)[1:5, 1:5]
kable(round(dist_subset, 2), caption = "Matriz de distancias euclídeas (primeras 5 filas y columnas)")
Matriz de distancias euclídeas (primeras 5 filas y
columnas)
| 0.00 |
24.77 |
13.45 |
14.98 |
8.43 |
| 24.77 |
0.00 |
35.97 |
33.66 |
25.56 |
| 13.45 |
35.97 |
0.00 |
21.31 |
18.72 |
| 14.98 |
33.66 |
21.31 |
0.00 |
15.02 |
| 8.43 |
25.56 |
18.72 |
15.02 |
0.00 |
Dondogramas con diferentes Metodos
par(mfrow = c(2, 3))
hc_ward <- hclust(dist_matrix, method = "ward.D2")
plot(hc_ward, main = "Método Ward",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_ward, k = 2, border = "red")
hc_single <- hclust(dist_matrix, method = "single")
plot(hc_single, main = "Vecino más cercano",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_single, k = 2, border = "red")
hc_complete <- hclust(dist_matrix, method = "complete")
plot(hc_complete, main = "Vecino más lejano",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_complete, k = 2, border = "red")
hc_average <- hclust(dist_matrix, method = "average")
plot(hc_average, main = "Vinculación promedio",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_average, k = 2, border = "red")
hc_centroid <- hclust(dist_matrix, method = "centroid")
plot(hc_centroid, main = "Centroide",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_centroid, k = 2, border = "red")
par(mfrow = c(1, 1))

Determinacion del Numero Optimo de Conglmerados
library(NbClust)
set.seed(123)
nbclust_result <- NbClust(variables,
distance = "euclidean",
min.nc = 2,
max.nc = 8,
method = "ward.D2",
index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 5 proposed 2 as the best number of clusters
## * 10 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 6 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
best_nc_table <- data.frame(
Criterio = names(nbclust_result$Best.nc[1,]),
Número_Clusters = as.numeric(nbclust_result$Best.nc[1,])
)
kable(best_nc_table, caption = "Resultados de NbClust - Número óptimo de clusters por criterio")
Resultados de NbClust - Número óptimo de clusters por
criterio
| KL |
3 |
| CH |
3 |
| Hartigan |
3 |
| CCC |
2 |
| Scott |
7 |
| Marriot |
3 |
| TrCovW |
3 |
| TraceW |
3 |
| Friedman |
7 |
| Rubin |
3 |
| Cindex |
2 |
| DB |
7 |
| Silhouette |
7 |
| Duda |
2 |
| PseudoT2 |
2 |
| Beale |
3 |
| Ratkowsky |
3 |
| Ball |
3 |
| PtBiserial |
4 |
| Frey |
1 |
| McClain |
2 |
| Dunn |
7 |
| Hubert |
0 |
| SDindex |
7 |
| Dindex |
0 |
| SDbw |
8 |
freq_table <- table(nbclust_result$Best.nc[1,])
freq_df <- data.frame(
Número_Clusters = names(freq_table),
Frecuencia = as.numeric(freq_table)
)
kable(freq_df, caption = "Frecuencia de clusters recomendados (Regla de la mayoría)")
Frecuencia de clusters recomendados (Regla de la
mayoría)
| 0 |
2 |
| 1 |
1 |
| 2 |
5 |
| 3 |
10 |
| 4 |
1 |
| 7 |
6 |
| 8 |
1 |
5. Caracterizacion de los Conglomerados
Estadisticas Descriptivas por Cluster
library(dplyr)
cluster_means <- df_ccaa %>%
group_by(cluster_kmeans) %>%
summarise(
n = n(),
across(automovil:telefono, mean, .names = "mean_{.col}")
)
kable(round(cluster_means, 2), caption = "Medias de variables por cluster")
Medias de variables por cluster
| 1 |
12 |
66.87 |
96.82 |
57.68 |
25.42 |
11.81 |
80.71 |
| 2 |
6 |
70.70 |
98.53 |
63.47 |
44.70 |
22.43 |
90.23 |
t_test_results <- data.frame(
Variable = character(),
Estadístico_t = numeric(),
Valor_p = numeric(),
Significancia = character()
)
variables_test <- names(df_ccaa)[2:7]
for(var in variables_test) {
test_result <- t.test(df_ccaa[[var]] ~ df_ccaa$cluster_kmeans)
significancia <- ifelse(test_result$p.value < 0.01, "***",
ifelse(test_result$p.value < 0.05, "**",
ifelse(test_result$p.value < 0.1, "*", "")))
t_test_results <- rbind(t_test_results, data.frame(
Variable = var,
Estadístico_t = round(test_result$statistic, 3),
Valor_p = round(test_result$p.value, 4),
Significancia = significancia
))
}
kable(t_test_results, caption = "Pruebas t para diferencias entre clusters")
Pruebas t para diferencias entre clusters
| t |
automovil |
-1.811 |
0.1000 |
|
| t1 |
tvcolor |
-2.515 |
0.0238 |
** |
| t2 |
video |
-1.189 |
0.2549 |
|
| t3 |
microondas |
-6.734 |
0.0000 |
*** |
| t4 |
lavavajillas |
-4.605 |
0.0027 |
*** |
| t5 |
telefono |
-3.508 |
0.0035 |
*** |
Observacion de datos
library(ggplot2)
pca_result <- prcomp(variables, scale. = TRUE)
pca_df <- as.data.frame(pca_result$x[, 1:2])
pca_df$CCAA <- df_ccaa$CCAA
pca_df$Cluster <- as.factor(df_ccaa$cluster_kmeans)
ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster, label = CCAA)) +
geom_point(size = 3) +
geom_text(size = 3, vjust = -0.5, hjust = 0.5) +
scale_color_manual(values = c("1" = "blue", "2" = "red")) +
labs(title = "Visualización de Conglomerados en Espacio PCA",
x = paste("Componente Principal 1 (",
round(summary(pca_result)$importance[2,1]*100, 1), "%)", sep = ""),
y = paste("Componente Principal 2 (",
round(summary(pca_result)$importance[2,2]*100, 1), "%)", sep = "")) +
theme_minimal() +
theme(legend.position = "bottom")
## 6. Interpretacion Final
interpretacion_clusters <- data.frame()
for(i in 1:2) {
cluster_data <- df_ccaa[df_ccaa$cluster_kmeans == i, ]
comunidades <- paste(cluster_data$CCAA, collapse = ", ")
medias <- colMeans(cluster_data[, 2:7])
for(j in 1:length(medias)) {
interpretacion_clusters <- rbind(interpretacion_clusters,
data.frame(
Cluster = i,
Variable = names(medias)[j],
Media = round(medias[j], 1),
Comunidades = ifelse(j == 1, comunidades, "")
))
}
}
kable(interpretacion_clusters, caption = "Caracterización detallada de los clusters")
Caracterización detallada de los clusters
| automovil |
1 |
automovil |
66.9 |
España, Andalucía, Asturias, Baleares, Canarias,
Cantabria, Castilla y León, Castilla-La Mancha, Com. Valenciana,
Extremadura, Galicia, Murcia |
| tvcolor |
1 |
tvcolor |
96.8 |
|
| video |
1 |
video |
57.7 |
|
| microondas |
1 |
microondas |
25.4 |
|
| lavavajillas |
1 |
lavavajillas |
11.8 |
|
| telefono |
1 |
telefono |
80.7 |
|
| automovil1 |
2 |
automovil |
70.7 |
Aragón, Cataluña, Madrid, Navarra, País Vasco, La
Rioja |
| tvcolor1 |
2 |
tvcolor |
98.5 |
|
| video1 |
2 |
video |
63.5 |
|
| microondas1 |
2 |
microondas |
44.7 |
|
| lavavajillas1 |
2 |
lavavajillas |
22.4 |
|
| telefono1 |
2 |
telefono |
90.2 |
|
variables_significativas <- t_test_results %>%
filter(Valor_p < 0.05) %>%
select(Variable, Valor_p)
if(nrow(variables_significativas) > 0) {
kable(variables_significativas, caption = "Variables con diferencias estadísticamente significativas (p < 0.05)")
} else {
no_significativas <- data.frame(
Mensaje = "No hay variables con diferencias significativas al 5%"
)
kable(no_significativas, caption = "Resultado de pruebas de significancia")
}
Variables con diferencias estadísticamente significativas (p
< 0.05)
| t1 |
tvcolor |
0.0238 |
| t3 |
microondas |
0.0000 |
| t4 |
lavavajillas |
0.0027 |
| t5 |
telefono |
0.0035 |
resumen_ejecutivo <- data.frame(
Cluster = c(1, 2),
N_Comunidades = c(sum(df_ccaa$cluster_kmeans == 1),
sum(df_ccaa$cluster_kmeans == 2)),
Característica_Principal = c("Menor nivel de equipamiento",
"Mayor nivel de equipamiento"),
Variables_Significativas = c(
paste(variables_significativas$Variable, collapse = ", "),
paste(variables_significativas$Variable, collapse = ", ")
)
)
kable(resumen_ejecutivo, caption = "Resumen ejecutivo de los clusters identificados")
Resumen ejecutivo de los clusters identificados
| 1 |
12 |
Menor nivel de equipamiento |
tvcolor, microondas, lavavajillas, telefono |
| 2 |
6 |
Mayor nivel de equipamiento |
tvcolor, microondas, lavavajillas, telefono |