El objetivo de este análisis es segmentar las comunidades autónomas españolas en grupos homogéneos según su nivel de equipamiento de hogares, para diseñar planes de incentivos diferenciados para vendedores.
library(knitr)
datos_ccaa <- data.frame(
CCAA = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias",
"Cantabria", "Castilla y León", "Castilla-La Mancha", "Cataluña",
"Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia",
"Navarra", "País Vasco", "La Rioja"),
automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4,
72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
tvcolor = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1,
98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1,
68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8,
26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.8, 11.2, 14.0, 7.1, 19.8,
12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2,
84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
)
kable(head(datos_ccaa), caption = "Primeras 6 filas de los datos de equipamiento por CCAA")| CCAA | automovil | tvcolor | video | microondas | lavavajillas | telefono |
|---|---|---|---|---|---|---|
| España | 69.0 | 97.6 | 62.4 | 32.3 | 17.0 | 85.2 |
| Andalucía | 66.7 | 98.0 | 82.7 | 24.1 | 12.7 | 74.7 |
| Aragón | 67.2 | 97.5 | 56.8 | 43.4 | 20.6 | 88.4 |
| Asturias | 63.7 | 95.2 | 52.1 | 24.4 | 13.3 | 88.1 |
| Baleares | 71.9 | 98.8 | 62.4 | 29.8 | 10.1 | 87.9 |
| Canarias | 72.7 | 96.8 | 68.4 | 27.9 | 5.8 | 75.4 |
variables <- datos_ccaa[, -1]
mahal_dist <- mahalanobis(variables,
colMeans(variables),
cov(variables))
p_values <- pchisq(mahal_dist, df = ncol(variables), lower.tail = FALSE)
outliers_table <- data.frame(
CCAA = datos_ccaa$CCAA,
D2 = round(mahal_dist, 2),
p_value = round(p_values, 2)
)
kable(outliers_table, caption = "Detección de Outliers mediante Distancia de Mahalanobis")| CCAA | D2 | p_value |
|---|---|---|
| España | 0.20 | 1.00 |
| Andalucía | 10.52 | 0.10 |
| Aragón | 1.91 | 0.93 |
| Asturias | 4.46 | 0.61 |
| Baleares | 5.70 | 0.46 |
| Canarias | 9.58 | 0.14 |
| Cantabria | 7.29 | 0.29 |
| Castilla y León | 2.21 | 0.90 |
| Castilla-La Mancha | 3.54 | 0.74 |
| Cataluña | 2.95 | 0.82 |
| Com. Valenciana | 2.65 | 0.85 |
| Extremadura | 10.43 | 0.11 |
| Galicia | 13.24 | 0.04 |
| Madrid | 8.31 | 0.22 |
| Murcia | 4.88 | 0.56 |
| Navarra | 7.65 | 0.26 |
| País Vasco | 2.32 | 0.89 |
| La Rioja | 4.17 | 0.65 |
outliers <- which(p_values < 0.05)
if(length(outliers) > 0) {
outliers_df <- data.frame(
"Comunidades Outliers" = datos_ccaa[outliers, "CCAA"],
"Valor p" = round(p_values[outliers], 4)
)
kable(outliers_df, caption = "Comunidades identificadas como outliers (p < 0.05)")
} else {
no_outliers_df <- data.frame(
Resultado = "No se detectaron outliers significativos (p < 0.05)"
)
kable(no_outliers_df, caption = "Resultado de detección de outliers")
}| Comunidades.Outliers | Valor.p |
|---|---|
| Galicia | 0.0394 |
# Calcular matriz de distancias euclídeas
dist_matrix <- dist(variables, method = "euclidean")
dist_subset <- as.matrix(dist_matrix)[1:5, 1:5]
kable(round(dist_subset, 2), caption = "Matriz de distancias euclídeas (primeras 5 filas y columnas)")| 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|
| 0.00 | 24.77 | 13.45 | 14.98 | 8.43 |
| 24.77 | 0.00 | 35.97 | 33.66 | 25.56 |
| 13.45 | 35.97 | 0.00 | 21.31 | 18.72 |
| 14.98 | 33.66 | 21.31 | 0.00 | 15.02 |
| 8.43 | 25.56 | 18.72 | 15.02 | 0.00 |
par(mfrow = c(2, 3))
hc_ward <- hclust(dist_matrix, method = "ward.D2")
plot(hc_ward, main = "Método Ward",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_ward, k = 2, border = "red")
hc_single <- hclust(dist_matrix, method = "single")
plot(hc_single, main = "Vecino más cercano",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_single, k = 2, border = "red")
hc_complete <- hclust(dist_matrix, method = "complete")
plot(hc_complete, main = "Vecino más lejano",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_complete, k = 2, border = "red")
hc_average <- hclust(dist_matrix, method = "average")
plot(hc_average, main = "Vinculación promedio",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_average, k = 2, border = "red")
hc_centroid <- hclust(dist_matrix, method = "centroid")
plot(hc_centroid, main = "Centroide",
xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_centroid, k = 2, border = "red")
par(mfrow = c(1, 1))library(NbClust)
set.seed(123)
nbclust_result <- NbClust(variables,
distance = "euclidean",
min.nc = 2,
max.nc = 8,
method = "ward.D2",
index = "all")## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 5 proposed 2 as the best number of clusters
## * 10 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 6 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
best_nc_table <- data.frame(
Criterio = names(nbclust_result$Best.nc[1,]),
Número_Clusters = as.numeric(nbclust_result$Best.nc[1,])
)
kable(best_nc_table, caption = "Resultados de NbClust - Número óptimo de clusters por criterio")| Criterio | Número_Clusters |
|---|---|
| KL | 3 |
| CH | 3 |
| Hartigan | 3 |
| CCC | 2 |
| Scott | 7 |
| Marriot | 3 |
| TrCovW | 3 |
| TraceW | 3 |
| Friedman | 7 |
| Rubin | 3 |
| Cindex | 2 |
| DB | 7 |
| Silhouette | 7 |
| Duda | 2 |
| PseudoT2 | 2 |
| Beale | 3 |
| Ratkowsky | 3 |
| Ball | 3 |
| PtBiserial | 4 |
| Frey | 1 |
| McClain | 2 |
| Dunn | 7 |
| Hubert | 0 |
| SDindex | 7 |
| Dindex | 0 |
| SDbw | 8 |
freq_table <- table(nbclust_result$Best.nc[1,])
freq_df <- data.frame(
Número_Clusters = names(freq_table),
Frecuencia = as.numeric(freq_table)
)
kable(freq_df, caption = "Frecuencia de clusters recomendados (Regla de la mayoría)")| Número_Clusters | Frecuencia |
|---|---|
| 0 | 2 |
| 1 | 1 |
| 2 | 5 |
| 3 | 10 |
| 4 | 1 |
| 7 | 6 |
| 8 | 1 |
set.seed(123)
kmeans_result <- kmeans(variables, centers = 2, nstart = 25)
kmeans_summary <- data.frame(
Métrica = c("Número de clusters", "Total within-cluster sum of squares",
"Between-cluster sum of squares", "Total sum of squares"),
Valor = c(2,
round(kmeans_result$tot.withinss, 2),
round(kmeans_result$betweenss, 2),
round(kmeans_result$totss, 2))
)
kable(kmeans_summary, caption = "Resumen de resultados de K-means")| Métrica | Valor |
|---|---|
| Número de clusters | 2.00 |
| Total within-cluster sum of squares | 3659.00 |
| Between-cluster sum of squares | 2505.31 |
| Total sum of squares | 6164.31 |
datos_ccaa$cluster_kmeans <- kmeans_result$cluster
cluster1_comunidades <- datos_ccaa$CCAA[datos_ccaa$cluster_kmeans == 1]
cluster2_comunidades <- datos_ccaa$CCAA[datos_ccaa$cluster_kmeans == 2]
max_length <- max(length(cluster1_comunidades), length(cluster2_comunidades))
cluster1_comunidades <- c(cluster1_comunidades, rep("", max_length - length(cluster1_comunidades)))
cluster2_comunidades <- c(cluster2_comunidades, rep("", max_length - length(cluster2_comunidades)))
cluster_assignments <- data.frame(
Cluster_1 = cluster1_comunidades,
Cluster_2 = cluster2_comunidades
)
kable(cluster_assignments, caption = "Asignación de comunidades por cluster (K-means)")| Cluster_1 | Cluster_2 |
|---|---|
| España | Aragón |
| Andalucía | Cataluña |
| Asturias | Madrid |
| Baleares | Navarra |
| Canarias | País Vasco |
| Cantabria | La Rioja |
| Castilla y León | |
| Castilla-La Mancha | |
| Com. Valenciana | |
| Extremadura | |
| Galicia | |
| Murcia |
cluster_hierarchical <- cutree(hc_ward, k = 2)
datos_ccaa$cluster_hierarchical <- cluster_hierarchical
comparison_table <- data.frame(
CCAA = datos_ccaa$CCAA,
Kmeans = datos_ccaa$cluster_kmeans,
Jerarquico = datos_ccaa$cluster_hierarchical,
Coincide = datos_ccaa$cluster_kmeans == datos_ccaa$cluster_hierarchical
)
kable(comparison_table, caption = "Comparación entre métodos K-means y Jerárquico")| CCAA | Kmeans | Jerarquico | Coincide |
|---|---|---|---|
| España | 1 | 1 | TRUE |
| Andalucía | 1 | 1 | TRUE |
| Aragón | 2 | 2 | TRUE |
| Asturias | 1 | 1 | TRUE |
| Baleares | 1 | 1 | TRUE |
| Canarias | 1 | 1 | TRUE |
| Cantabria | 1 | 1 | TRUE |
| Castilla y León | 1 | 1 | TRUE |
| Castilla-La Mancha | 1 | 1 | TRUE |
| Cataluña | 2 | 2 | TRUE |
| Com. Valenciana | 1 | 1 | TRUE |
| Extremadura | 1 | 1 | TRUE |
| Galicia | 1 | 1 | TRUE |
| Madrid | 2 | 2 | TRUE |
| Murcia | 1 | 1 | TRUE |
| Navarra | 2 | 2 | TRUE |
| País Vasco | 2 | 2 | TRUE |
| La Rioja | 2 | 2 | TRUE |
coincidence_rate <- mean(comparison_table$Coincide) * 100
coincidence_df <- data.frame(
Métrica = "Porcentaje de coincidencia entre métodos",
Valor = paste(round(coincidence_rate, 2), "%")
)
kable(coincidence_df, caption = "Coincidencia entre métodos de clustering")| Métrica | Valor |
|---|---|
| Porcentaje de coincidencia entre métodos | 100 % |
library(dplyr)
cluster_means <- datos_ccaa %>%
group_by(cluster_kmeans) %>%
summarise(
n = n(),
across(automovil:telefono, mean, .names = "mean_{.col}")
)
kable(round(cluster_means, 2), caption = "Medias de variables por cluster")| cluster_kmeans | n | mean_automovil | mean_tvcolor | mean_video | mean_microondas | mean_lavavajillas | mean_telefono |
|---|---|---|---|---|---|---|---|
| 1 | 12 | 66.87 | 96.82 | 57.68 | 25.42 | 11.81 | 80.71 |
| 2 | 6 | 70.70 | 98.53 | 63.47 | 44.70 | 22.43 | 90.23 |
t_test_results <- data.frame(
Variable = character(),
Estadístico_t = numeric(),
Valor_p = numeric(),
Significancia = character()
)
variables_test <- names(datos_ccaa)[2:7]
for(var in variables_test) {
test_result <- t.test(datos_ccaa[[var]] ~ datos_ccaa$cluster_kmeans)
significancia <- ifelse(test_result$p.value < 0.01, "***",
ifelse(test_result$p.value < 0.05, "**",
ifelse(test_result$p.value < 0.1, "*", "")))
t_test_results <- rbind(t_test_results, data.frame(
Variable = var,
Estadístico_t = round(test_result$statistic, 3),
Valor_p = round(test_result$p.value, 4),
Significancia = significancia
))
}
kable(t_test_results, caption = "Pruebas t para diferencias entre clusters")| Variable | Estadístico_t | Valor_p | Significancia | |
|---|---|---|---|---|
| t | automovil | -1.811 | 0.1000 | |
| t1 | tvcolor | -2.515 | 0.0238 | ** |
| t2 | video | -1.189 | 0.2549 | |
| t3 | microondas | -6.734 | 0.0000 | *** |
| t4 | lavavajillas | -4.605 | 0.0027 | *** |
| t5 | telefono | -3.508 | 0.0035 | *** |
library(ggplot2)
pca_result <- prcomp(variables, scale. = TRUE)
pca_df <- as.data.frame(pca_result$x[, 1:2])
pca_df$CCAA <- datos_ccaa$CCAA
pca_df$Cluster <- as.factor(datos_ccaa$cluster_kmeans)
ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster, label = CCAA)) +
geom_point(size = 3) +
geom_text(size = 3, vjust = -0.5, hjust = 0.5) +
scale_color_manual(values = c("1" = "blue", "2" = "red")) +
labs(title = "Visualización de Conglomerados en Espacio PCA",
x = paste("Componente Principal 1 (",
round(summary(pca_result)$importance[2,1]*100, 1), "%)", sep = ""),
y = paste("Componente Principal 2 (",
round(summary(pca_result)$importance[2,2]*100, 1), "%)", sep = "")) +
theme_minimal() +
theme(legend.position = "bottom")interpretacion_clusters <- data.frame()
for(i in 1:2) {
cluster_data <- datos_ccaa[datos_ccaa$cluster_kmeans == i, ]
comunidades <- paste(cluster_data$CCAA, collapse = ", ")
medias <- colMeans(cluster_data[, 2:7])
for(j in 1:length(medias)) {
interpretacion_clusters <- rbind(interpretacion_clusters,
data.frame(
Cluster = i,
Variable = names(medias)[j],
Media = round(medias[j], 1),
Comunidades = ifelse(j == 1, comunidades, "")
))
}
}
kable(interpretacion_clusters, caption = "Caracterización detallada de los clusters")| Cluster | Variable | Media | Comunidades | |
|---|---|---|---|---|
| automovil | 1 | automovil | 66.9 | España, Andalucía, Asturias, Baleares, Canarias, Cantabria, Castilla y León, Castilla-La Mancha, Com. Valenciana, Extremadura, Galicia, Murcia |
| tvcolor | 1 | tvcolor | 96.8 | |
| video | 1 | video | 57.7 | |
| microondas | 1 | microondas | 25.4 | |
| lavavajillas | 1 | lavavajillas | 11.8 | |
| telefono | 1 | telefono | 80.7 | |
| automovil1 | 2 | automovil | 70.7 | Aragón, Cataluña, Madrid, Navarra, País Vasco, La Rioja |
| tvcolor1 | 2 | tvcolor | 98.5 | |
| video1 | 2 | video | 63.5 | |
| microondas1 | 2 | microondas | 44.7 | |
| lavavajillas1 | 2 | lavavajillas | 22.4 | |
| telefono1 | 2 | telefono | 90.2 |
variables_significativas <- t_test_results %>%
filter(Valor_p < 0.05) %>%
select(Variable, Valor_p)
if(nrow(variables_significativas) > 0) {
kable(variables_significativas, caption = "Variables con diferencias estadísticamente significativas (p < 0.05)")
} else {
no_significativas <- data.frame(
Mensaje = "No hay variables con diferencias significativas al 5%"
)
kable(no_significativas, caption = "Resultado de pruebas de significancia")
}| Variable | Valor_p | |
|---|---|---|
| t1 | tvcolor | 0.0238 |
| t3 | microondas | 0.0000 |
| t4 | lavavajillas | 0.0027 |
| t5 | telefono | 0.0035 |
resumen_ejecutivo <- data.frame(
Cluster = c(1, 2),
N_Comunidades = c(sum(datos_ccaa$cluster_kmeans == 1),
sum(datos_ccaa$cluster_kmeans == 2)),
Característica_Principal = c("Menor nivel de equipamiento",
"Mayor nivel de equipamiento"),
Variables_Significativas = c(
paste(variables_significativas$Variable, collapse = ", "),
paste(variables_significativas$Variable, collapse = ", ")
)
)
kable(resumen_ejecutivo, caption = "Resumen ejecutivo de los clusters identificados")| Cluster | N_Comunidades | Característica_Principal | Variables_Significativas |
|---|---|---|---|
| 1 | 12 | Menor nivel de equipamiento | tvcolor, microondas, lavavajillas, telefono |
| 2 | 6 | Mayor nivel de equipamiento | tvcolor, microondas, lavavajillas, telefono |