A32-APLICACION DE LOS CONGLOMERADOS

Indicaciones Generales: Presente en un reporte de Rmardown todas las operaciones que se indican en la tarea, debe incluir el archivo rmd, la correspondiente versión html y el informe publicado en una cuenta de Rpubs (incluir el enlace en los comentarios de entrega de las tareas, y en un archivo txt dentro de la entrega), incluyendo los nombres y apellidos de cada uno de los integrantes del grupo.

Del texto: Joaquin, A. M., & Ezequiel, U. J. (2017). Análisis multivariante aplicado con R. 2a ed. Ediciones Paraninfo, S.A.

Caso 3.7: Diseño de un Plan de Incentivos para Vendedores

Carga de datos

library(knitr)

df_ccaa <- data.frame(
  CCAA = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", 
           "Cantabria", "Castilla y León", "Castilla-La Mancha", "Cataluña", 
           "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", 
           "Navarra", "País Vasco", "La Rioja"),
  automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 
                72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
  tvcolor = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 
              98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
  video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 
            68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
  microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 
                 26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
  lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.8, 11.2, 14.0, 7.1, 19.8, 
                   12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
  telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 
               84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
)

kable(head(df_ccaa), caption = "Primeras 6 filas de los datos de equipamiento por CCAA")

Primeras 6 filas de los datos de equipamiento por CCAA
CCAA	automovil	tvcolor	video	microondas	lavavajillas	telefono
España	69.0	97.6	62.4	32.3	17.0	85.2
Andalucía	66.7	98.0	82.7	24.1	12.7	74.7
Aragón	67.2	97.5	56.8	43.4	20.6	88.4
Asturias	63.7	95.2	52.1	24.4	13.3	88.1
Baleares	71.9	98.8	62.4	29.8	10.1	87.9
Canarias	72.7	96.8	68.4	27.9	5.8	75.4

1. Detección de Outliers

variables <- df_ccaa[, -1]  
mahal_dist <- mahalanobis(variables, 
                         colMeans(variables), 
                         cov(variables))

p_values <- pchisq(mahal_dist, df = ncol(variables), lower.tail = FALSE)

outliers_table <- data.frame(
  CCAA = df_ccaa$CCAA,
  D2 = round(mahal_dist, 2),
  p_value = round(p_values, 2)
)

kable(outliers_table, caption = "Detección de Outliers mediante Distancia de Mahalanobis")

Detección de Outliers mediante Distancia de Mahalanobis
CCAA	D2	p_value
España	0.20	1.00
Andalucía	10.52	0.10
Aragón	1.91	0.93
Asturias	4.46	0.61
Baleares	5.70	0.46
Canarias	9.58	0.14
Cantabria	7.29	0.29
Castilla y León	2.21	0.90
Castilla-La Mancha	3.54	0.74
Cataluña	2.95	0.82
Com. Valenciana	2.65	0.85
Extremadura	10.43	0.11
Galicia	13.24	0.04
Madrid	8.31	0.22
Murcia	4.88	0.56
Navarra	7.65	0.26
País Vasco	2.32	0.89
La Rioja	4.17	0.65

outliers <- which(p_values < 0.05)
if(length(outliers) > 0) {
  outliers_df <- data.frame(
    "Comunidades Outliers" = df_ccaa[outliers, "CCAA"],
    "Valor p" = round(p_values[outliers], 4)
  )
  kable(outliers_df, caption = "Comunidades identificadas como outliers (p < 0.05)")
} else {
  no_outliers_df <- data.frame(
    Resultado = "No se detectaron outliers significativos (p < 0.05)"
  )
  kable(no_outliers_df, caption = "Resultado de detección de outliers")
}

Comunidades identificadas como outliers (p < 0.05)
Comunidades.Outliers	Valor.p
Galicia	0.0394

2. Análisis de Conglomerados Jerárquico

Matriz de Distancias

# Calcular matriz de distancias euclídeas
dist_matrix <- dist(variables, method = "euclidean")
dist_subset <- as.matrix(dist_matrix)[1:5, 1:5]
kable(round(dist_subset, 2), caption = "Matriz de distancias euclídeas (primeras 5 filas y columnas)")

Matriz de distancias euclídeas (primeras 5 filas y columnas)
1	2	3	4	5
0.00	24.77	13.45	14.98	8.43
24.77	0.00	35.97	33.66	25.56
13.45	35.97	0.00	21.31	18.72
14.98	33.66	21.31	0.00	15.02
8.43	25.56	18.72	15.02	0.00

Dondogramas con diferentes Metodos

par(mfrow = c(2, 3))

hc_ward <- hclust(dist_matrix, method = "ward.D2")
plot(hc_ward, main = "Método Ward", 
     xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_ward, k = 2, border = "red")

hc_single <- hclust(dist_matrix, method = "single")
plot(hc_single, main = "Vecino más cercano", 
     xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_single, k = 2, border = "red")

hc_complete <- hclust(dist_matrix, method = "complete")
plot(hc_complete, main = "Vecino más lejano", 
     xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_complete, k = 2, border = "red")

hc_average <- hclust(dist_matrix, method = "average")
plot(hc_average, main = "Vinculación promedio", 
     xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_average, k = 2, border = "red")

hc_centroid <- hclust(dist_matrix, method = "centroid")
plot(hc_centroid, main = "Centroide", 
     xlab = "Comunidades Autónomas", ylab = "Distancia")
rect.hclust(hc_centroid, k = 2, border = "red")

par(mfrow = c(1, 1))

Determinacion del Numero Optimo de Conglmerados

library(NbClust)
set.seed(123)
nbclust_result <- NbClust(variables, 
                         distance = "euclidean",
                         min.nc = 2, 
                         max.nc = 8, 
                         method = "ward.D2",
                         index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 5 proposed 2 as the best number of clusters 
## * 10 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 6 proposed 7 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

best_nc_table <- data.frame(
  Criterio = names(nbclust_result$Best.nc[1,]),
  Número_Clusters = as.numeric(nbclust_result$Best.nc[1,])
)

kable(best_nc_table, caption = "Resultados de NbClust - Número óptimo de clusters por criterio")

Resultados de NbClust - Número óptimo de clusters por criterio
Criterio	Número_Clusters
KL	3
CH	3
Hartigan	3
CCC	2
Scott	7
Marriot	3
TrCovW	3
TraceW	3
Friedman	7
Rubin	3
Cindex	2
DB	7
Silhouette	7
Duda	2
PseudoT2	2
Beale	3
Ratkowsky	3
Ball	3
PtBiserial	4
Frey	1
McClain	2
Dunn	7
Hubert	0
SDindex	7
Dindex	0
SDbw	8

freq_table <- table(nbclust_result$Best.nc[1,])
freq_df <- data.frame(
  Número_Clusters = names(freq_table),
  Frecuencia = as.numeric(freq_table)
)

kable(freq_df, caption = "Frecuencia de clusters recomendados (Regla de la mayoría)")

Frecuencia de clusters recomendados (Regla de la mayoría)
Número_Clusters	Frecuencia
0	2
1	1
2	5
3	10
4	1
7	6
8	1

3. Analisis de Conglomerados No Jerarquico (K-means)

Aplicar K-means con 2 clusters

set.seed(123)
kmeans_result <- kmeans(variables, centers = 2, nstart = 25)

kmeans_summary <- data.frame(
  Métrica = c("Número de clusters", "Total within-cluster sum of squares", 
              "Between-cluster sum of squares", "Total sum of squares"),
  Valor = c(2, 
            round(kmeans_result$tot.withinss, 2),
            round(kmeans_result$betweenss, 2),
            round(kmeans_result$totss, 2))
)

kable(kmeans_summary, caption = "Resumen de resultados de K-means")

Resumen de resultados de K-means
Métrica	Valor
Número de clusters	2.00
Total within-cluster sum of squares	3659.00
Between-cluster sum of squares	2505.31
Total sum of squares	6164.31

df_ccaa$cluster_kmeans <- kmeans_result$cluster

cluster1_comunidades <- df_ccaa$CCAA[df_ccaa$cluster_kmeans == 1]
cluster2_comunidades <- df_ccaa$CCAA[df_ccaa$cluster_kmeans == 2]

max_length <- max(length(cluster1_comunidades), length(cluster2_comunidades))
cluster1_comunidades <- c(cluster1_comunidades, rep("", max_length - length(cluster1_comunidades)))
cluster2_comunidades <- c(cluster2_comunidades, rep("", max_length - length(cluster2_comunidades)))

cluster_assignments <- data.frame(
  Cluster_1 = cluster1_comunidades,
  Cluster_2 = cluster2_comunidades
)

kable(cluster_assignments, caption = "Asignación de comunidades por cluster (K-means)")

Asignación de comunidades por cluster (K-means)
Cluster_1	Cluster_2
España	Aragón
Andalucía	Cataluña
Asturias	Madrid
Baleares	Navarra
Canarias	País Vasco
Cantabria	La Rioja
Castilla y León
Castilla-La Mancha
Com. Valenciana
Extremadura
Galicia
Murcia

4. Comparacion entre Metodos Jerarquico y No Jerarquico

cluster_hierarchical <- cutree(hc_ward, k = 2)
df_ccaa$cluster_hierarchical <- cluster_hierarchical

comparison_table <- data.frame(
  CCAA = df_ccaa$CCAA,
  Kmeans = df_ccaa$cluster_kmeans,
  Jerarquico = df_ccaa$cluster_hierarchical,
  Coincide = df_ccaa$cluster_kmeans == df_ccaa$cluster_hierarchical
)

kable(comparison_table, caption = "Comparación entre métodos K-means y Jerárquico")

Comparación entre métodos K-means y Jerárquico
CCAA	Kmeans	Jerarquico	Coincide
España	1	1	TRUE
Andalucía	1	1	TRUE
Aragón	2	2	TRUE
Asturias	1	1	TRUE
Baleares	1	1	TRUE
Canarias	1	1	TRUE
Cantabria	1	1	TRUE
Castilla y León	1	1	TRUE
Castilla-La Mancha	1	1	TRUE
Cataluña	2	2	TRUE
Com. Valenciana	1	1	TRUE
Extremadura	1	1	TRUE
Galicia	1	1	TRUE
Madrid	2	2	TRUE
Murcia	1	1	TRUE
Navarra	2	2	TRUE
País Vasco	2	2	TRUE
La Rioja	2	2	TRUE

coincidence_rate <- mean(comparison_table$Coincide) * 100
coincidence_df <- data.frame(
  Métrica = "Porcentaje de coincidencia entre métodos",
  Valor = paste(round(coincidence_rate, 2), "%")
)

kable(coincidence_df, caption = "Coincidencia entre métodos de clustering")

Coincidencia entre métodos de clustering
Métrica	Valor
Porcentaje de coincidencia entre métodos	100 %

5. Caracterizacion de los Conglomerados

Estadisticas Descriptivas por Cluster

library(dplyr)

cluster_means <- df_ccaa %>%
  group_by(cluster_kmeans) %>%
  summarise(
    n = n(),
    across(automovil:telefono, mean, .names = "mean_{.col}")
  )

kable(round(cluster_means, 2), caption = "Medias de variables por cluster")

Medias de variables por cluster
cluster_kmeans	n	mean_automovil	mean_tvcolor	mean_video	mean_microondas	mean_lavavajillas	mean_telefono
1	12	66.87	96.82	57.68	25.42	11.81	80.71
2	6	70.70	98.53	63.47	44.70	22.43	90.23

t_test_results <- data.frame(
  Variable = character(),
  Estadístico_t = numeric(),
  Valor_p = numeric(),
  Significancia = character()
)

variables_test <- names(df_ccaa)[2:7]

for(var in variables_test) {
  test_result <- t.test(df_ccaa[[var]] ~ df_ccaa$cluster_kmeans)
  significancia <- ifelse(test_result$p.value < 0.01, "***",
                         ifelse(test_result$p.value < 0.05, "**",
                               ifelse(test_result$p.value < 0.1, "*", "")))
  
  t_test_results <- rbind(t_test_results, data.frame(
    Variable = var,
    Estadístico_t = round(test_result$statistic, 3),
    Valor_p = round(test_result$p.value, 4),
    Significancia = significancia
  ))
}

kable(t_test_results, caption = "Pruebas t para diferencias entre clusters")

Pruebas t para diferencias entre clusters
	Variable	Estadístico_t	Valor_p	Significancia
t	automovil	-1.811	0.1000
t1	tvcolor	-2.515	0.0238	**
t2	video	-1.189	0.2549
t3	microondas	-6.734	0.0000	***
t4	lavavajillas	-4.605	0.0027	***
t5	telefono	-3.508	0.0035	***

Observacion de datos

library(ggplot2)

pca_result <- prcomp(variables, scale. = TRUE)
pca_df <- as.data.frame(pca_result$x[, 1:2])
pca_df$CCAA <- df_ccaa$CCAA
pca_df$Cluster <- as.factor(df_ccaa$cluster_kmeans)

ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster, label = CCAA)) +
  geom_point(size = 3) +
  geom_text(size = 3, vjust = -0.5, hjust = 0.5) +
  scale_color_manual(values = c("1" = "blue", "2" = "red")) +
  labs(title = "Visualización de Conglomerados en Espacio PCA",
       x = paste("Componente Principal 1 (", 
                 round(summary(pca_result)$importance[2,1]*100, 1), "%)", sep = ""),
       y = paste("Componente Principal 2 (", 
                 round(summary(pca_result)$importance[2,2]*100, 1), "%)", sep = "")) +
  theme_minimal() +
  theme(legend.position = "bottom")

## 6. Interpretacion Final

interpretacion_clusters <- data.frame()

for(i in 1:2) {
  cluster_data <- df_ccaa[df_ccaa$cluster_kmeans == i, ]
  comunidades <- paste(cluster_data$CCAA, collapse = ", ")
  
  medias <- colMeans(cluster_data[, 2:7])
  
  for(j in 1:length(medias)) {
    interpretacion_clusters <- rbind(interpretacion_clusters, 
      data.frame(
        Cluster = i,
        Variable = names(medias)[j],
        Media = round(medias[j], 1),
        Comunidades = ifelse(j == 1, comunidades, "")
      ))
  }
}

kable(interpretacion_clusters, caption = "Caracterización detallada de los clusters")

Caracterización detallada de los clusters
	Cluster	Variable	Media	Comunidades
automovil	1	automovil	66.9	España, Andalucía, Asturias, Baleares, Canarias, Cantabria, Castilla y León, Castilla-La Mancha, Com. Valenciana, Extremadura, Galicia, Murcia
tvcolor	1	tvcolor	96.8
video	1	video	57.7
microondas	1	microondas	25.4
lavavajillas	1	lavavajillas	11.8
telefono	1	telefono	80.7
automovil1	2	automovil	70.7	Aragón, Cataluña, Madrid, Navarra, País Vasco, La Rioja
tvcolor1	2	tvcolor	98.5
video1	2	video	63.5
microondas1	2	microondas	44.7
lavavajillas1	2	lavavajillas	22.4
telefono1	2	telefono	90.2

variables_significativas <- t_test_results %>%
  filter(Valor_p < 0.05) %>%
  select(Variable, Valor_p)

if(nrow(variables_significativas) > 0) {
  kable(variables_significativas, caption = "Variables con diferencias estadísticamente significativas (p < 0.05)")
} else {
  no_significativas <- data.frame(
    Mensaje = "No hay variables con diferencias significativas al 5%"
  )
  kable(no_significativas, caption = "Resultado de pruebas de significancia")
}

Variables con diferencias estadísticamente significativas (p < 0.05)
	Variable	Valor_p
t1	tvcolor	0.0238
t3	microondas	0.0000
t4	lavavajillas	0.0027
t5	telefono	0.0035

resumen_ejecutivo <- data.frame(
  Cluster = c(1, 2),
  N_Comunidades = c(sum(df_ccaa$cluster_kmeans == 1), 
                    sum(df_ccaa$cluster_kmeans == 2)),
  Característica_Principal = c("Menor nivel de equipamiento", 
                               "Mayor nivel de equipamiento"),
  Variables_Significativas = c(
    paste(variables_significativas$Variable, collapse = ", "),
    paste(variables_significativas$Variable, collapse = ", ")
  )
)

kable(resumen_ejecutivo, caption = "Resumen ejecutivo de los clusters identificados")

Resumen ejecutivo de los clusters identificados
Cluster	N_Comunidades	Característica_Principal	Variables_Significativas
1	12	Menor nivel de equipamiento	tvcolor, microondas, lavavajillas, telefono
2	6	Mayor nivel de equipamiento	tvcolor, microondas, lavavajillas, telefono