UNIVERSIDAD DE EL SALVADOR

FACULTAD DE CIENCIAS ECONÓMICAS

ESCUELA DE ECONOMÍA

CICLO II-2024

Asignatura:

Métodos para el Análisis Económico

Docente:

Carlos Ademir Pérez Alas.

Tema:

Aplicación: Análisis de Clúster (conglomerados)

Grupo:

2

Integrantes:

Márquez Arévalo, Iris Leonor. MA15003 .

Menjivar Guillen, Carlos Fernando. MG20035.

Santos Guzmán, Orlando Rigoberto SG18058.

Ciudad Universitaria, 10 de diciembre 2024.

3.7 Un ejemplo de aplicación de un análisis de conglomerados

El objetivo de este epígrafe es ofrecer una visión integrada de los pasos que re quiere la aplicación de un análisis de conglomerados, desde el establecimiento de los objetivos hasta la validación de los resultados.

Caso 3.3. Diseño de un plan de insentivos para los vendedores.

# Crear un data.frame con los datos del cuadro 3.22
equipamiento <- data.frame(
   CCAA = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", "Cantabria", 
             "Castilla y León", "Castilla-La Mancha", "Cataluña", "Com. Valenciana", "Extremadura", 
             "Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja"),
   
    automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 72.7, 60.5, 65.5, 
                  74.0, 69.0, 76.4, 71.3, 64.9),
   
   tv_color = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 98.4, 97.7, 
                 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
   
   video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 68.2, 43.7, 
              42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
   microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 26.6, 20.7, 
                   13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
   lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.80, 11.2, 14.0, 7.10, 19.8, 12.1, 11.7, 
                     14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
   telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 84.4, 67.1, 85.9, 
                 95.7, 81.4, 87.4, 94.3, 83.4)
)
# Visualizar la tabla
equipamiento

Convertimos el dataframe en martriz para uso práctico

#Matriz de datos
matriz_datos_3_3 <- as.matrix(equipamiento[, c("automovil", "tv_color", "video", "microondas", "lavavajillas", "telefono")])

Detección de los outliers mediante la distacia de Mahalanobis

mean <- colMeans(matriz_datos_3_3)
Sx <- cov(matriz_datos_3_3)
D2 <- mahalanobis(matriz_datos_3_3,mean,Sx,inverted = FALSE)
sig <- pchisq(D2,df = 6,lower.tail = FALSE)

resultados_outliers <- data.frame(CC.AA. = c("Espana", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", "Cantabria", "Castilla y Leon", "Castilla-La Mancha", "Cataluna", "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja"),
                                  D2 = D2, Significatividades = sig)

resultados_outliers 

Análisis de conglomerados gerárquicos

matriz.dis.euclid.caso3 <- dist(matriz_datos_3_3, method = "euclidean")

# Promedio (average)
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3,method = "average")

plot(hclust.average.caso3)

# Ward (ward.D2)
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3,method = "ward.D2")

plot(hclust.ward.caso3)

# Vecino mas lejano (complete)
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3,method = "complete")

plot(hclust.complete.caso3)

# Vecino mas cercano (single)
hclust.single.caso3 <- hclust(matriz.dis.euclid.caso3,method = "single")

plot(hclust.single.caso3)

# Centroide (centroid)
hclust.centroid.caso3 <- hclust(matriz.dis.euclid.caso3,method = "centroid")

plot(hclust.centroid.caso3)

Determinación del número de conglomerados

library(NbClust)

# Ward (ward.D2)
res.wardD2 <- NbClust(matriz_datos_3_3,distance = "euclidean",min.nc = 2,max.nc = 15, method = "ward.D2", index = "alllong")
## Warning in pf(beale, pp, df2): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in pf(beale, pp, df2): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in pf(beale, pp, df2): Se han producido NaNs

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
# Vecino mas lejano 
res.complete <- NbClust(matriz_datos_3_3,distance = "euclidean",min.nc = 2,max.nc = 15, method = "complete", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Centroides resultantes del método jerárquico

grupo.ward <- cutree(hclust.ward.caso3,k = 2,h = NULL)
datos.caso3.grupos <- cbind(matriz_datos_3_3,grupo.ward)

round(aggregate(datos.caso3.grupos,list(grupo.ward),mean),2)

Análisis de conglomerados no jerárquicos

datos.caso3.grupos.kmeans <- datos.caso3.grupos[, -ncol(datos.caso3.grupos)]

c1 <- c(70.52,  98.29,  65.35,  35.70,  17.03,  85.87)
c2 <- c(63.40,  95.58,  48.12,  24.15,  11.98,  79.92)
solucion_kmeans <- kmeans(datos.caso3.grupos.kmeans, rbind(c1,c2)) 
  print(solucion_kmeans)
## K-means clustering with 2 clusters of sizes 11, 7
## 
## Cluster means:
##   automovil tv_color    video microondas lavavajillas telefono
## 1  70.65455 98.25455 65.90000   37.17273     17.48182 86.27273
## 2  64.20000 96.02857 49.71429   23.48571     12.00000 80.12857
## 
## Clustering vector:
##  [1] 1 1 1 2 1 1 2 2 2 1 1 2 2 1 2 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 2764.2745  988.5257
##  (between_SS / total_SS =  39.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Prueba T

# automovil
t_test_automovil <- t.test(automovil~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# tv_color
t_test_tvcolor <- t.test(tv_color~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# video
t_test_video <- t.test(video~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# microondas
t_test_microond <- t.test(microondas~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# lavavajillas
t_test_lavavaji <- t.test(lavavajillas~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# telefono
t_test_telefono <- t.test(telefono~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

 # Resultados
resultados_t_test <- data.frame(
  Variable = c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono"),
  Grupo_1 = c(
    t_test_automovil[["estimate"]][["mean in group 1"]],
    t_test_tvcolor[["estimate"]][["mean in group 1"]],
    t_test_video[["estimate"]][["mean in group 1"]],
    t_test_microond[["estimate"]][["mean in group 1"]],
    t_test_lavavaji[["estimate"]][["mean in group 1"]],
    t_test_telefono[["estimate"]][["mean in group 1"]]
  ),
  Grupo_2 = c(
    t_test_automovil[["estimate"]][["mean in group 2"]],
    t_test_tvcolor[["estimate"]][["mean in group 2"]],
    t_test_video[["estimate"]][["mean in group 2"]],
    t_test_microond[["estimate"]][["mean in group 2"]],
    t_test_lavavaji[["estimate"]][["mean in group 2"]],
    t_test_telefono[["estimate"]][["mean in group 2"]]
  ),
  Prueba_t = c(
    t_test_automovil[["statistic"]][["t"]],
    t_test_tvcolor[["statistic"]][["t"]],
    t_test_video[["statistic"]][["t"]],
    t_test_microond[["statistic"]][["t"]],
    t_test_lavavaji[["statistic"]][["t"]],
    t_test_telefono[["statistic"]][["t"]]
  ),
    P_value = c(
    t_test_automovil[["p.value"]],
    t_test_tvcolor[["p.value"]],
    t_test_video[["p.value"]],
    t_test_microond[["p.value"]],
    t_test_lavavaji[["p.value"]],
    t_test_telefono[["p.value"]]
    )
)

resultados_t_test 

Visualización de los resultados del análisis de conglomerados

library(ggplot2)

mds_result <- cmdscale(matriz.dis.euclid.caso3, k = 2)
coordenadas_mds <- data.frame(Dim1 = mds_result[,1], Dim2 = mds_result[,2])

ciudades <- c("Espana", "Andalucia", "Aragon", "Asturias", "Baleares", "Canarias", "Cantabria", 
              "CyL", "CLM", "Cataluna", "Valencia", 
              "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "P.Vasco", "Rioja")

coordenadas_mds$Ciudad <- ciudades
coordenadas_mds$Cluster <- factor(solucion_kmeans[["cluster"]])

ggplot(coordenadas_mds, aes(x = Dim1, y = Dim2, color = Cluster)) +
  geom_point(size = 3) +
  geom_text(aes(label = Ciudad), vjust = -0.5, hjust = 0.5, size = 3.5) +  
  labs(x = "Dimension 1", y = "Dimension 2", 
       title = "Visualizacion MDS") +
  theme_minimal() +
  theme(legend.position = "bottom")