UNIVERSIDAD DE EL SALVADOR
FACULTAD DE CIENCIAS ECONÓMICAS
ESCUELA DE ECONOMÍA
CICLO II-2024
Asignatura:
Métodos para el Análisis Económico
Docente:
Carlos Ademir Pérez Alas.
Tema:
Aplicación: Análisis de Clúster (conglomerados)
Grupo:
2
Integrantes:
Márquez Arévalo, Iris Leonor. MA15003 .
Menjivar Guillen, Carlos Fernando. MG20035.
Santos Guzmán, Orlando Rigoberto SG18058.
Ciudad Universitaria, 10 de diciembre 2024.
El objetivo de este epígrafe es ofrecer una visión integrada de los pasos que re quiere la aplicación de un análisis de conglomerados, desde el establecimiento de los objetivos hasta la validación de los resultados.
# Crear un data.frame con los datos del cuadro 3.22
equipamiento <- data.frame(
CCAA = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", "Cantabria",
"Castilla y León", "Castilla-La Mancha", "Cataluña", "Com. Valenciana", "Extremadura",
"Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja"),
automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 72.7, 60.5, 65.5,
74.0, 69.0, 76.4, 71.3, 64.9),
tv_color = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 98.4, 97.7,
91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 68.2, 43.7,
42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
microondas = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 26.6, 20.7,
13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
lavavajillas = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.80, 11.2, 14.0, 7.10, 19.8, 12.1, 11.7,
14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 84.4, 67.1, 85.9,
95.7, 81.4, 87.4, 94.3, 83.4)
)
# Visualizar la tabla
equipamientoConvertimos el dataframe en martriz para uso práctico
mean <- colMeans(matriz_datos_3_3)
Sx <- cov(matriz_datos_3_3)
D2 <- mahalanobis(matriz_datos_3_3,mean,Sx,inverted = FALSE)
sig <- pchisq(D2,df = 6,lower.tail = FALSE)
resultados_outliers <- data.frame(CC.AA. = c("Espana", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", "Cantabria", "Castilla y Leon", "Castilla-La Mancha", "Cataluna", "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja"),
D2 = D2, Significatividades = sig)
resultados_outliers matriz.dis.euclid.caso3 <- dist(matriz_datos_3_3, method = "euclidean")
# Promedio (average)
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3,method = "average")
plot(hclust.average.caso3)# Ward (ward.D2)
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3,method = "ward.D2")
plot(hclust.ward.caso3)# Vecino mas lejano (complete)
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3,method = "complete")
plot(hclust.complete.caso3)# Vecino mas cercano (single)
hclust.single.caso3 <- hclust(matriz.dis.euclid.caso3,method = "single")
plot(hclust.single.caso3)# Centroide (centroid)
hclust.centroid.caso3 <- hclust(matriz.dis.euclid.caso3,method = "centroid")
plot(hclust.centroid.caso3)library(NbClust)
# Ward (ward.D2)
res.wardD2 <- NbClust(matriz_datos_3_3,distance = "euclidean",min.nc = 2,max.nc = 15, method = "ward.D2", index = "alllong")## Warning in pf(beale, pp, df2): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in pf(beale, pp, df2): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in log(det(P)/det(W)): Se han producido NaNs
## Warning in pf(beale, pp, df2): Se han producido NaNs
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
# Vecino mas lejano
res.complete <- NbClust(matriz_datos_3_3,distance = "euclidean",min.nc = 2,max.nc = 15, method = "complete", index = "alllong")## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
grupo.ward <- cutree(hclust.ward.caso3,k = 2,h = NULL)
datos.caso3.grupos <- cbind(matriz_datos_3_3,grupo.ward)
round(aggregate(datos.caso3.grupos,list(grupo.ward),mean),2)datos.caso3.grupos.kmeans <- datos.caso3.grupos[, -ncol(datos.caso3.grupos)]
c1 <- c(70.52, 98.29, 65.35, 35.70, 17.03, 85.87)
c2 <- c(63.40, 95.58, 48.12, 24.15, 11.98, 79.92)
solucion_kmeans <- kmeans(datos.caso3.grupos.kmeans, rbind(c1,c2))
print(solucion_kmeans)## K-means clustering with 2 clusters of sizes 11, 7
##
## Cluster means:
## automovil tv_color video microondas lavavajillas telefono
## 1 70.65455 98.25455 65.90000 37.17273 17.48182 86.27273
## 2 64.20000 96.02857 49.71429 23.48571 12.00000 80.12857
##
## Clustering vector:
## [1] 1 1 1 2 1 1 2 2 2 1 1 2 2 1 2 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 2764.2745 988.5257
## (between_SS / total_SS = 39.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# automovil
t_test_automovil <- t.test(automovil~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)
# tv_color
t_test_tvcolor <- t.test(tv_color~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)
# video
t_test_video <- t.test(video~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)
# microondas
t_test_microond <- t.test(microondas~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)
# lavavajillas
t_test_lavavaji <- t.test(lavavajillas~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)
# telefono
t_test_telefono <- t.test(telefono~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)
# Resultados
resultados_t_test <- data.frame(
Variable = c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono"),
Grupo_1 = c(
t_test_automovil[["estimate"]][["mean in group 1"]],
t_test_tvcolor[["estimate"]][["mean in group 1"]],
t_test_video[["estimate"]][["mean in group 1"]],
t_test_microond[["estimate"]][["mean in group 1"]],
t_test_lavavaji[["estimate"]][["mean in group 1"]],
t_test_telefono[["estimate"]][["mean in group 1"]]
),
Grupo_2 = c(
t_test_automovil[["estimate"]][["mean in group 2"]],
t_test_tvcolor[["estimate"]][["mean in group 2"]],
t_test_video[["estimate"]][["mean in group 2"]],
t_test_microond[["estimate"]][["mean in group 2"]],
t_test_lavavaji[["estimate"]][["mean in group 2"]],
t_test_telefono[["estimate"]][["mean in group 2"]]
),
Prueba_t = c(
t_test_automovil[["statistic"]][["t"]],
t_test_tvcolor[["statistic"]][["t"]],
t_test_video[["statistic"]][["t"]],
t_test_microond[["statistic"]][["t"]],
t_test_lavavaji[["statistic"]][["t"]],
t_test_telefono[["statistic"]][["t"]]
),
P_value = c(
t_test_automovil[["p.value"]],
t_test_tvcolor[["p.value"]],
t_test_video[["p.value"]],
t_test_microond[["p.value"]],
t_test_lavavaji[["p.value"]],
t_test_telefono[["p.value"]]
)
)
resultados_t_test library(ggplot2)
mds_result <- cmdscale(matriz.dis.euclid.caso3, k = 2)
coordenadas_mds <- data.frame(Dim1 = mds_result[,1], Dim2 = mds_result[,2])
ciudades <- c("Espana", "Andalucia", "Aragon", "Asturias", "Baleares", "Canarias", "Cantabria",
"CyL", "CLM", "Cataluna", "Valencia",
"Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "P.Vasco", "Rioja")
coordenadas_mds$Ciudad <- ciudades
coordenadas_mds$Cluster <- factor(solucion_kmeans[["cluster"]])
ggplot(coordenadas_mds, aes(x = Dim1, y = Dim2, color = Cluster)) +
geom_point(size = 3) +
geom_text(aes(label = Ciudad), vjust = -0.5, hjust = 0.5, size = 3.5) +
labs(x = "Dimension 1", y = "Dimension 2",
title = "Visualizacion MDS") +
theme_minimal() +
theme(legend.position = "bottom")