# Crear un data.frame con los datos del cuadro 3.22
Datos_caso_3_3<- data.frame(
    CC.AA. = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", "Cantabria", "Castilla y León", "Castilla-La Mancha", "Cataluña", "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja"),
    automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
    tvcolor = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
    video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
    microond = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
    lavavaji = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.80, 11.2, 14.0, 7.10, 19.8, 12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
    telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
    )
Datos_caso_3_3

##                CC.AA. automovil tvcolor video microond lavavaji telefono
## 1              España      69.0    97.6  62.4     32.3     17.0     85.2
## 2           Andalucía      66.7    98.0  82.7     24.1     12.7     74.7
## 3              Aragón      67.2    97.5  56.8     43.4     20.6     88.4
## 4            Asturias      63.7    95.2  52.1     24.4     13.3     88.1
## 5            Baleares      71.9    98.8  62.4     29.8     10.1     87.9
## 6            Canarias      72.7    96.8  68.4     27.9      5.8     75.4
## 7           Cantabria      63.4    94.9  48.9     36.5     11.2     80.5
## 8     Castilla y León      65.8    97.1  47.7     28.1     14.0     85.0
## 9  Castilla-La Mancha      61.5    97.3  53.6     21.7      7.1     72.9
## 10           Cataluña      70.4    98.1  71.1     36.8     19.8     92.2
## 11    Com. Valenciana      72.7    98.4  68.2     26.6     12.1     84.4
## 12        Extremadura      60.5    97.7  43.7     20.7     11.7     67.1
## 13            Galicia      65.5    91.3  42.7     13.5     14.6     85.9
## 14             Madrid      74.0    99.4  76.3     53.9     32.3     95.7
## 15             Murcia      69.0    98.7  59.3     19.5     12.1     81.4
## 16            Navarra      76.4    99.3  60.6     44.0     20.6     87.4
## 17         País Vasco      71.3    98.3  61.6     45.7     23.7     94.3
## 18           La Rioja      64.9    98.6  54.4     44.4     17.6     83.4

#Matriz de datos
matriz_datos_3_3 <- as.matrix(Datos_caso_3_3[, c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono")])

Detección de los outliers mediante la distacia de Mahalanobis

library(dplyr)        # <-- necesario para %>%

## Warning: package 'dplyr' was built under R version 4.4.1

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(kableExtra)

## 
## Adjuntando el paquete: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

mean <- colMeans(matriz_datos_3_3)
Sx <- cov(matriz_datos_3_3)

D2 <- mahalanobis(matriz_datos_3_3, mean, Sx)

p_value <- pchisq(D2, df = ncol(matriz_datos_3_3), lower.tail = FALSE)

Resultado <- data.frame(
  CC.AA. = c("Espana", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", "Cantabria", "Castilla y Leon", "Castilla-La Mancha", "Cataluna", "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja"),
  D2 = D2,
  p_value = p_value
)

Resultado %>% 
  kable(caption = "D2 Y PVALUE", align = "c",digits = 2) %>%  
  kable_classic(html_font = "Times New Roman", font_size = 14) %>%  kable_styling()

## Warning in attr(x, "align"): 'xfun::attr()' está en desuso.
## Utilizar 'xfun::attr2()' en su lugar.
## Ver help("Deprecated")

## Warning in attr(x, "align"): 'xfun::attr()' está en desuso.
## Utilizar 'xfun::attr2()' en su lugar.
## Ver help("Deprecated")

D2 Y PVALUE
CC.AA.	D2	p_value
Espana	0.20	1.00
Andalucía	10.52	0.10
Aragón	1.91	0.93
Asturias	4.46	0.61
Baleares	5.70	0.46
Canarias	9.58	0.14
Cantabria	7.29	0.29
Castilla y Leon	2.21	0.90
Castilla-La Mancha	3.54	0.74
Cataluna	2.95	0.82
Com. Valenciana	2.65	0.85
Extremadura	10.43	0.11
Galicia	13.24	0.04
Madrid	8.31	0.22
Murcia	4.88	0.56
Navarra	7.65	0.26
País Vasco	2.32	0.89
La Rioja	4.17	0.65

Análisis de conglomerados gerárquicos

matriz.dis.euclid.caso3 <- dist(matriz_datos_3_3, method = "euclidean")

# Promedio (average)
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3,method = "average")

plot(hclust.average.caso3)

# Ward (ward.D2)
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3,method = "ward.D2")

plot(hclust.ward.caso3)

# Vecino mas lejano (complete)
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3,method = "complete")

plot(hclust.complete.caso3)

# Vecino mas cercano (single)
hclust.single.caso3 <- hclust(matriz.dis.euclid.caso3,method = "single")

plot(hclust.single.caso3)

# Centroide (centroid)
hclust.centroid.caso3 <- hclust(matriz.dis.euclid.caso3,method = "centroid")

plot(hclust.centroid.caso3)

Determinación del número de conglomerados

library(NbClust)

# Ward (ward.D2)
res.wardD2 <- NbClust(matriz_datos_3_3,distance = "euclidean",min.nc = 2,max.nc = 15, method = "ward.D2", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

# Vecino mas lejano 
res.complete <- NbClust(matriz_datos_3_3,distance = "euclidean",min.nc = 2,max.nc = 15, method = "complete", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Centroides resultantes del método jerárquico

grupo.ward <- cutree(hclust.ward.caso3,k = 2,h = NULL)
datos.caso3.grupos <- cbind(matriz_datos_3_3,grupo.ward)

round(aggregate(datos.caso3.grupos,list(grupo.ward),mean),2)

##   Group.1 automovil tvcolor video microond lavavaji telefono grupo.ward
## 1       1     66.87   96.82 57.68    25.42    11.81    80.71          1
## 2       2     70.70   98.53 63.47    44.70    22.43    90.23          2

#Análisis de conglomerados no jerárquicos

datos.caso3.grupos.kmeans <- datos.caso3.grupos[, -ncol(datos.caso3.grupos)]

c1 <- c(70.52,  98.29,  65.35,  35.70,  17.03,  85.87)
c2 <- c(63.40,  95.58,  48.12,  24.15,  11.98,  79.92)
solucion_kmeans <- kmeans(datos.caso3.grupos.kmeans, rbind(c1,c2)) 
  print(solucion_kmeans)

## K-means clustering with 2 clusters of sizes 11, 7
## 
## Cluster means:
##   automovil  tvcolor    video microond lavavaji telefono
## 1  70.65455 98.25455 65.90000 37.17273 17.48182 86.27273
## 2  64.20000 96.02857 49.71429 23.48571 12.00000 80.12857
## 
## Clustering vector:
##  [1] 1 1 1 2 1 1 2 2 2 1 1 2 2 1 2 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 2764.2745  988.5257
##  (between_SS / total_SS =  39.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Prueba T

# automovil
t_test_automovil <- t.test(automovil~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# tv_color
t_test_tvcolor <- t.test(tvcolor~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# video
t_test_video <- t.test(video~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# microondas
t_test_microond <- t.test(microond~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# lavavajillas
t_test_lavavaji <- t.test(lavavaji~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

# telefono
t_test_telefono <- t.test(telefono~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)

 # Resultados
resultados_t_test <- data.frame(
  Variable = c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono"),
  Grupo_1 = c(
    t_test_automovil[["estimate"]][["mean in group 1"]],
    t_test_tvcolor[["estimate"]][["mean in group 1"]],
    t_test_video[["estimate"]][["mean in group 1"]],
    t_test_microond[["estimate"]][["mean in group 1"]],
    t_test_lavavaji[["estimate"]][["mean in group 1"]],
    t_test_telefono[["estimate"]][["mean in group 1"]]
  ),
  Grupo_2 = c(
    t_test_automovil[["estimate"]][["mean in group 2"]],
    t_test_tvcolor[["estimate"]][["mean in group 2"]],
    t_test_video[["estimate"]][["mean in group 2"]],
    t_test_microond[["estimate"]][["mean in group 2"]],
    t_test_lavavaji[["estimate"]][["mean in group 2"]],
    t_test_telefono[["estimate"]][["mean in group 2"]]
  ),
  Prueba_t = c(
    t_test_automovil[["statistic"]][["t"]],
    t_test_tvcolor[["statistic"]][["t"]],
    t_test_video[["statistic"]][["t"]],
    t_test_microond[["statistic"]][["t"]],
    t_test_lavavaji[["statistic"]][["t"]],
    t_test_telefono[["statistic"]][["t"]]
  ),
    P_value = c(
    t_test_automovil[["p.value"]],
    t_test_tvcolor[["p.value"]],
    t_test_video[["p.value"]],
    t_test_microond[["p.value"]],
    t_test_lavavaji[["p.value"]],
    t_test_telefono[["p.value"]]
    )
)

resultados_t_test

##    Variable  Grupo_1  Grupo_2 Prueba_t      P_value
## 1 automovil 70.65455 64.20000 4.309306 0.0006522080
## 2   tvcolor 98.25455 96.02857 2.300488 0.0562670096
## 3     video 65.90000 49.71429 4.808811 0.0001987238
## 4  microond 37.17273 23.48571 3.412048 0.0037381119
## 5  lavavaji 17.48182 12.00000 2.302470 0.0380855599
## 6  telefono 86.27273 80.12857 1.744970 0.1068307950

Visualización de los resultados del análisis de conglomerados

library(ggplot2)

mds_result <- cmdscale(matriz.dis.euclid.caso3, k = 2)
coordenadas_mds <- data.frame(Dim1 = mds_result[,1], Dim2 = mds_result[,2])

ciudades <- c("Espana", "Andalucia", "Aragon", "Asturias", "Baleares", "Canarias", "Cantabria", 
              "CyL", "CLM", "Cataluna", "Valencia", 
              "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "P.Vasco", "Rioja")

coordenadas_mds$Ciudad <- ciudades
coordenadas_mds$Cluster <- factor(solucion_kmeans[["cluster"]])

ggplot(coordenadas_mds, aes(x = Dim1, y = Dim2, color = Cluster)) +
  geom_point(size = 3) +
  geom_text(aes(label = Ciudad), vjust = -0.5, hjust = 0.5, size = 3.5) +  
  labs(x = "Dimension 1", y = "Dimension 2", 
       title = "Visualizacion MDS") +
  theme_minimal() +
  theme(legend.position = "bottom")

A32: APLICACIÓN DE CLÚSTER

YASMIN AZUCENA CALDERON GARCIA GC19041

2025-11-29

Detección de los outliers mediante la distacia de Mahalanobis

Análisis de conglomerados gerárquicos

Determinación del número de conglomerados

Centroides resultantes del método jerárquico

Prueba T

Visualización de los resultados del análisis de conglomerados