# Crear un data.frame con los datos del cuadro 3.22
Datos_caso_3_3<- data.frame(
CC.AA. = c("España", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", "Cantabria", "Castilla y León", "Castilla-La Mancha", "Cataluña", "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja"),
automovil = c(69.0, 66.7, 67.2, 63.7, 71.9, 72.7, 63.4, 65.8, 61.5, 70.4, 72.7, 60.5, 65.5, 74.0, 69.0, 76.4, 71.3, 64.9),
tvcolor = c(97.6, 98.0, 97.5, 95.2, 98.8, 96.8, 94.9, 97.1, 97.3, 98.1, 98.4, 97.7, 91.3, 99.4, 98.7, 99.3, 98.3, 98.6),
video = c(62.4, 82.7, 56.8, 52.1, 62.4, 68.4, 48.9, 47.7, 53.6, 71.1, 68.2, 43.7, 42.7, 76.3, 59.3, 60.6, 61.6, 54.4),
microond = c(32.3, 24.1, 43.4, 24.4, 29.8, 27.9, 36.5, 28.1, 21.7, 36.8, 26.6, 20.7, 13.5, 53.9, 19.5, 44.0, 45.7, 44.4),
lavavaji = c(17.0, 12.7, 20.6, 13.3, 10.1, 5.80, 11.2, 14.0, 7.10, 19.8, 12.1, 11.7, 14.6, 32.3, 12.1, 20.6, 23.7, 17.6),
telefono = c(85.2, 74.7, 88.4, 88.1, 87.9, 75.4, 80.5, 85.0, 72.9, 92.2, 84.4, 67.1, 85.9, 95.7, 81.4, 87.4, 94.3, 83.4)
)
Datos_caso_3_3
## CC.AA. automovil tvcolor video microond lavavaji telefono
## 1 España 69.0 97.6 62.4 32.3 17.0 85.2
## 2 Andalucía 66.7 98.0 82.7 24.1 12.7 74.7
## 3 Aragón 67.2 97.5 56.8 43.4 20.6 88.4
## 4 Asturias 63.7 95.2 52.1 24.4 13.3 88.1
## 5 Baleares 71.9 98.8 62.4 29.8 10.1 87.9
## 6 Canarias 72.7 96.8 68.4 27.9 5.8 75.4
## 7 Cantabria 63.4 94.9 48.9 36.5 11.2 80.5
## 8 Castilla y León 65.8 97.1 47.7 28.1 14.0 85.0
## 9 Castilla-La Mancha 61.5 97.3 53.6 21.7 7.1 72.9
## 10 Cataluña 70.4 98.1 71.1 36.8 19.8 92.2
## 11 Com. Valenciana 72.7 98.4 68.2 26.6 12.1 84.4
## 12 Extremadura 60.5 97.7 43.7 20.7 11.7 67.1
## 13 Galicia 65.5 91.3 42.7 13.5 14.6 85.9
## 14 Madrid 74.0 99.4 76.3 53.9 32.3 95.7
## 15 Murcia 69.0 98.7 59.3 19.5 12.1 81.4
## 16 Navarra 76.4 99.3 60.6 44.0 20.6 87.4
## 17 País Vasco 71.3 98.3 61.6 45.7 23.7 94.3
## 18 La Rioja 64.9 98.6 54.4 44.4 17.6 83.4
#Matriz de datos
matriz_datos_3_3 <- as.matrix(Datos_caso_3_3[, c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono")])
library(dplyr) # <-- necesario para %>%
## Warning: package 'dplyr' was built under R version 4.4.1
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
##
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
mean <- colMeans(matriz_datos_3_3)
Sx <- cov(matriz_datos_3_3)
D2 <- mahalanobis(matriz_datos_3_3, mean, Sx)
p_value <- pchisq(D2, df = ncol(matriz_datos_3_3), lower.tail = FALSE)
Resultado <- data.frame(
CC.AA. = c("Espana", "Andalucía", "Aragón", "Asturias", "Baleares", "Canarias", "Cantabria", "Castilla y Leon", "Castilla-La Mancha", "Cataluna", "Com. Valenciana", "Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "País Vasco", "La Rioja"),
D2 = D2,
p_value = p_value
)
Resultado %>%
kable(caption = "D2 Y PVALUE", align = "c",digits = 2) %>%
kable_classic(html_font = "Times New Roman", font_size = 14) %>% kable_styling()
## Warning in attr(x, "align"): 'xfun::attr()' está en desuso.
## Utilizar 'xfun::attr2()' en su lugar.
## Ver help("Deprecated")
## Warning in attr(x, "align"): 'xfun::attr()' está en desuso.
## Utilizar 'xfun::attr2()' en su lugar.
## Ver help("Deprecated")
| CC.AA. | D2 | p_value |
|---|---|---|
| Espana | 0.20 | 1.00 |
| Andalucía | 10.52 | 0.10 |
| Aragón | 1.91 | 0.93 |
| Asturias | 4.46 | 0.61 |
| Baleares | 5.70 | 0.46 |
| Canarias | 9.58 | 0.14 |
| Cantabria | 7.29 | 0.29 |
| Castilla y Leon | 2.21 | 0.90 |
| Castilla-La Mancha | 3.54 | 0.74 |
| Cataluna | 2.95 | 0.82 |
| Com. Valenciana | 2.65 | 0.85 |
| Extremadura | 10.43 | 0.11 |
| Galicia | 13.24 | 0.04 |
| Madrid | 8.31 | 0.22 |
| Murcia | 4.88 | 0.56 |
| Navarra | 7.65 | 0.26 |
| País Vasco | 2.32 | 0.89 |
| La Rioja | 4.17 | 0.65 |
matriz.dis.euclid.caso3 <- dist(matriz_datos_3_3, method = "euclidean")
# Promedio (average)
hclust.average.caso3 <- hclust(matriz.dis.euclid.caso3,method = "average")
plot(hclust.average.caso3)
# Ward (ward.D2)
hclust.ward.caso3 <- hclust(matriz.dis.euclid.caso3,method = "ward.D2")
plot(hclust.ward.caso3)
# Vecino mas lejano (complete)
hclust.complete.caso3 <- hclust(matriz.dis.euclid.caso3,method = "complete")
plot(hclust.complete.caso3)
# Vecino mas cercano (single)
hclust.single.caso3 <- hclust(matriz.dis.euclid.caso3,method = "single")
plot(hclust.single.caso3)
# Centroide (centroid)
hclust.centroid.caso3 <- hclust(matriz.dis.euclid.caso3,method = "centroid")
plot(hclust.centroid.caso3)
library(NbClust)
# Ward (ward.D2)
res.wardD2 <- NbClust(matriz_datos_3_3,distance = "euclidean",min.nc = 2,max.nc = 15, method = "ward.D2", index = "alllong")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
# Vecino mas lejano
res.complete <- NbClust(matriz_datos_3_3,distance = "euclidean",min.nc = 2,max.nc = 15, method = "complete", index = "alllong")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
grupo.ward <- cutree(hclust.ward.caso3,k = 2,h = NULL)
datos.caso3.grupos <- cbind(matriz_datos_3_3,grupo.ward)
round(aggregate(datos.caso3.grupos,list(grupo.ward),mean),2)
## Group.1 automovil tvcolor video microond lavavaji telefono grupo.ward
## 1 1 66.87 96.82 57.68 25.42 11.81 80.71 1
## 2 2 70.70 98.53 63.47 44.70 22.43 90.23 2
#Análisis de conglomerados no jerárquicos
datos.caso3.grupos.kmeans <- datos.caso3.grupos[, -ncol(datos.caso3.grupos)]
c1 <- c(70.52, 98.29, 65.35, 35.70, 17.03, 85.87)
c2 <- c(63.40, 95.58, 48.12, 24.15, 11.98, 79.92)
solucion_kmeans <- kmeans(datos.caso3.grupos.kmeans, rbind(c1,c2))
print(solucion_kmeans)
## K-means clustering with 2 clusters of sizes 11, 7
##
## Cluster means:
## automovil tvcolor video microond lavavaji telefono
## 1 70.65455 98.25455 65.90000 37.17273 17.48182 86.27273
## 2 64.20000 96.02857 49.71429 23.48571 12.00000 80.12857
##
## Clustering vector:
## [1] 1 1 1 2 1 1 2 2 2 1 1 2 2 1 2 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 2764.2745 988.5257
## (between_SS / total_SS = 39.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# automovil
t_test_automovil <- t.test(automovil~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)
# tv_color
t_test_tvcolor <- t.test(tvcolor~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)
# video
t_test_video <- t.test(video~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)
# microondas
t_test_microond <- t.test(microond~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)
# lavavajillas
t_test_lavavaji <- t.test(lavavaji~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)
# telefono
t_test_telefono <- t.test(telefono~solucion_kmeans[["cluster"]],data = datos.caso3.grupos.kmeans)
# Resultados
resultados_t_test <- data.frame(
Variable = c("automovil", "tvcolor", "video", "microond", "lavavaji", "telefono"),
Grupo_1 = c(
t_test_automovil[["estimate"]][["mean in group 1"]],
t_test_tvcolor[["estimate"]][["mean in group 1"]],
t_test_video[["estimate"]][["mean in group 1"]],
t_test_microond[["estimate"]][["mean in group 1"]],
t_test_lavavaji[["estimate"]][["mean in group 1"]],
t_test_telefono[["estimate"]][["mean in group 1"]]
),
Grupo_2 = c(
t_test_automovil[["estimate"]][["mean in group 2"]],
t_test_tvcolor[["estimate"]][["mean in group 2"]],
t_test_video[["estimate"]][["mean in group 2"]],
t_test_microond[["estimate"]][["mean in group 2"]],
t_test_lavavaji[["estimate"]][["mean in group 2"]],
t_test_telefono[["estimate"]][["mean in group 2"]]
),
Prueba_t = c(
t_test_automovil[["statistic"]][["t"]],
t_test_tvcolor[["statistic"]][["t"]],
t_test_video[["statistic"]][["t"]],
t_test_microond[["statistic"]][["t"]],
t_test_lavavaji[["statistic"]][["t"]],
t_test_telefono[["statistic"]][["t"]]
),
P_value = c(
t_test_automovil[["p.value"]],
t_test_tvcolor[["p.value"]],
t_test_video[["p.value"]],
t_test_microond[["p.value"]],
t_test_lavavaji[["p.value"]],
t_test_telefono[["p.value"]]
)
)
resultados_t_test
## Variable Grupo_1 Grupo_2 Prueba_t P_value
## 1 automovil 70.65455 64.20000 4.309306 0.0006522080
## 2 tvcolor 98.25455 96.02857 2.300488 0.0562670096
## 3 video 65.90000 49.71429 4.808811 0.0001987238
## 4 microond 37.17273 23.48571 3.412048 0.0037381119
## 5 lavavaji 17.48182 12.00000 2.302470 0.0380855599
## 6 telefono 86.27273 80.12857 1.744970 0.1068307950
library(ggplot2)
mds_result <- cmdscale(matriz.dis.euclid.caso3, k = 2)
coordenadas_mds <- data.frame(Dim1 = mds_result[,1], Dim2 = mds_result[,2])
ciudades <- c("Espana", "Andalucia", "Aragon", "Asturias", "Baleares", "Canarias", "Cantabria",
"CyL", "CLM", "Cataluna", "Valencia",
"Extremadura", "Galicia", "Madrid", "Murcia", "Navarra", "P.Vasco", "Rioja")
coordenadas_mds$Ciudad <- ciudades
coordenadas_mds$Cluster <- factor(solucion_kmeans[["cluster"]])
ggplot(coordenadas_mds, aes(x = Dim1, y = Dim2, color = Cluster)) +
geom_point(size = 3) +
geom_text(aes(label = Ciudad), vjust = -0.5, hjust = 0.5, size = 3.5) +
labs(x = "Dimension 1", y = "Dimension 2",
title = "Visualizacion MDS") +
theme_minimal() +
theme(legend.position = "bottom")