A33_MAE118_GRUPO
Assignment 33
Métodos para el Análisis Económico GT-03
Instructor: Carlos Ademir Pérez Alas
Term II - 2024
Group members:
Carpaño Benites, Brandon Edenilson | CB22013
Jimenez Carrillo, Sabrina Elizabeth | JC22006
López Cabrera, Katherine Lissette | LC22029
Data Table
library(dplyr)
library(knitr)
library(kableExtra)
data <- tibble::tribble(
~"CC.AA.", ~"Automovil", ~"TV-Color", ~"Video", ~"Microondas", ~"Lavajillas", ~"Telefono",
"Espana", 69.0, 97.6, 62.4, 32.3, 17.0, 85.2,
"Andalucia", 66.7, 98.0, 82.7, 24.1, 12.7, 74.7,
"Aragon", 67.2, 97.5, 56.8, 43.4, 20.6, 88.4,
"Asturias", 63.7, 95.2, 52.1, 24.4, 13.3, 88.1,
"Balerares", 71.9, 98.8, 62.4, 29.8, 10.1, 87.9,
"Canarias", 72.7, 96.8, 68.4, 27.9, 5.8, 75.4,
"Cantabria", 63.4, 94.9, 48.9, 36.5, 11.2, 80.5,
"Castilla y Leon", 65.8, 97.1, 47.7, 28.1, 14.0, 85.0,
"Cast.-La Mancha", 61.5, 97.3, 53.6, 21.7, 7.1, 72.9,
"Cataluna", 70.4, 98.1, 71.1, 36.8, 19.8, 92.2,
"Com.-Valenciana", 72.7, 98.4, 68.2, 26.6, 12.1, 84.4,
"Extremadura", 60.5, 97.7, 43.7, 20.7, 11.7, 67.1,
"Galicia", 65.5, 91.3, 42.7, 13.5, 14.6, 85.9,
"Madrid", 74.0, 99.4, 76.3, 53.9, 32.3, 95.7,
"Murcia", 69.0, 98.7, 59.3, 19.5, 12.1, 81.4,
"Navarra", 76.4, 99.3, 60.6, 44.0, 20.6, 87.4,
"Pais Vasco", 71.3, 98.3, 61.6, 45.7, 23.7, 94.3,
"La Rioja", 64.9, 98.6, 54.4, 44.4, 17.6, 83.4
)
data %>% kbl(caption = "Tabla de datos", format = "html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
CC.AA. | Automovil | TV-Color | Video | Microondas | Lavajillas | Telefono |
---|---|---|---|---|---|---|
Espana | 69.0 | 97.6 | 62.4 | 32.3 | 17.0 | 85.2 |
Andalucia | 66.7 | 98.0 | 82.7 | 24.1 | 12.7 | 74.7 |
Aragon | 67.2 | 97.5 | 56.8 | 43.4 | 20.6 | 88.4 |
Asturias | 63.7 | 95.2 | 52.1 | 24.4 | 13.3 | 88.1 |
Baleares | 71.9 | 98.8 | 62.4 | 29.8 | 10.1 | 87.9 |
Canarias | 72.7 | 96.8 | 68.4 | 27.9 | 5.8 | 75.4 |
Cantabria | 63.4 | 94.9 | 48.9 | 36.5 | 11.2 | 80.5 |
Castilla y Leon | 65.8 | 97.1 | 47.7 | 28.1 | 14.0 | 85.0 |
Cast.-La Mancha | 61.5 | 97.3 | 53.6 | 21.7 | 7.1 | 72.9 |
Cataluna | 70.4 | 98.1 | 71.1 | 36.8 | 19.8 | 92.2 |
Com.-Valenciana | 72.7 | 98.4 | 68.2 | 26.6 | 12.1 | 84.4 |
Extremadura | 60.5 | 97.7 | 43.7 | 20.7 | 11.7 | 67.1 |
Galicia | 65.5 | 91.3 | 42.7 | 13.5 | 14.6 | 85.9 |
Madrid | 74.0 | 99.4 | 76.3 | 53.9 | 32.3 | 95.7 |
Murcia | 69.0 | 98.7 | 59.3 | 19.5 | 12.1 | 81.4 |
Navarra | 76.4 | 99.3 | 60.6 | 44.0 | 20.6 | 87.4 |
Pais Vasco | 71.3 | 98.3 | 61.6 | 45.7 | 23.7 | 94.3 |
La Rioja | 64.9 | 98.6 | 54.4 | 44.4 | 17.6 | 83.4 |
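Before clustering, the six equipment variables can be summarized directly from the table above; a minimal sketch using base R (the column names match the tribble defined earlier):

summary(data[, c("Automovil", "TV-Color", "Video", "Microondas", "Lavajillas", "Telefono")])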
Outlier Detection
library(MASS)
library(ggplot2)
data_numeric <- data %>% select_if(is.numeric)
# Mahalanobis distance of each observation from the multivariate mean
mahalanobis_dist <- mahalanobis(
x = data_numeric,
center = colMeans(data_numeric, na.rm = TRUE),
cov = cov(data_numeric, use = "pairwise.complete.obs")
)
# Chi-square cutoff (df = number of variables): distances above it flag potential outliers
p_value <- 0.001
threshold <- qchisq(1 - p_value, df = ncol(data_numeric))
# Flag the outliers
data <- data %>% mutate(Mahalanobis = mahalanobis_dist,
Outlier = ifelse(Mahalanobis > threshold, "Si", "No"))
# Plot the Mahalanobis distances and mark the flagged observations
ggplot(data, aes(x = seq_along(Mahalanobis), y = Mahalanobis, color = Outlier)) +
geom_point(size = 3) +
geom_hline(yintercept = threshold, linetype = "dashed", color = "blue") +
labs(
title = "Deteccion de Outliers mediante la Distancia de Mahalanobis",
x = "Observaciones",
y = "Distancia de Mahalanobis"
) +
theme_minimal()
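To see exactly which regions are flagged, the rows above the threshold can be listed; a minimal sketch (note that loading MASS masks dplyr::select, so the namespaced calls are used):

# Regions flagged as outliers and their Mahalanobis distances
data %>%
  dplyr::filter(Outlier == "Si") %>%
  dplyr::select(CC.AA., Mahalanobis)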
Dendrograms
library(factoextra)
# Euclidean distances on the standardized variables
dist_matrix <- dist(scale(data_numeric))
# Ward linkage
hc_ward <- hclust(dist_matrix, method = "ward.D2")
fviz_dend(hc_ward, main = "Ward Method", cex = 0.8)
# Complete linkage
hc_complete <- hclust(dist_matrix, method = "complete")
fviz_dend(hc_complete, main = "Complete Method", cex = 0.8)
# Average linkage
hc_average <- hclust(dist_matrix, method = "average")
fviz_dend(hc_average, main = "Average Method", cex = 0.8)
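One quick way to compare the three linkages is the cophenetic correlation, which measures how faithfully each dendrogram preserves the original distance matrix; a minimal sketch using base stats functions:

# Correlation between the original distances and each tree's cophenetic distances
cor(dist_matrix, cophenetic(hc_ward))
cor(dist_matrix, cophenetic(hc_complete))
cor(dist_matrix, cophenetic(hc_average))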
Index Generation
library(NbClust)
Datos.NbClust <- data[,c("Automovil", "TV-Color", "Video", "Microondas", "Lavajillas", "Telefono")]
res.wardD2 <- NbClust(Datos.NbClust, distance = "euclidean", min.nc=2, max.nc=15, method = "ward.D2", index = "alllong")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
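The vote counts reported above can also be recovered from the returned object; a minimal sketch, assuming NbClust's usual Best.nc component (a matrix whose "Number_clusters" row gives each index's proposal):

# How many indices vote for each number of clusters
table(res.wardD2$Best.nc["Number_clusters", ])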
Centroids
library(cluster)
hc_ward <- hclust(dist_matrix, method = "ward.D2")
grupo.ward <- cutree(hc_ward, k = 2, h = NULL)
datos.caso3.grupos <- cbind(Datos.NbClust, grupo.ward)
datos.caso3.grupos$id <- NULL
round(aggregate(datos.caso3.grupos, list(grupo.ward), mean ), 2)
## Group.1 Automovil TV-Color Video Microondas Lavajillas Telefono grupo.ward
## 1 1 70.52 98.29 65.35 35.70 17.03 85.87 1
## 2 2 63.40 95.58 48.12 24.15 11.98 79.92 2
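A minimal check of how many regions fall in each Ward group before moving on to k-means:

# Size of each Ward cluster
table(grupo.ward)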
datos.caso3.grupos.kmeans <- data_numeric
# Initial centroids used to seed the k-means algorithm
c1 <- c(66.87, 96.82, 56.01, 25.43, 11.81, 80.71)
c2 <- c(70.70, 98.53, 63.47, 44.70, 22.43, 90.23)
solucion <- kmeans(datos.caso3.grupos.kmeans, rbind(c1, c2))
# Final centroids
centroides_fin <- solucion$centers
# Display the results
kable(centroides_fin, caption = "Final Cluster Centroids")
Automovil | TV-Color | Video | Microondas | Lavajillas | Telefono |
---|---|---|---|---|---|
66.86667 | 96.81667 | 57.67500 | 25.425 | 11.80833 | 80.70833 |
70.70000 | 98.53333 | 63.46667 | 44.700 | 22.43333 | 90.23333 |
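To see which regions end up in each k-means cluster, the assignment vector can be split by group; a minimal sketch using the solucion object created above:

# Regions belonging to each final k-means cluster
split(data$CC.AA., solucion$cluster)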
# Ward method
hc_ward_2 <- hclust(dist_matrix, method = "ward.D2")
fviz_dend(hc_ward_2, k = 2, cex = 0.7,
          main = "Dendrogram - Ward Method",
          rect = TRUE, rect_fill = FALSE,
          rect_border = "red")
# Average method
hc_average_2 <- hclust(dist_matrix, method = "average")
fviz_dend(hc_average_2, k = 2, cex = 0.7,
          main = "Dendrogram - Average Method",
          rect = TRUE, rect_fill = FALSE,
          rect_border = "red")
# Complete method
hc_complete_2 <- hclust(dist_matrix, method = "complete")
fviz_dend(hc_complete_2, k = 2, cex = 0.7,
          main = "Dendrogram - Complete Method",
          rect = TRUE, rect_fill = FALSE,
          rect_border = "red")
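As a complementary check of the two-cluster solution, silhouette widths can be computed from the k-means assignments and the scaled distance matrix already built for the dendrograms; a minimal sketch with the loaded cluster and factoextra packages:

# Silhouette widths of the k-means partition (values near 1 indicate well-separated clusters)
sil <- silhouette(solucion$cluster, dist_matrix)
fviz_silhouette(sil)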
data_cluster <- as.factor(solucion$cluster)
# Empty data frame to collect the group means and p-values
# (column names must match those used in the rbind below)
resultados_t <- data.frame(
  Variable = character(),
  Grupo_1_Media = numeric(),
  Grupo_2_Media = numeric(),
  Prueba_t = numeric(),
  stringsAsFactors = FALSE
)
# Two-sample t-test of each variable between the two k-means clusters
for (variable in colnames(datos.caso3.grupos.kmeans)) {
  grupo1 <- datos.caso3.grupos.kmeans[data_cluster == 1, variable, drop = TRUE]
  grupo2 <- datos.caso3.grupos.kmeans[data_cluster == 2, variable, drop = TRUE]
  prueba_t <- t.test(grupo1, grupo2)
  resultados_t <- rbind(resultados_t, data.frame(
    Variable = variable,
    Grupo_1_Media = mean(grupo1, na.rm = TRUE),
    Grupo_2_Media = mean(grupo2, na.rm = TRUE),
    Prueba_t = prueba_t$p.value
  ))
}
kable(resultados_t, caption = "t-Test Results for Each Variable")
Variable | Grupo_1_Media | Grupo_2_Media | Prueba_t |
---|---|---|---|
Automovil | 66.86667 | 70.70000 | 0.1000345 |
TV-Color | 96.81667 | 98.53333 | 0.0238242 |
Video | 57.67500 | 63.46667 | 0.2548580 |
Microondas | 25.42500 | 44.70000 | 0.0000283 |
Lavajillas | 11.80833 | 22.43333 | 0.0026595 |
Telefono | 80.70833 | 90.23333 | 0.0034967 |
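Because six t-tests are run simultaneously, the p-values can optionally be adjusted for multiple comparisons; a minimal sketch using base R's p.adjust with the Bonferroni correction (the adjusted column name is illustrative):

# Bonferroni-adjusted p-values for the six comparisons
resultados_t$Prueba_t_ajustada <- p.adjust(resultados_t$Prueba_t, method = "bonferroni")
kable(resultados_t, caption = "t-Tests with Bonferroni-Adjusted p-Values")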