A33_MAE118_GRUPO

Tarea 33

Métodos para el Análisis Económico GT-03

Docente: Carlos Ademir Pérez Alas

Ciclo II - 2024

Integrantes:

Carpaño Benites, Brandon Edenilson CB22013
Jimenez Carrillo, Sabrina Elizabeth JC22006
López Cabrera, Katherine Lissette LC22029

Tabla

library(dplyr)
library(knitr)
library(kableExtra)

data <- tibble::tribble(
  ~"CC.AA.", ~"Automovil", ~"TV-Color", ~"Video", ~"Microondas", ~"Lavajillas", ~"Telefono",
  "Espana", 69.0, 97.6, 62.4, 32.3, 17.0, 85.2,
  "Andalucia", 66.7, 98.0, 82.7, 24.1, 12.7, 74.7,
  "Aragon", 67.2, 97.5, 56.8, 43.4, 20.6, 88.4,
  "Asturias", 63.7, 95.2, 52.1, 24.4, 13.3, 88.1,
  "Balerares", 71.9, 98.8, 62.4, 29.8, 10.1, 87.9,
  "Canarias", 72.7, 96.8, 68.4, 27.9, 5.8, 75.4,
  "Cantabria", 63.4, 94.9, 48.9, 36.5, 11.2, 80.5,
  "Castilla y Leon", 65.8, 97.1, 47.7, 28.1, 14.0, 85.0,
  "Cast.-La Mancha", 61.5, 97.3, 53.6, 21.7, 7.1, 72.9,
  "Cataluna", 70.4, 98.1, 71.1, 36.8, 19.8, 92.2,
  "Com.-Valenciana", 72.7, 98.4, 68.2, 26.6, 12.1, 84.4,
  "Extremadura", 60.5, 97.7, 43.7, 20.7, 11.7, 67.1,
  "Galicia", 65.5, 91.3, 42.7, 13.5, 14.6, 85.9,
  "Madrid", 74.0, 99.4, 76.3, 53.9, 32.3, 95.7,
  "Murcia", 69.0, 98.7, 59.3, 19.5, 12.1, 81.4,
  "Navarra", 76.4, 99.3, 60.6, 44.0, 20.6, 87.4,
  "Pais Vasco", 71.3, 98.3, 61.6, 45.7, 23.7, 94.3,
  "La Rioja", 64.9, 98.6, 54.4, 44.4, 17.6, 83.4
)

data %>% kbl(caption = "Tabla de datos", format = "html")  %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Tabla de datos
CC.AA. Automovil TV-Color Video Microondas Lavajillas Telefono
Espana 69.0 97.6 62.4 32.3 17.0 85.2
Andalucia 66.7 98.0 82.7 24.1 12.7 74.7
Aragon 67.2 97.5 56.8 43.4 20.6 88.4
Asturias 63.7 95.2 52.1 24.4 13.3 88.1
Balerares 71.9 98.8 62.4 29.8 10.1 87.9
Canarias 72.7 96.8 68.4 27.9 5.8 75.4
Cantabria 63.4 94.9 48.9 36.5 11.2 80.5
Castilla y Leon 65.8 97.1 47.7 28.1 14.0 85.0
Cast.-La Mancha 61.5 97.3 53.6 21.7 7.1 72.9
Cataluna 70.4 98.1 71.1 36.8 19.8 92.2
Com.-Valenciana 72.7 98.4 68.2 26.6 12.1 84.4
Extremadura 60.5 97.7 43.7 20.7 11.7 67.1
Galicia 65.5 91.3 42.7 13.5 14.6 85.9
Madrid 74.0 99.4 76.3 53.9 32.3 95.7
Murcia 69.0 98.7 59.3 19.5 12.1 81.4
Navarra 76.4 99.3 60.6 44.0 20.6 87.4
Pais Vasco 71.3 98.3 61.6 45.7 23.7 94.3
La Rioja 64.9 98.6 54.4 44.4 17.6 83.4

Detección de Outliers

library(MASS)
library(ggplot2)

data_numeric <- data %>% select_if(is.numeric)
# Distancia de Mahalanobis
mahalanobis_dist <- mahalanobis(
  x = data_numeric,
  center = colMeans(data_numeric, na.rm = TRUE),
  cov = cov(data_numeric, use = "pairwise.complete.obs")
)
p_value <- 0.001
threshold <- qchisq(1 - p_value, df = ncol(data_numeric))

# Los outliers
data <- data %>% mutate(Mahalanobis = mahalanobis_dist,
                        Outlier = ifelse(Mahalanobis > threshold, "Si", "No"))

# Visualización de la distancia de Mahalanobis
ggplot(data, aes(x = seq_along(Mahalanobis), y = Mahalanobis, color = Outlier)) +
  geom_point(size = 3) +
  geom_hline(yintercept = threshold, linetype = "dashed", color = "blue") +
  labs(
    title = "Deteccion de Outliers mediante la Distancia de Mahalanobis",
    x = "Observaciones",
    y = "Distancia de Mahalanobis"
  ) +
  theme_minimal()

Dendogramas

library(factoextra)
dist_matrix <- dist(scale(data_numeric))

# Método Ward
hc_ward <- hclust(dist_matrix, method = "ward.D2")
fviz_dend(hc_ward, main = "Metodo Ward", cex = 0.8)

# Método Complete
hc_complete <- hclust(dist_matrix, method = "complete")
fviz_dend(hc_complete, main = "Metodo Complete", cex = 0.8)

# Método Average
hc_average <- hclust(dist_matrix, method = "average")
fviz_dend(hc_average, main = "Metodo Average", cex = 0.8)

Generación de Índices

library(NbClust)
Datos.NbClust <- data[,c("Automovil", "TV-Color", "Video", "Microondas", "Lavajillas", "Telefono")]
res.wardD2 <- NbClust(Datos.NbClust, distance = "euclidean", min.nc=2, max.nc=15, method = "ward.D2", index = "alllong")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Centroides

library(cluster)
hc_ward <- hclust(dist_matrix, method = "ward.D2")
grupo.ward <- cutree(hc_ward, k = 2, h = NULL)
datos.caso3.grupos <- cbind(Datos.NbClust, grupo.ward)
datos.caso3.grupos$id <- NULL

round(aggregate(datos.caso3.grupos, list(grupo.ward), mean ), 2)
##   Group.1 Automovil TV-Color Video Microondas Lavajillas Telefono grupo.ward
## 1       1     70.52    98.29 65.35      35.70      17.03    85.87          1
## 2       2     63.40    95.58 48.12      24.15      11.98    79.92          2
datos.caso3.grupos.kmeans <- data_numeric
c1 <- c(66.87, 96.82, 56.01, 25.43, 11.81, 80.71)
c2 <- c(70.70, 98.53, 63.47, 44.70, 22.43, 90.23)
solucion <- kmeans(datos.caso3.grupos.kmeans, rbind(c1, c2))
# Centroides finales
centroides_fin <- solucion$centers
# Visualización de los resultados
kable(centroides_fin, caption = "Centroides Finales de los Clusters")
Centroides Finales de los Clusters
Automovil TV-Color Video Microondas Lavajillas Telefono
66.86667 96.81667 57.67500 25.425 11.80833 80.70833
70.70000 98.53333 63.46667 44.700 22.43333 90.23333
# Método Ward
hc_ward_2 <- hclust(dist_matrix, method = "ward.D2")
fviz_dend(hc_ward_2, k = 2, cex = 0.7, 
          main = "Dendrograma - Método Ward", 
          rect = TRUE, rect_fill = FALSE, 
          rect_border = "red")

# Método averege
hc_average_2 <- hclust(dist_matrix, method = "average")
fviz_dend(hc_average_2, k = 2, cex = 0.7, 
          main = "Dendrograma - Método Average", 
          rect = TRUE, rect_fill = FALSE, 
          rect_border = "red")

# Método complete
hc_complete_2 <- hclust(dist_matrix, method = "complete")
fviz_dend(hc_complete_2, k = 2, cex = 0.7, 
          main = "Dendrograma - Método Complete", 
          rect = TRUE, rect_fill = FALSE, 
          rect_border = "red")

data_cluster <- as.factor(solucion$cluster)

resultados_t <- data.frame(
  Variable = character(),
  Grupo_1 = numeric(),
  Grupo_2 = numeric(),
  Prueba_t = numeric(),
  stringsAsFactors = FALSE
)

for (variable in colnames(datos.caso3.grupos.kmeans)) {
  grupo1 <- datos.caso3.grupos.kmeans[data_cluster == 1, variable, drop = TRUE]
  grupo2 <- datos.caso3.grupos.kmeans[data_cluster == 2, variable, drop = TRUE]
  prueba_t <- t.test(grupo1, grupo2)
  
  resultados_t <- rbind(resultados_t, data.frame(
    Variable = variable,
    Grupo_1_Media = mean(grupo1, na.rm = TRUE),
    Grupo_2_Media = mean(grupo2, na.rm = TRUE),
    Prueba_t = prueba_t$p.value
  ))
}

kable(resultados_t, caption = "Resultados de la Prueba t para Cada Variable")
Resultados de la Prueba t para Cada Variable
Variable Grupo_1_Media Grupo_2_Media Prueba_t
Automovil 66.86667 70.70000 0.1000345
TV-Color 96.81667 98.53333 0.0238242
Video 57.67500 63.46667 0.2548580
Microondas 25.42500 44.70000 0.0000283
Lavajillas 11.80833 22.43333 0.0026595
Telefono 80.70833 90.23333 0.0034967