A33_MAE118_GRUPO
Assignment 33
Métodos para el Análisis Económico GT-03
Instructor: Carlos Ademir Pérez Alas
Term II - 2024
Group members:
Carpaño Benites, Brandon Edenilson | CB22013
Jimenez Carrillo, Sabrina Elizabeth | JC22006
López Cabrera, Katherine Lissette | LC22029
Data Table
library(dplyr)
library(knitr)
library(kableExtra)
data <- tibble::tribble(
~"CC.AA.", ~"Automovil", ~"TV-Color", ~"Video", ~"Microondas", ~"Lavajillas", ~"Telefono",
"Espana", 69.0, 97.6, 62.4, 32.3, 17.0, 85.2,
"Andalucia", 66.7, 98.0, 82.7, 24.1, 12.7, 74.7,
"Aragon", 67.2, 97.5, 56.8, 43.4, 20.6, 88.4,
"Asturias", 63.7, 95.2, 52.1, 24.4, 13.3, 88.1,
"Balerares", 71.9, 98.8, 62.4, 29.8, 10.1, 87.9,
"Canarias", 72.7, 96.8, 68.4, 27.9, 5.8, 75.4,
"Cantabria", 63.4, 94.9, 48.9, 36.5, 11.2, 80.5,
"Castilla y Leon", 65.8, 97.1, 47.7, 28.1, 14.0, 85.0,
"Cast.-La Mancha", 61.5, 97.3, 53.6, 21.7, 7.1, 72.9,
"Cataluna", 70.4, 98.1, 71.1, 36.8, 19.8, 92.2,
"Com.-Valenciana", 72.7, 98.4, 68.2, 26.6, 12.1, 84.4,
"Extremadura", 60.5, 97.7, 43.7, 20.7, 11.7, 67.1,
"Galicia", 65.5, 91.3, 42.7, 13.5, 14.6, 85.9,
"Madrid", 74.0, 99.4, 76.3, 53.9, 32.3, 95.7,
"Murcia", 69.0, 98.7, 59.3, 19.5, 12.1, 81.4,
"Navarra", 76.4, 99.3, 60.6, 44.0, 20.6, 87.4,
"Pais Vasco", 71.3, 98.3, 61.6, 45.7, 23.7, 94.3,
"La Rioja", 64.9, 98.6, 54.4, 44.4, 17.6, 83.4
)
data %>% kbl(caption = "Tabla de datos", format = "html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
CC.AA. | Automovil | TV-Color | Video | Microondas | Lavajillas | Telefono |
---|---|---|---|---|---|---|
Espana | 69.0 | 97.6 | 62.4 | 32.3 | 17.0 | 85.2 |
Andalucia | 66.7 | 98.0 | 82.7 | 24.1 | 12.7 | 74.7 |
Aragon | 67.2 | 97.5 | 56.8 | 43.4 | 20.6 | 88.4 |
Asturias | 63.7 | 95.2 | 52.1 | 24.4 | 13.3 | 88.1 |
Baleares | 71.9 | 98.8 | 62.4 | 29.8 | 10.1 | 87.9 |
Canarias | 72.7 | 96.8 | 68.4 | 27.9 | 5.8 | 75.4 |
Cantabria | 63.4 | 94.9 | 48.9 | 36.5 | 11.2 | 80.5 |
Castilla y Leon | 65.8 | 97.1 | 47.7 | 28.1 | 14.0 | 85.0 |
Cast.-La Mancha | 61.5 | 97.3 | 53.6 | 21.7 | 7.1 | 72.9 |
Cataluna | 70.4 | 98.1 | 71.1 | 36.8 | 19.8 | 92.2 |
Com.-Valenciana | 72.7 | 98.4 | 68.2 | 26.6 | 12.1 | 84.4 |
Extremadura | 60.5 | 97.7 | 43.7 | 20.7 | 11.7 | 67.1 |
Galicia | 65.5 | 91.3 | 42.7 | 13.5 | 14.6 | 85.9 |
Madrid | 74.0 | 99.4 | 76.3 | 53.9 | 32.3 | 95.7 |
Murcia | 69.0 | 98.7 | 59.3 | 19.5 | 12.1 | 81.4 |
Navarra | 76.4 | 99.3 | 60.6 | 44.0 | 20.6 | 87.4 |
Pais Vasco | 71.3 | 98.3 | 61.6 | 45.7 | 23.7 | 94.3 |
La Rioja | 64.9 | 98.6 | 54.4 | 44.4 | 17.6 | 83.4 |
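Before clustering, the six equipment variables can be summarized directly from the table above; a minimal sketch using base R (the column names match the tribble defined earlier):

summary(data[, c("Automovil", "TV-Color", "Video", "Microondas", "Lavajillas", "Telefono")])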
Outlier Detection
library(MASS)
library(ggplot2)
data_numeric <- data %>% select_if(is.numeric)
# Mahalanobis distance of each observation from the multivariate mean
mahalanobis_dist <- mahalanobis(
x = data_numeric,
center = colMeans(data_numeric, na.rm = TRUE),
cov = cov(data_numeric, use = "pairwise.complete.obs")
)
# Chi-square cutoff (df = number of variables): distances above it flag potential outliers
p_value <- 0.001
threshold <- qchisq(1 - p_value, df = ncol(data_numeric))
# Flag the outliers
data <- data %>% mutate(Mahalanobis = mahalanobis_dist,
Outlier = ifelse(Mahalanobis > threshold, "Si", "No"))
# Plot the Mahalanobis distances and mark the flagged observations
ggplot(data, aes(x = seq_along(Mahalanobis), y = Mahalanobis, color = Outlier)) +
geom_point(size = 3) +
geom_hline(yintercept = threshold, linetype = "dashed", color = "blue") +
labs(
title = "Deteccion de Outliers mediante la Distancia de Mahalanobis",
x = "Observaciones",
y = "Distancia de Mahalanobis"
) +
theme_minimal()
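To see exactly which regions are flagged, the rows above the threshold can be listed; a minimal sketch (note that loading MASS masks dplyr::select, so the namespaced calls are used):

# Regions flagged as outliers and their Mahalanobis distances
data %>%
  dplyr::filter(Outlier == "Si") %>%
  dplyr::select(CC.AA., Mahalanobis)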
Dendrograms
library(factoextra)
# Euclidean distances on the standardized variables
dist_matrix <- dist(scale(data_numeric))
# Ward linkage
hc_ward <- hclust(dist_matrix, method = "ward.D2")
fviz_dend(hc_ward, main = "Ward Method", cex = 0.8)
# Complete linkage
hc_complete <- hclust(dist_matrix, method = "complete")
fviz_dend(hc_complete, main = "Complete Method", cex = 0.8)
# Average linkage
hc_average <- hclust(dist_matrix, method = "average")
fviz_dend(hc_average, main = "Average Method", cex = 0.8)
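One quick way to compare the three linkages is the cophenetic correlation, which measures how faithfully each dendrogram preserves the original distance matrix; a minimal sketch using base stats functions:

# Correlation between the original distances and each tree's cophenetic distances
cor(dist_matrix, cophenetic(hc_ward))
cor(dist_matrix, cophenetic(hc_complete))
cor(dist_matrix, cophenetic(hc_average))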
Index Generation
library(NbClust)
Datos.NbClust <- data[,c("Automovil", "TV-Color", "Video", "Microondas", "Lavajillas", "Telefono")]
res.wardD2 <- NbClust(Datos.NbClust, distance = "euclidean", min.nc=2, max.nc=15, method = "ward.D2", index = "alllong")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
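The vote counts reported above can also be recovered from the returned object; a minimal sketch, assuming NbClust's usual Best.nc component (a matrix whose "Number_clusters" row gives each index's proposal):

# How many indices vote for each number of clusters
table(res.wardD2$Best.nc["Number_clusters", ])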
Centroids
library(cluster)
hc_ward <- hclust(dist_matrix, method = "ward.D2")
grupo.ward <- cutree(hc_ward, k = 2, h = NULL)
datos.caso3.grupos <- cbind(Datos.NbClust, grupo.ward)
datos.caso3.grupos$id <- NULL
round(aggregate(datos.caso3.grupos, list(grupo.ward), mean ), 2)
## Group.1 Automovil TV-Color Video Microondas Lavajillas Telefono grupo.ward
## 1 1 70.52 98.29 65.35 35.70 17.03 85.87 1
## 2 2 63.40 95.58 48.12 24.15 11.98 79.92 2
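A minimal check of how many regions fall in each Ward group before moving on to k-means:

# Size of each Ward cluster
table(grupo.ward)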
datos.caso3.grupos.kmeans <- data_numeric
# Initial centroids used to seed the k-means algorithm
c1 <- c(66.87, 96.82, 56.01, 25.43, 11.81, 80.71)
c2 <- c(70.70, 98.53, 63.47, 44.70, 22.43, 90.23)
solucion <- kmeans(datos.caso3.grupos.kmeans, rbind(c1, c2))
# Final centroids
centroides_fin <- solucion$centers
# Display the results
kable(centroides_fin, caption = "Final Cluster Centroids")
Automovil | TV-Color | Video | Microondas | Lavajillas | Telefono |
---|---|---|---|---|---|
66.86667 | 96.81667 | 57.67500 | 25.425 | 11.80833 | 80.70833 |
70.70000 | 98.53333 | 63.46667 | 44.700 | 22.43333 | 90.23333 |
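To see which regions end up in each k-means cluster, the assignment vector can be split by group; a minimal sketch using the solucion object created above:

# Regions belonging to each final k-means cluster
split(data$CC.AA., solucion$cluster)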
# Ward method
hc_ward_2 <- hclust(dist_matrix, method = "ward.D2")
fviz_dend(hc_ward_2, k = 2, cex = 0.7,
          main = "Dendrogram - Ward Method",
          rect = TRUE, rect_fill = FALSE,
          rect_border = "red")
# Average method
hc_average_2 <- hclust(dist_matrix, method = "average")
fviz_dend(hc_average_2, k = 2, cex = 0.7,
          main = "Dendrogram - Average Method",
          rect = TRUE, rect_fill = FALSE,
          rect_border = "red")
# Complete method
hc_complete_2 <- hclust(dist_matrix, method = "complete")
fviz_dend(hc_complete_2, k = 2, cex = 0.7,
          main = "Dendrogram - Complete Method",
          rect = TRUE, rect_fill = FALSE,
          rect_border = "red")
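As a complementary check of the two-cluster solution, silhouette widths can be computed from the k-means assignments and the scaled distance matrix already built for the dendrograms; a minimal sketch with the loaded cluster and factoextra packages:

# Silhouette widths of the k-means partition (values near 1 indicate well-separated clusters)
sil <- silhouette(solucion$cluster, dist_matrix)
fviz_silhouette(sil)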
data_cluster <- as.factor(solucion$cluster)
# Empty data frame to collect the group means and p-values
# (column names must match those used in the rbind below)
resultados_t <- data.frame(
  Variable = character(),
  Grupo_1_Media = numeric(),
  Grupo_2_Media = numeric(),
  Prueba_t = numeric(),
  stringsAsFactors = FALSE
)
# Two-sample t-test of each variable between the two k-means clusters
for (variable in colnames(datos.caso3.grupos.kmeans)) {
  grupo1 <- datos.caso3.grupos.kmeans[data_cluster == 1, variable, drop = TRUE]
  grupo2 <- datos.caso3.grupos.kmeans[data_cluster == 2, variable, drop = TRUE]
  prueba_t <- t.test(grupo1, grupo2)
  resultados_t <- rbind(resultados_t, data.frame(
    Variable = variable,
    Grupo_1_Media = mean(grupo1, na.rm = TRUE),
    Grupo_2_Media = mean(grupo2, na.rm = TRUE),
    Prueba_t = prueba_t$p.value
  ))
}
kable(resultados_t, caption = "t-Test Results for Each Variable")
Variable | Grupo_1_Media | Grupo_2_Media | Prueba_t |
---|---|---|---|
Automovil | 66.86667 | 70.70000 | 0.1000345 |
TV-Color | 96.81667 | 98.53333 | 0.0238242 |
Video | 57.67500 | 63.46667 | 0.2548580 |
Microondas | 25.42500 | 44.70000 | 0.0000283 |
Lavajillas | 11.80833 | 22.43333 | 0.0026595 |
Telefono | 80.70833 | 90.23333 | 0.0034967 |
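Because six t-tests are run simultaneously, the p-values can optionally be adjusted for multiple comparisons; a minimal sketch using base R's p.adjust with the Bonferroni correction (the adjusted column name is illustrative):

# Bonferroni-adjusted p-values for the six comparisons
resultados_t$Prueba_t_ajustada <- p.adjust(resultados_t$Prueba_t, method = "bonferroni")
kable(resultados_t, caption = "t-Tests with Bonferroni-Adjusted p-Values")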