Cluster Analysis with USArrests

Author

Mario Lorenzana

Data Preparation

# Load the required packages (startup messages suppressed for readability)
suppressPackageStartupMessages({
  library(tidyverse)   # data wrangling and ggplot2
  library(cluster)     # pam, clara, agnes, diana, silhouette
  library(factoextra)  # clustering and PCA visualization helpers
  library(dendextend)  # dendrogram comparison (tanglegram, entanglement)
  library(corrplot)    # correlation matrix plots
})
# Read the data from a local path (adjust if necessary)
df <- read.csv("USArrests.csv", row.names = 1)

# Preview of the first rows
head(df)
           Murder Assault UrbanPop Rape
Alabama      13.2     236       58 21.2
Alaska       10.0     263       48 44.5
Arizona       8.1     294       80 31.0
Arkansas      8.8     190       50 19.5
California    9.0     276       91 40.6
Colorado      7.9     204       78 38.7
# Check variable types
glimpse(df)
Rows: 50
Columns: 4
$ Murder   <dbl> 13.2, 10.0, 8.1, 8.8, 9.0, 7.9, 3.3, 5.9, 15.4, 17.4, 5.3, 2.…
$ Assault  <int> 236, 263, 294, 190, 276, 204, 110, 238, 335, 211, 46, 120, 24…
$ UrbanPop <int> 58, 48, 80, 50, 91, 78, 77, 72, 80, 60, 83, 54, 83, 65, 57, 6…
$ Rape     <dbl> 21.2, 44.5, 31.0, 19.5, 40.6, 38.7, 11.1, 15.8, 31.9, 25.8, 2…
# Z-score standardization (mean 0, sd 1 for every variable)
df_scaled <- scale(df)
head(df_scaled)
               Murder   Assault   UrbanPop         Rape
Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
Arizona    0.07163341 1.4788032  0.9989801  1.042878388
Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
California 0.27826823 1.2628144  1.7589234  2.067820292
Colorado   0.02571456 0.3988593  0.8608085  1.864967207

Outlier Detection

# Boxplot of each scaled variable to flag potential outliers
par(mfrow=c(2,2))
for (i in 1:4) {
  boxplot(df_scaled[, i], main = colnames(df_scaled)[i])
}

par(mfrow=c(1,1))
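
Boxplots flag points beyond the whiskers visually; a quick numeric rule on the standardized data makes the flagged states explicit. A minimal complementary sketch (the |z| > 2.5 cutoff is an illustrative choice, not a fixed rule):

# List states whose standardized value exceeds |z| > 2.5 on any variable
outlier_idx <- apply(abs(df_scaled) > 2.5, 1, any)
df_scaled[outlier_idx, , drop = FALSE]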

Exploratory Analysis

# Correlation between variables
corrplot(cor(df_scaled), method = "circle")

Normality Tests

Shapiro-Wilk Test

# Shapiro-Wilk test for each variable (before scaling)
for (var in colnames(df)) {
  cat("Variable:", var, "\n")
  print(shapiro.test(df[[var]]))
  cat("\n")
}
Variable: Murder 

    Shapiro-Wilk normality test

data:  df[[var]]
W = 0.95703, p-value = 0.06674


Variable: Assault 

    Shapiro-Wilk normality test

data:  df[[var]]
W = 0.95181, p-value = 0.04052


Variable: UrbanPop 

    Shapiro-Wilk normality test

data:  df[[var]]
W = 0.97714, p-value = 0.4385


Variable: Rape 

    Shapiro-Wilk normality test

data:  df[[var]]
W = 0.94674, p-value = 0.0251
# Shapiro-Wilk test for the scaled variables (Z-scores)
for (var in colnames(df_scaled)) {
  cat("Scaled variable:", var, "\n")
  print(shapiro.test(df_scaled[, var]))
  cat("\n")
}
Scaled variable: Murder 

    Shapiro-Wilk normality test

data:  df_scaled[, var]
W = 0.95703, p-value = 0.06674


Scaled variable: Assault 

    Shapiro-Wilk normality test

data:  df_scaled[, var]
W = 0.95181, p-value = 0.04052


Scaled variable: UrbanPop 

    Shapiro-Wilk normality test

data:  df_scaled[, var]
W = 0.97714, p-value = 0.4385


Scaled variable: Rape 

    Shapiro-Wilk normality test

data:  df_scaled[, var]
W = 0.94674, p-value = 0.0251

Because the Z-score is a linear transformation, the Shapiro-Wilk statistics and p-values are identical before and after scaling, as the output above confirms.

Graphical Check

par(mfrow = c(2,2))
for (i in 1:ncol(df_scaled)) {
  qqnorm(df_scaled[, i], main = paste("QQ Plot:", colnames(df_scaled)[i]))
  qqline(df_scaled[, i], col = "red")
}

par(mfrow = c(1,1))

Choosing the Number of Clusters

# Elbow method (total within-cluster sum of squares) for k-means
fviz_nbclust(df_scaled, kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2)  # dashed line marks the visually chosen elbow

# Average silhouette method for k-means
fviz_nbclust(df_scaled, kmeans, method = "silhouette")

Choosing the Number of Clusters with NbClust

# Determine the optimal k for k-means and hierarchical clustering
library(NbClust)

set.seed(123)
nb_kmeans <- NbClust(df_scaled, distance = "euclidean", min.nc = 2, max.nc = 10, method = "kmeans")

******************************************************************* 
* Among all indices:                                                
* 11 proposed 2 as the best number of clusters 
* 2 proposed 3 as the best number of clusters 
* 1 proposed 4 as the best number of clusters 
* 1 proposed 5 as the best number of clusters 
* 7 proposed 6 as the best number of clusters 
* 1 proposed 9 as the best number of clusters 
* 1 proposed 10 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
******************************************************************* 
nb_hc <- NbClust(df_scaled, distance = "euclidean", min.nc = 2, max.nc = 10, method = "ward.D2")

******************************************************************* 
* Among all indices:                                                
* 10 proposed 2 as the best number of clusters 
* 3 proposed 3 as the best number of clusters 
* 8 proposed 4 as the best number of clusters 
* 1 proposed 7 as the best number of clusters 
* 2 proposed 10 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
******************************************************************* 
# Majority vote across the NbClust indices
k_opt_kmeans <- as.numeric(names(sort(table(nb_kmeans$Best.nc[1, ]), decreasing = TRUE)[1]))
k_opt_hc <- as.numeric(names(sort(table(nb_hc$Best.nc[1, ]), decreasing = TRUE)[1]))

cat("Optimal k for k-means:", k_opt_kmeans, "\n")
Optimal k for k-means: 2 
cat("Optimal k for hierarchical (Ward):", k_opt_hc, "\n")
Optimal k for hierarchical (Ward): 2 

Non-Hierarchical Method: PAM

# Partitioning Around Medoids (PAM) with k = 2, the NbClust majority choice
pam.res <- pam(df_scaled, k = 2)
fviz_cluster(pam.res,
             palette = c("#EE82EE", "#9ACD32"),
             ellipse.type = "t", ggtheme = theme_classic())

Non-Hierarchical Method: CLARA

# CLARA: PAM applied to repeated subsamples, suited to larger data sets
set.seed(123)
clara.res <- clara(df_scaled, k = 2, samples = 50, pamLike = TRUE)
fviz_cluster(clara.res,
             palette = c("#EE82EE", "#9ACD32"),
             ellipse.type = "t", pointsize = 1,
             ggtheme = theme_classic())
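
Since CLARA approximates PAM on subsamples, it is worth checking how closely the two partitions agree; a minimal sketch (cluster labels are arbitrary, so agreement shows up as a dominant diagonal or anti-diagonal):

# Cross-tabulate the PAM and CLARA assignments
table(PAM = pam.res$clustering, CLARA = clara.res$clustering)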

Agglomerative Hierarchical Method: Ward

# Euclidean distances on the scaled data, then Ward's linkage
res.dist <- dist(df_scaled, method = "euclidean")
res.hc <- hclust(res.dist, method = "ward.D2")
fviz_dend(res.hc, cex = 0.5, k = 2,
          k_colors = c("#8B0000", "#006400"),
          rect = TRUE)
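
Cutting the tree at k = 2 yields explicit memberships that can be compared against the partitioning results. A hedged sketch (grp_hc is an illustrative name):

# Two-cluster memberships from the Ward tree
grp_hc <- cutree(res.hc, k = 2)
# Agreement with the PAM partition (labels are arbitrary; read the diagonal pattern)
table(Ward = grp_hc, PAM = pam.res$clustering)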

Agglomerative Hierarchical Method: AGNES

# AGNES with internal standardization and Ward linkage
res.agnes <- agnes(df, stand = TRUE, metric = "euclidean", method = "ward")
fviz_dend(res.agnes, cex = 0.6, k = 4)  # k = 4 was the runner-up in the NbClust vote for Ward
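
agnes() also reports an agglomerative coefficient (values closer to 1 indicate stronger clustering structure). A hedged sketch comparing linkage methods on these data:

# Agglomerative coefficient for several linkage methods
sapply(c("average", "single", "complete", "ward"),
       function(m) agnes(df, stand = TRUE, metric = "euclidean", method = m)$ac)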

Divisive Hierarchical Method: DIANA

# DIANA: divisive (top-down) hierarchical clustering
res.diana <- diana(df, stand = TRUE, metric = "euclidean")
fviz_dend(res.diana, cex = 0.6, k = 2)
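
The divisive analogue of the agglomerative coefficient is stored on the result; printing it gives a comparable strength-of-structure measure (a sketch):

# Divisive coefficient, also on a 0-1 scale
res.diana$dc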

Dendrogram Comparison

# Reproducible random subsample of 10 states keeps the tanglegram readable
set.seed(123)
df10 <- df_scaled[sample(1:nrow(df_scaled), 10), ]
d1 <- hclust(dist(df10), method = "average") %>% as.dendrogram()
d2 <- hclust(dist(df10), method = "ward.D2") %>% as.dendrogram()
dendlist(d1, d2) %>% tanglegram()
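
dendextend can also quantify how well the two trees align. A brief sketch; entanglement runs from 0 (perfectly aligned) to 1 (fully entangled):

# Entanglement of the average-linkage and Ward dendrograms
dendlist(d1, d2) %>% entanglement()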

Conclusions


 The variables were checked and standardized (Z-scores). 
 Potential outliers were flagged with boxplots of the scaled variables. 
 Hierarchical (Ward, AGNES, DIANA) and non-hierarchical (PAM, CLARA) methods were applied. 
 Results were compared visually through dendrograms and a tanglegram. 
 The elbow method pointed to 4 clusters, while the silhouette method and the NbClust majority rule favored 2.

Dimensionality Reduction with PCA

# PCA; the data are already standardized, so center/scale. are redundant here but harmless
res.pca <- prcomp(df_scaled, center = TRUE, scale. = TRUE)

# Scree plot of the variance explained by each component
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 60))

# Variable contributions to the principal components
fviz_pca_var(res.pca, col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE)

# Individuals (states) on the first two principal components
fviz_pca_ind(res.pca,
             col.ind = "cos2", # quality of representation
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE)
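
To read the cumulative variance directly rather than off the scree plot, factoextra's get_eigenvalue tabulates it; a short sketch:

# Eigenvalues with per-component and cumulative variance percentages
get_eigenvalue(res.pca)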

Internal Validation

# k-means with k = 2; nstart = 25 restarts guards against poor local optima
set.seed(123)
km.res <- kmeans(df_scaled, centers = 2, nstart = 25)

# Validate the partition with the silhouette index
sil <- silhouette(km.res$cluster, dist(df_scaled))
fviz_silhouette(sil)
  cluster size ave.sil.width
1       1   30          0.43
2       2   20          0.37

summary(sil)
Silhouette of 50 units in 2 clusters from silhouette.default(x = km.res$cluster, dist = dist(df_scaled)) :
 Cluster sizes and average silhouette widths:
       30        20 
0.4335417 0.3709100 
Individual silhouette widths:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.1135  0.3354  0.4071  0.4085  0.5208  0.5986 
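
States with the smallest silhouette widths sit closest to the cluster boundary. A hedged sketch to list them, assuming the silhouette matrix keeps the state names as row names:

# Six states with the lowest silhouette widths (weakest cluster membership)
head(sil[order(sil[, "sil_width"]), ], 6)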

Review of Statistical Assumptions

# Check univariate normality (optional for PCA and clustering)
par(mfrow = c(2,2))
for (i in 1:ncol(df_scaled)) {
  qqnorm(df_scaled[,i], main = paste("QQ Plot:", colnames(df_scaled)[i]))
  qqline(df_scaled[,i])
}

par(mfrow = c(1,1))

# Assess correlations (to spot possible redundancies)
corrplot(cor(df_scaled), method = "number")

# Confirm each scaled variable has mean 0 and standard deviation 1
summary(df_scaled)
     Murder           Assault           UrbanPop             Rape        
 Min.   :-1.6044   Min.   :-1.5090   Min.   :-2.31714   Min.   :-1.4874  
 1st Qu.:-0.8525   1st Qu.:-0.7411   1st Qu.:-0.76271   1st Qu.:-0.6574  
 Median :-0.1235   Median :-0.1411   Median : 0.03178   Median :-0.1209  
 Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
 3rd Qu.: 0.7949   3rd Qu.: 0.9388   3rd Qu.: 0.84354   3rd Qu.: 0.5277  
 Max.   : 2.2069   Max.   : 1.9948   Max.   : 1.75892   Max.   : 2.6444  
apply(df_scaled, 2, sd)
  Murder  Assault UrbanPop     Rape 
       1        1        1        1 
Variable Clustering

# Transpose so the variables become the objects to cluster
df_t <- t(df_scaled)

# Ward clustering of the four variables
res_clust_var <- hclust(dist(df_t), method = "ward.D2")
res_clust_var

Call:
hclust(d = dist(df_t), method = "ward.D2")

Cluster method   : ward.D2 
Distance         : euclidean 
Number of objects: 4 
# Cut the variable tree into 2 clusters
grupo_vars <- cutree(res_clust_var, k = 2)

# See which group each variable belongs to
grupo_vars
  Murder  Assault UrbanPop     Rape 
       1        1        2        1 
# Relabel the clusters with descriptive tags
nombres_clusters_vars <- factor(grupo_vars,
                                levels = c(1, 2),
                                labels = c("Delitos Violentos", "Demografía"))

# Attach the variable names
names(nombres_clusters_vars) <- names(grupo_vars)

# Show the result
nombres_clusters_vars
           Murder           Assault          UrbanPop              Rape 
Delitos Violentos Delitos Violentos        Demografía Delitos Violentos 
Levels: Delitos Violentos Demografía
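
As an alternative for clustering variables, a correlation-based distance (1 - r) is often preferred over Euclidean distance on the transposed matrix, since it groups variables by how strongly they co-move. A hedged sketch (d_cor and hc_cor are illustrative names):

# Correlation-based dissimilarity between variables
d_cor <- as.dist(1 - cor(df_scaled))
hc_cor <- hclust(d_cor, method = "ward.D2")
cutree(hc_cor, k = 2)  # with these data this will likely again isolate UrbanPop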