Tarea_#

Author

Leydi

CLUSTERING - MALL CUSTOMERS DATASET

library(pacman)
Warning: package 'pacman' was built under R version 4.5.2
p_load(rio, cluster, factoextra, tidyverse, ggrepel, scatterplot3d, 
       tictoc, NbClust, dbscan, colorspace)

options(scipen = 999)
options(digits = 3)
rm(list = ls())

Exploración

mall <- read.csv("Mall_Customers.csv")
# dimensión 
print(dim(mall))
[1] 200   5
# estructura
str(mall)
'data.frame':   200 obs. of  5 variables:
 $ CustomerID            : int  1 2 3 4 5 6 7 8 9 10 ...
 $ Genre                 : chr  "Male" "Male" "Female" "Female" ...
 $ Age                   : int  19 21 20 23 31 22 35 23 64 30 ...
 $ Annual.Income..k..    : int  15 15 16 16 17 17 18 18 19 19 ...
 $ Spending.Score..1.100.: int  39 81 6 77 40 76 6 94 3 72 ...
# resumen
summary(mall)
   CustomerID       Genre                Age       Annual.Income..k..
 Min.   :  1.0   Length:200         Min.   :18.0   Min.   : 15.0     
 1st Qu.: 50.8   Class :character   1st Qu.:28.8   1st Qu.: 41.5     
 Median :100.5   Mode  :character   Median :36.0   Median : 61.5     
 Mean   :100.5                      Mean   :38.9   Mean   : 60.6     
 3rd Qu.:150.2                      3rd Qu.:49.0   3rd Qu.: 78.0     
 Max.   :200.0                      Max.   :70.0   Max.   :137.0     
 Spending.Score..1.100.
 Min.   : 1.0          
 1st Qu.:34.8          
 Median :50.0          
 Mean   :50.2          
 3rd Qu.:73.0          
 Max.   :99.0          
# missings
colSums(is.na(mall))
            CustomerID                  Genre                    Age 
                     0                      0                      0 
    Annual.Income..k.. Spending.Score..1.100. 
                     0                      0 

Preparación

# Seleccionar variables numéricas para clustering
subdata <- mall %>%
  select(Age, Annual.Income..k.., Spending.Score..1.100.)

# Renombrar columnas
colnames(subdata) <- c("Age", "Income", "Spending")

# Estandarizar las variables
subdata <- as.data.frame(scale(subdata))

head(subdata)
     Age Income Spending
1 -1.421  -1.73   -0.434
2 -1.278  -1.73    1.193
3 -1.349  -1.70   -1.712
4 -1.135  -1.70    1.038
5 -0.562  -1.66   -0.395
6 -1.206  -1.66    0.999
summary(subdata)
      Age             Income           Spending       
 Min.   :-1.493   Min.   :-1.7346   Min.   :-1.90524  
 1st Qu.:-0.723   1st Qu.:-0.7257   1st Qu.:-0.59829  
 Median :-0.204   Median : 0.0358   Median :-0.00774  
 Mean   : 0.000   Mean   : 0.0000   Mean   : 0.00000  
 3rd Qu.: 0.727   3rd Qu.: 0.6640   3rd Qu.: 0.88292  
 Max.   : 2.230   Max.   : 2.9104   Max.   : 1.88975  
rownames(mall) <- mall$CustomerID
names(mall)
[1] "CustomerID"             "Genre"                  "Age"                   
[4] "Annual.Income..k.."     "Spending.Score..1.100."
# renombrar
names(mall)[names(mall) == "Annual.Income..k.."] <- "annual_income"
names(mall)[names(mall) == "Spending.Score..1.100."] <- "spending_score"
names(mall) <- tolower(names(mall))
# subset
mall_subset <- mall
mall_subset <- subset(mall_subset, select = -customerid)
# one-hot encoding
mall_subset <- cbind(mall_subset, 
                     model.matrix(~ genre - 1 , 
                                  data = mall_subset
))
names(mall_subset) <- tolower(names(mall_subset))

# drop genre
mall_subset <- subset(mall_subset, select = -genre)

# drop gernemale por multicolinialidad
mall_subset <- subset(mall_subset, select = -genremale)

# solo numeric
mall_subset_numeric <- subset(mall_subset, select = -genrefemale)
# estandarización
mall_subset_numeric <- scale(mall_subset_numeric)

Usando solo variables continuas: mall_subset_numeric

K óptimo

A. Suma de Cuadrados (Método Elbow)

tic()
set.seed(20202209)
fviz_nbclust(mall_subset_numeric, kmeans, method = "wss", k.max = 10) +
  geom_vline(xintercept = 5, linetype = 2, col = "red") +
  labs(subtitle = "Método Elbow") + 
  theme_bw()

toc()
0.37 sec elapsed

B. gráfico de silueta

tic()
set.seed(20202209)
fviz_nbclust(mall_subset_numeric, kmeans, 
             method = "silhouette", k.max = 15) +
  labs(subtitle = "Silhouette method") +
  theme_bw()

toc()
0.28 sec elapsed

C. paquete NbClust: Un resumen

set.seed(20202209)
res.nbclust <- NbClust(mall_subset_numeric, distance = "euclidean",
                       min.nc = 2, max.nc = 15, 
                       method = "kmeans", index = "all")

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 4 proposed 2 as the best number of clusters 
* 3 proposed 3 as the best number of clusters 
* 2 proposed 4 as the best number of clusters 
* 4 proposed 5 as the best number of clusters 
* 4 proposed 6 as the best number of clusters 
* 1 proposed 9 as the best number of clusters 
* 1 proposed 10 as the best number of clusters 
* 2 proposed 12 as the best number of clusters 
* 1 proposed 13 as the best number of clusters 
* 1 proposed 15 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
******************************************************************* 

Aplicación de clustering

k_optimo <- 2

Algoritmo 1: K-MEANS

set.seed(20202209)
tic()
km <- kmeans(mall_subset_numeric, 
             centers = k_optimo,
             iter.max = 100,
             nstart = 25,
             algorithm = "Lloyd")
toc()
0 sec elapsed
# Explorar el cluster creado
cat("\nTamaños de clusters:\n")

Tamaños de clusters:
print(km$size)
[1] 103  97
cat("\nProporción de observaciones:\n")

Proporción de observaciones:
print(prop.table(km$size))
[1] 0.515 0.485
cat("\nCentros de clusters:\n")

Centros de clusters:
print(km$centers)
     age annual_income spending_score
1  0.707      -0.00247         -0.698
2 -0.751       0.00262          0.741
# Visualización K-means
fviz_cluster(km, data = mall_subset_numeric, ellipse.type = "convex") +
  theme_classic() +
  labs(title = "K-means Clustering (k=5)")

# Validación usando índice de Silueta
km_clusters <- eclust(x = mall_subset_numeric, FUNcluster = "kmeans", 
                      k = k_optimo, seed = 20202209,
                      hc_metric = "euclidean",
                      nstart = 25,
                      graph = FALSE)
fviz_silhouette(sil.obj = km_clusters, 
                print.summary = TRUE, 
                palette = "jco",
                ggtheme = theme_classic()) +
  labs(title = "Gráfico de Silueta - K-means")
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the factoextra package.
  Please report the issue at <https://github.com/kassambara/factoextra/issues>.
  cluster size ave.sil.width
1       1  103          0.30
2       2   97          0.37

# Guardar ancho promedio de silueta
kmeans_avg_sil <- km_clusters$silinfo$avg.width
cat("\nAncho promedio de silueta K-means:", round(kmeans_avg_sil, 4), "\n")

Ancho promedio de silueta K-means: 0.336 

Algoritmo 2: Clustering Jerárquico (WARD)

# Calcular matriz de distancias
distancias <- daisy(mall_subset_numeric, metric = "euclidean")

# Aplicar clustering jerárquico
set.seed(20202209)
aglomerativo <- hcut(distancias, 
                     k = k_optimo,
                     hc_func = 'agnes',
                     hc_method = "ward.D2")
# Visualización Jerárquico
fviz_cluster(object = list(data = mall_subset_numeric, cluster = aglomerativo$cluster),
             ellipse.type = "convex",
             geom = "point") + 
  theme_classic() +
  labs(title = "Clustering Jerárquico Ward (k=2)")

# Validación con silueta
hclust_sil <- silhouette(aglomerativo$cluster, distancias)
hclust_avg_sil <- mean(hclust_sil[, 3])

cat("Tamaños de clusters Jerárquico:\n")
Tamaños de clusters Jerárquico:
print(table(aglomerativo$cluster))

  1   2 
 95 105 
fviz_silhouette(hclust_sil, 
                print.summary = TRUE, 
                palette = "jco",
                ggtheme = theme_classic()) +
  labs(title = "Gráfico de Silueta - Jerárquico")
  cluster size ave.sil.width
1       1   95          0.30
2       2  105          0.34

Algoritmo 3: DBSCAN

# determinando epsilon
dbscan::kNNdistplot(mall_subset_numeric, k = 2)
abline(h = 0.6, lty = 2, col = "red")
title("k-NN Distance Plot")

eps_optimo <- 0.6
# regla de dimension n = 3
minpts_optimo <- 5
set.seed(20202209)
tic()
dbscan_result <- fpc::dbscan(data = mall_subset_numeric, 
                             eps = eps_optimo, 
                             MinPts = minpts_optimo)
toc()
0.58 sec elapsed
# número de clusters (excluyendo ruido)
n_clusters_dbscan <- max(dbscan_result$cluster)
n_ruido <- sum(dbscan_result$cluster == 0)

cat("\nNúmero de clusters encontrados:", n_clusters_dbscan, "\n")

Número de clusters encontrados: 2 
cat("Número de puntos clasificados como ruido:", n_ruido, 
    "(", round(n_ruido/nrow(mall_subset_numeric)*100, 1), "%)\n\n")
Número de puntos clasificados como ruido: 28 ( 14 %)
# Visualización DBSCAN
fviz_cluster(dbscan_result, 
             mall_subset_numeric, 
             stand = FALSE, 
             ellipse = FALSE, 
             geom = "point",
             palette = "jco") + 
  labs(title = "DBSCAN Clustering",
       subtitle = paste("eps =", eps_optimo, ", minPts =", minpts_optimo)) + 
  theme_classic()

Comparación

library(gridExtra)
Warning: package 'gridExtra' was built under R version 4.5.2

Adjuntando el paquete: 'gridExtra'
The following object is masked from 'package:dplyr':

    combine
p1 <- fviz_cluster(km, data = mall_subset_numeric, ellipse.type = "convex",
                   geom = "point", palette = "jco") +
  theme_minimal() +
  labs(title = "K-means",
       subtitle = paste("Silueta:", round(kmeans_avg_sil, 3)))

p2 <- fviz_cluster(list(data = mall_subset_numeric, cluster = aglomerativo$cluster),
                   ellipse.type = "convex", geom = "point", 
                   palette = "jco") +
  theme_minimal() +
  labs(title = "Jerárquico Ward",
       subtitle = paste("Silueta:", round(hclust_avg_sil, 3)))

p3 <- fviz_cluster(dbscan_result, data = mall_subset_numeric,
                   geom = "point", palette = "jco",
                   stand = FALSE, ellipse = FALSE) + 
  theme_minimal() +
  labs(title = "DBSCAN",
       subtitle = paste("Clusters:", n_clusters_dbscan, "| Ruido:", n_ruido))

grid.arrange(p1, p2, p3, ncol = 3)

Caracterización (mejor algoritmo= DBSCAN)

mall_db <- mall %>%
  mutate(cluster_dbscan = dbscan_result$cluster)
mall_db %>%
  filter(cluster_dbscan != 0) %>%   # excluir ruido
  group_by(cluster_dbscan) %>%
  summarise(
    age            = mean(age),
    annual_income  = mean(annual_income),
    spending_score = mean(spending_score),
    n = n()
  )
# A tibble: 2 × 5
  cluster_dbscan   age annual_income spending_score     n
           <dbl> <dbl>         <dbl>          <dbl> <int>
1              1  41.0          51.6           45.3   137
2              2  32.7          82.5           82.8    35
pal_dbscan <- c(
  "0" = "grey60",   # ruido
  "1" = "red",      # cluster 1
  "2" = "blue"      # cluster 2
)
colores_db <- pal_dbscan[as.character(mall_db$cluster_dbscan)]
library(scatterplot3d)

scatterplot3d(mall_db$age,
              mall_db$annual_income,
              mall_db$spending_score,
              color = colores_db,
              pch = 19,
              xlab = "Edad",
              ylab = "Ingreso (k$)",
              zlab = "Spending Score",
              main = "Visualización 3D – DBSCAN (k = 2)")

Con kmeans

mall |>
  mutate(cluster_kmeans=km$cluster) -> mall.k

mall.k |>
  group_by(cluster_kmeans) |>
  summarise(age            = mean(age),
            annual_income   = mean(annual_income),
            spending_score = mean(spending_score))
# A tibble: 2 × 4
  cluster_kmeans   age annual_income spending_score
           <int> <dbl>         <dbl>          <dbl>
1              1  48.7          60.5           32.2
2              2  28.4          60.6           69.3
# Gráfico 3D 
library(scatterplot3d)
colores <- c("red", "blue", "green", "orange", "purple")[mall.k$cluster_kmeans]

scatterplot3d(mall.k$age, 
              mall.k$annual_income, 
              mall.k$spending_score,
              color = colores,
              pch = 19,
              xlab = "Edad",
              ylab = "Ingreso (k$)",
              zlab = "Spending Score",
              main = paste("Visualización 3D - kmeans"))