library(pacman)Warning: package 'pacman' was built under R version 4.5.2
p_load(rio, cluster, factoextra, tidyverse, ggrepel, scatterplot3d,
tictoc, NbClust, dbscan, colorspace)
options(scipen = 999)
options(digits = 3)CLUSTERING - MALL CUSTOMERS DATASET
library(pacman)Warning: package 'pacman' was built under R version 4.5.2
p_load(rio, cluster, factoextra, tidyverse, ggrepel, scatterplot3d,
tictoc, NbClust, dbscan, colorspace)
options(scipen = 999)
options(digits = 3)rm(list = ls())mall <- read.csv("Mall_Customers.csv")# dimensión
print(dim(mall))[1] 200 5
# estructura
str(mall)'data.frame': 200 obs. of 5 variables:
$ CustomerID : int 1 2 3 4 5 6 7 8 9 10 ...
$ Genre : chr "Male" "Male" "Female" "Female" ...
$ Age : int 19 21 20 23 31 22 35 23 64 30 ...
$ Annual.Income..k.. : int 15 15 16 16 17 17 18 18 19 19 ...
$ Spending.Score..1.100.: int 39 81 6 77 40 76 6 94 3 72 ...
# resumen
summary(mall) CustomerID Genre Age Annual.Income..k..
Min. : 1.0 Length:200 Min. :18.0 Min. : 15.0
1st Qu.: 50.8 Class :character 1st Qu.:28.8 1st Qu.: 41.5
Median :100.5 Mode :character Median :36.0 Median : 61.5
Mean :100.5 Mean :38.9 Mean : 60.6
3rd Qu.:150.2 3rd Qu.:49.0 3rd Qu.: 78.0
Max. :200.0 Max. :70.0 Max. :137.0
Spending.Score..1.100.
Min. : 1.0
1st Qu.:34.8
Median :50.0
Mean :50.2
3rd Qu.:73.0
Max. :99.0
# missings
colSums(is.na(mall)) CustomerID Genre Age
0 0 0
Annual.Income..k.. Spending.Score..1.100.
0 0
# Seleccionar variables numéricas para clustering
subdata <- mall %>%
select(Age, Annual.Income..k.., Spending.Score..1.100.)
# Renombrar columnas
colnames(subdata) <- c("Age", "Income", "Spending")
# Estandarizar las variables
subdata <- as.data.frame(scale(subdata))
head(subdata) Age Income Spending
1 -1.421 -1.73 -0.434
2 -1.278 -1.73 1.193
3 -1.349 -1.70 -1.712
4 -1.135 -1.70 1.038
5 -0.562 -1.66 -0.395
6 -1.206 -1.66 0.999
summary(subdata) Age Income Spending
Min. :-1.493 Min. :-1.7346 Min. :-1.90524
1st Qu.:-0.723 1st Qu.:-0.7257 1st Qu.:-0.59829
Median :-0.204 Median : 0.0358 Median :-0.00774
Mean : 0.000 Mean : 0.0000 Mean : 0.00000
3rd Qu.: 0.727 3rd Qu.: 0.6640 3rd Qu.: 0.88292
Max. : 2.230 Max. : 2.9104 Max. : 1.88975
rownames(mall) <- mall$CustomerIDnames(mall)[1] "CustomerID" "Genre" "Age"
[4] "Annual.Income..k.." "Spending.Score..1.100."
# renombrar
names(mall)[names(mall) == "Annual.Income..k.."] <- "annual_income"
names(mall)[names(mall) == "Spending.Score..1.100."] <- "spending_score"
names(mall) <- tolower(names(mall))# subset
mall_subset <- mall
mall_subset <- subset(mall_subset, select = -customerid)# one-hot encoding
mall_subset <- cbind(mall_subset,
model.matrix(~ genre - 1 ,
data = mall_subset
))names(mall_subset) <- tolower(names(mall_subset))
# drop genre
mall_subset <- subset(mall_subset, select = -genre)
# drop gernemale por multicolinialidad
mall_subset <- subset(mall_subset, select = -genremale)
# solo numeric
mall_subset_numeric <- subset(mall_subset, select = -genrefemale)# estandarización
mall_subset_numeric <- scale(mall_subset_numeric)Usando solo variables continuas: mall_subset_numeric
tic()
set.seed(20202209)
fviz_nbclust(mall_subset_numeric, kmeans, method = "wss", k.max = 10) +
geom_vline(xintercept = 5, linetype = 2, col = "red") +
labs(subtitle = "Método Elbow") +
theme_bw()toc()0.37 sec elapsed
tic()
set.seed(20202209)
fviz_nbclust(mall_subset_numeric, kmeans,
method = "silhouette", k.max = 15) +
labs(subtitle = "Silhouette method") +
theme_bw()toc()0.28 sec elapsed
set.seed(20202209)
res.nbclust <- NbClust(mall_subset_numeric, distance = "euclidean",
min.nc = 2, max.nc = 15,
method = "kmeans", index = "all")*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 4 proposed 2 as the best number of clusters
* 3 proposed 3 as the best number of clusters
* 2 proposed 4 as the best number of clusters
* 4 proposed 5 as the best number of clusters
* 4 proposed 6 as the best number of clusters
* 1 proposed 9 as the best number of clusters
* 1 proposed 10 as the best number of clusters
* 2 proposed 12 as the best number of clusters
* 1 proposed 13 as the best number of clusters
* 1 proposed 15 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
k_optimo <- 2set.seed(20202209)
tic()
km <- kmeans(mall_subset_numeric,
centers = k_optimo,
iter.max = 100,
nstart = 25,
algorithm = "Lloyd")
toc()0 sec elapsed
# Explorar el cluster creado
cat("\nTamaños de clusters:\n")
Tamaños de clusters:
print(km$size)[1] 103 97
cat("\nProporción de observaciones:\n")
Proporción de observaciones:
print(prop.table(km$size))[1] 0.515 0.485
cat("\nCentros de clusters:\n")
Centros de clusters:
print(km$centers) age annual_income spending_score
1 0.707 -0.00247 -0.698
2 -0.751 0.00262 0.741
# Visualización K-means
fviz_cluster(km, data = mall_subset_numeric, ellipse.type = "convex") +
theme_classic() +
labs(title = "K-means Clustering (k=5)")# Validación usando índice de Silueta
km_clusters <- eclust(x = mall_subset_numeric, FUNcluster = "kmeans",
k = k_optimo, seed = 20202209,
hc_metric = "euclidean",
nstart = 25,
graph = FALSE)fviz_silhouette(sil.obj = km_clusters,
print.summary = TRUE,
palette = "jco",
ggtheme = theme_classic()) +
labs(title = "Gráfico de Silueta - K-means")Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the factoextra package.
Please report the issue at <https://github.com/kassambara/factoextra/issues>.
cluster size ave.sil.width
1 1 103 0.30
2 2 97 0.37
# Guardar ancho promedio de silueta
kmeans_avg_sil <- km_clusters$silinfo$avg.width
cat("\nAncho promedio de silueta K-means:", round(kmeans_avg_sil, 4), "\n")
Ancho promedio de silueta K-means: 0.336
# Calcular matriz de distancias
distancias <- daisy(mall_subset_numeric, metric = "euclidean")
# Aplicar clustering jerárquico
set.seed(20202209)
aglomerativo <- hcut(distancias,
k = k_optimo,
hc_func = 'agnes',
hc_method = "ward.D2")# Visualización Jerárquico
fviz_cluster(object = list(data = mall_subset_numeric, cluster = aglomerativo$cluster),
ellipse.type = "convex",
geom = "point") +
theme_classic() +
labs(title = "Clustering Jerárquico Ward (k=2)")# Validación con silueta
hclust_sil <- silhouette(aglomerativo$cluster, distancias)
hclust_avg_sil <- mean(hclust_sil[, 3])
cat("Tamaños de clusters Jerárquico:\n")Tamaños de clusters Jerárquico:
print(table(aglomerativo$cluster))
1 2
95 105
fviz_silhouette(hclust_sil,
print.summary = TRUE,
palette = "jco",
ggtheme = theme_classic()) +
labs(title = "Gráfico de Silueta - Jerárquico") cluster size ave.sil.width
1 1 95 0.30
2 2 105 0.34
# determinando epsilon
dbscan::kNNdistplot(mall_subset_numeric, k = 2)
abline(h = 0.6, lty = 2, col = "red")
title("k-NN Distance Plot")eps_optimo <- 0.6# regla de dimension n = 3
minpts_optimo <- 5set.seed(20202209)
tic()
dbscan_result <- fpc::dbscan(data = mall_subset_numeric,
eps = eps_optimo,
MinPts = minpts_optimo)
toc()0.58 sec elapsed
# número de clusters (excluyendo ruido)
n_clusters_dbscan <- max(dbscan_result$cluster)
n_ruido <- sum(dbscan_result$cluster == 0)
cat("\nNúmero de clusters encontrados:", n_clusters_dbscan, "\n")
Número de clusters encontrados: 2
cat("Número de puntos clasificados como ruido:", n_ruido,
"(", round(n_ruido/nrow(mall_subset_numeric)*100, 1), "%)\n\n")Número de puntos clasificados como ruido: 28 ( 14 %)
# Visualización DBSCAN
fviz_cluster(dbscan_result,
mall_subset_numeric,
stand = FALSE,
ellipse = FALSE,
geom = "point",
palette = "jco") +
labs(title = "DBSCAN Clustering",
subtitle = paste("eps =", eps_optimo, ", minPts =", minpts_optimo)) +
theme_classic()library(gridExtra)Warning: package 'gridExtra' was built under R version 4.5.2
Adjuntando el paquete: 'gridExtra'
The following object is masked from 'package:dplyr':
combine
p1 <- fviz_cluster(km, data = mall_subset_numeric, ellipse.type = "convex",
geom = "point", palette = "jco") +
theme_minimal() +
labs(title = "K-means",
subtitle = paste("Silueta:", round(kmeans_avg_sil, 3)))
p2 <- fviz_cluster(list(data = mall_subset_numeric, cluster = aglomerativo$cluster),
ellipse.type = "convex", geom = "point",
palette = "jco") +
theme_minimal() +
labs(title = "Jerárquico Ward",
subtitle = paste("Silueta:", round(hclust_avg_sil, 3)))
p3 <- fviz_cluster(dbscan_result, data = mall_subset_numeric,
geom = "point", palette = "jco",
stand = FALSE, ellipse = FALSE) +
theme_minimal() +
labs(title = "DBSCAN",
subtitle = paste("Clusters:", n_clusters_dbscan, "| Ruido:", n_ruido))
grid.arrange(p1, p2, p3, ncol = 3)mall_db <- mall %>%
mutate(cluster_dbscan = dbscan_result$cluster)mall_db %>%
filter(cluster_dbscan != 0) %>% # excluir ruido
group_by(cluster_dbscan) %>%
summarise(
age = mean(age),
annual_income = mean(annual_income),
spending_score = mean(spending_score),
n = n()
)# A tibble: 2 × 5
cluster_dbscan age annual_income spending_score n
<dbl> <dbl> <dbl> <dbl> <int>
1 1 41.0 51.6 45.3 137
2 2 32.7 82.5 82.8 35
pal_dbscan <- c(
"0" = "grey60", # ruido
"1" = "red", # cluster 1
"2" = "blue" # cluster 2
)
colores_db <- pal_dbscan[as.character(mall_db$cluster_dbscan)]library(scatterplot3d)
scatterplot3d(mall_db$age,
mall_db$annual_income,
mall_db$spending_score,
color = colores_db,
pch = 19,
xlab = "Edad",
ylab = "Ingreso (k$)",
zlab = "Spending Score",
main = "Visualización 3D – DBSCAN (k = 2)")Con kmeans
mall |>
mutate(cluster_kmeans=km$cluster) -> mall.k
mall.k |>
group_by(cluster_kmeans) |>
summarise(age = mean(age),
annual_income = mean(annual_income),
spending_score = mean(spending_score))# A tibble: 2 × 4
cluster_kmeans age annual_income spending_score
<int> <dbl> <dbl> <dbl>
1 1 48.7 60.5 32.2
2 2 28.4 60.6 69.3
# Gráfico 3D
library(scatterplot3d)
colores <- c("red", "blue", "green", "orange", "purple")[mall.k$cluster_kmeans]
scatterplot3d(mall.k$age,
mall.k$annual_income,
mall.k$spending_score,
color = colores,
pch = 19,
xlab = "Edad",
ylab = "Ingreso (k$)",
zlab = "Spending Score",
main = paste("Visualización 3D - kmeans"))