🔹 1. Librerías

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(cluster)
library(factoextra)

## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/

library(NbClust)
library(corrplot)

## corrplot 0.95 loaded

library(DataExplorer)

🔹 2. Carga de datos

# Keep the four numeric iris measurements, standardise every column
# (mean 0, sd 1) and store the result as a data frame.
data <- as.data.frame(scale(iris[, seq_len(4)]))

🔹 3. EDA inicial

# Five-number summary plus mean for every column.
data %>%
  summary()

##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064

# Distribution of each variable.
data %>%
  plot_histogram()

# Pairwise correlations, drawn as a coloured matrix.
data %>%
  cor() %>%
  corrplot(method = "color")

🔹 4. Detección de outliers

# One box per variable; points beyond the whiskers are candidate outliers.
data %>%
  boxplot(main = "Outliers")

🔹 5. PCA (reducción de dimensionalidad)

# Principal component analysis of the feature columns. The argument is
# spelled `scale.` in full (its real name in prcomp) instead of relying on
# partial matching of `scale`.
pca <- prcomp(data, scale. = TRUE)

# Standard deviation and (cumulative) proportion of variance per component.
summary(pca)

## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.7084 0.9560 0.38309 0.14393
## Proportion of Variance 0.7296 0.2285 0.03669 0.00518
## Cumulative Proportion  0.7296 0.9581 0.99482 1.00000

# Scree plot of the variance explained by each component.
pca %>%
  fviz_eig()

# Observations projected onto the first principal components.
pca %>%
  fviz_pca_ind()

🔹 6. Selección de número óptimo de clusters

Método Elbow

# Elbow method: total within-cluster sum of squares for each candidate k.
# k-means starts from random centres, so fix the seed and use several random
# starts per k (forwarded to kmeans) to make the curve reproducible.
set.seed(123)
fviz_nbclust(data, kmeans, method = "wss", nstart = 25)

Silhouette

# Average silhouette width for each candidate k.
# k-means starts from random centres, so fix the seed and use several random
# starts per k (forwarded to kmeans) to make the curve reproducible.
set.seed(123)
fviz_nbclust(data, kmeans, method = "silhouette", nstart = 25)

Gap Statistic

# Gap statistic for k = 1..10 using 50 bootstrap reference sets; the seed
# makes the bootstrap and the k-means starts reproducible.
# `FUNcluster` is written out in full instead of the partially matched `FUN`;
# `nstart` is forwarded to kmeans.
set.seed(123)
gap_stat <- clusGap(
  data,
  FUNcluster = kmeans,
  nstart = 25,
  K.max = 10,
  B = 50
)
fviz_gap_stat(gap_stat)

🔹 7. Modelo KMeans

# Final model: k-means with 3 centres and 50 random starts. The seed fixes
# the random starts so the cluster numbering is reproducible.
set.seed(123)
kmeans_model <- data %>%
  kmeans(centers = 3, nstart = 50)

🔹 8. Resultados

# Number of observations assigned to each cluster.
kmeans_model[["size"]]

## [1] 50 53 47

# Cluster centres, one row per cluster and one column per feature.
kmeans_model[["centers"]]

##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1  -1.01119138  0.85041372   -1.3006301  -1.2507035
## 2  -0.05005221 -0.88042696    0.3465767   0.2805873
## 3   1.13217737  0.08812645    0.9928284   1.0141287

🔹 9. Asignación de clusters

# Append the cluster label of every observation as a factor column.
data <- data %>%
  mutate(cluster = as.factor(kmeans_model$cluster))

🔹 10. Visualización de clusters

# fviz_cluster needs the feature columns only, so drop the label column.
data_numeric <- select(data, -cluster)

# Clusters drawn on the observations of `data_numeric`.
kmeans_model %>%
  fviz_cluster(data = data_numeric)

🔹 11. Análisis por cluster

# Mean of every feature within each cluster.
# across(everything(), mean) replaces the superseded summarise_all(); inside
# a grouped summarise, everything() leaves out the grouping column.
data %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean))

## # A tibble: 3 × 5
##   cluster Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>          <dbl>       <dbl>        <dbl>       <dbl>
## 1 1            -1.01        0.850        -1.30       -1.25 
## 2 2            -0.0501     -0.880         0.347       0.281
## 3 3             1.13        0.0881        0.993       1.01

🔹 12. Perfilamiento de clusters

# Cluster profile: mean value of every feature within each cluster, drawn as
# dodged bars (one bar per cluster and feature).
data %>%
  pivot_longer(-cluster) %>%
  group_by(cluster, name) %>%
  # Drop the grouping explicitly: the summary reaches ggplot ungrouped and
  # summarise() no longer prints its regrouping message.
  summarise(mean_value = mean(value), .groups = "drop") %>%
  ggplot(aes(name, mean_value, fill = cluster)) +
  # geom_col() is the direct form of geom_bar(stat = "identity").
  geom_col(position = "dodge") +
  theme_minimal()

🔹 13. Evaluación Silhouette

# Silhouette widths of the fitted clusters, computed on the distances
# between the four feature columns (the fifth column is the cluster label).
sil <- silhouette(
  x = kmeans_model$cluster,
  dist = dist(data[, 1:4])
)

# Silhouette plot; the call also prints the per-cluster summary.
sil %>%
  fviz_silhouette()

##   cluster size ave.sil.width
## 1       1   50          0.64
## 2       2   53          0.39
## 3       3   47          0.35

🔹 14. Estabilidad del modelo

# Stability check: refit k-means from a different random seed and compare the
# two partitions. Re-using the seed of the first fit would replay the same
# random starts and make the comparison trivially perfect.
set.seed(456)
kmeans_model2 <- kmeans(data[, 1:4], centers = 3, nstart = 50)

# Cluster numbers are arbitrary, so renumber the clusters of a fit by
# increasing Petal.Length centre before cross-tabulating.
align_labels <- function(fit) {
  new_id <- rank(fit$centers[, "Petal.Length"], ties.method = "first")
  unname(new_id[fit$cluster])
}

# A purely diagonal table means both runs found the same partition.
table(align_labels(kmeans_model), align_labels(kmeans_model2))

##    
##      1  2  3
##   1 50  0  0
##   2  0 53  0
##   3  0  0 47

🔹 15. Insights de negocio

# Print the business-insight statement.
cat(
  "Los clusters identificados permiten segmentar entidades con características similares, facilitando estrategias diferenciadas."
)

## Los clusters identificados permiten segmentar entidades con características similares, facilitando estrategias diferenciadas.

🔹 16. Recomendaciones

# Print the recommendation statement.
cat(
  "Se recomienda validar estabilidad y aplicar clustering en datasets reales con mayor dimensionalidad."
)

## Se recomienda validar estabilidad y aplicar clustering en datasets reales con mayor dimensionalidad.

Clustering Avanzado - KMeans

Marcelo Callao Pimentel

2025-10-10

🔹 1. Librerías

🔹 2. Carga de datos

🔹 3. EDA inicial

🔹 4. Detección de outliers

🔹 5. PCA (reducción de dimensionalidad)

🔹 6. Selección de número óptimo de clusters

Método Elbow

Silhouette

Gap Statistic

🔹 7. Modelo KMeans

🔹 8. Resultados

🔹 9. Asignación de clusters

🔹 10. Visualización de clusters

🔹 11. Análisis por cluster

🔹 12. Perfilamiento de clusters

🔹 13. Evaluación Silhouette

🔹 14. Estabilidad del modelo

🔹 15. Insights de negocio

🔹 16. Recomendaciones