🔹 1. Librerías
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
library(factoextra)
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(NbClust)
library(corrplot)
## corrplot 0.95 loaded
library(DataExplorer)
🔹 2. Carga de datos
# Build the modelling table: take the four numeric iris measurements,
# standardize them column-wise with scale(), and convert the resulting
# matrix back to a data frame.
data <- as.data.frame(scale(iris[, 1:4]))
🔹 3. EDA inicial
# Per-column summary statistics (quartiles, mean, min, max) of `data`.
data %>% summary()
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
# Histograms of the columns of `data`.
data %>% plot_histogram()

# Colour-coded plot of the pairwise correlation matrix of `data`.
data %>%
  cor() %>%
  corrplot(method = "color")

🔹 4. Detección de outliers
# One box per column of `data`; points drawn beyond the whiskers are the
# outlier candidates.
boxplot(x = data, main = "Outliers")

🔹 5. PCA (reducción de dimensionalidad)
# Principal component analysis of the columns of `data`.
# `scale.` (with the trailing dot) is the formal argument of prcomp();
# spelling it out avoids relying on partial matching of `scale`.
# The columns were already passed through scale() when `data` was built, so
# the flag is effectively a no-op here and is kept only to state the intent.
pca <- prcomp(data, scale. = TRUE)

# Standard deviation and (cumulative) proportion of variance per component.
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.7084 0.9560 0.38309 0.14393
## Proportion of Variance 0.7296 0.2285 0.03669 0.00518
## Cumulative Proportion 0.7296 0.9581 0.99482 1.00000
# Plot of the eigenvalues / explained variance of the fitted PCA.
fviz_eig(X = pca)

# Plot of the individual observations in the PCA space.
fviz_pca_ind(X = pca)

🔹 6. Selección de número óptimo de clusters
Método Elbow
# Elbow method: within-cluster sum of squares for each candidate number of
# clusters.
# kmeans() starts from randomly chosen centres, so the seed is fixed and
# several restarts are requested (`nstart` is forwarded to kmeans() through
# `...`). This mirrors the seed and nstart used for the gap statistic below
# and makes the curve reproducible between runs.
set.seed(123)
fviz_nbclust(data, kmeans, method = "wss", nstart = 25)

Silhouette
# Average silhouette width for each candidate number of clusters.
# kmeans() starts from randomly chosen centres, so the seed is fixed and
# several restarts are requested (`nstart` is forwarded to kmeans() through
# `...`). This mirrors the seed and nstart used for the gap statistic below
# and makes the curve reproducible between runs.
set.seed(123)
fviz_nbclust(data, kmeans, method = "silhouette", nstart = 25)

Gap Statistic
# Gap statistic for up to K.max = 10 clusters, using B = 50 reference data
# sets; `nstart = 25` is forwarded to kmeans() for every fit.
# The clustering function is passed under its full argument name,
# `FUNcluster`, instead of relying on partial matching of `FUN`.
set.seed(123)
gap_stat <- clusGap(
  data,
  FUNcluster = kmeans,
  K.max = 10,
  B = 50,
  nstart = 25
)
fviz_gap_stat(gap_stat)

🔹 7. Modelo KMeans
# Fit the final k-means model with three clusters. The seed is fixed before
# the call because the 50 starting configurations are drawn at random.
set.seed(123)
kmeans_model <- kmeans(x = data, centers = 3, nstart = 50)
🔹 8. Resultados
# Number of observations assigned to each cluster.
kmeans_model[["size"]]
## [1] 50 53 47
# Cluster centres: one row per cluster, one column per variable.
kmeans_model[["centers"]]
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -1.01119138 0.85041372 -1.3006301 -1.2507035
## 2 -0.05005221 -0.88042696 0.3465767 0.2805873
## 3 1.13217737 0.08812645 0.9928284 1.0141287
🔹 9. Asignación de clusters
# Attach the fitted cluster label to every row as a factor column.
# From here on `data` has a fifth, non-numeric column named `cluster`.
data[["cluster"]] <- as.factor(kmeans_model[["cluster"]])
🔹 10. Visualización clusters
# Drop the label column so that only the numeric variables are handed to the
# plotting function as `data`.
data_numeric <- select(data, -cluster)
fviz_cluster(object = kmeans_model, data = data_numeric)

🔹 11. Análisis por cluster
# Mean of every variable within each cluster (one row per cluster).
# summarise(across(everything(), mean)) replaces the superseded
# summarise_all(); everything() covers all non-grouping columns.
data %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean))
## # A tibble: 3 × 5
## cluster Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 1 -1.01 0.850 -1.30 -1.25
## 2 2 -0.0501 -0.880 0.347 0.281
## 3 3 1.13 0.0881 0.993 1.01
🔹 12. Perfilamiento de clusters
# Cluster profiles: reshape to long format (one row per observation and
# variable), average each variable within each cluster, and draw the means
# as side-by-side bars coloured by cluster.
# NOTE(review): summarise() emits a regrouping message here because the
# input is grouped by two columns; passing `.groups` would silence it, at
# which point the rendered output should be regenerated.
data %>%
  pivot_longer(cols = -cluster) %>%
  group_by(cluster, name) %>%
  summarise(mean_value = mean(value)) %>%
  ggplot(mapping = aes(x = name, y = mean_value, fill = cluster)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal()
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by cluster and name.
## ℹ Output is grouped by cluster.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(cluster, name))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.

🔹 13. Evaluación Silhouette
# Silhouette widths of the k-means partition, computed from the pairwise
# distances between the first four columns of `data` (the `cluster` label
# in column five is excluded).
sil <- silhouette(x = kmeans_model[["cluster"]], dist = dist(data[, 1:4]))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 50 0.64
## 2 2 53 0.39
## 3 3 47 0.35

🔹 14. Estabilidad del modelo
# Refit k-means on the four numeric columns and cross-tabulate the labels of
# the two fits (rows: first fit, columns: refit).
# NOTE(review): the refit reuses seed 123, the same values, and the same
# arguments as the first fit, so a perfectly diagonal table is expected by
# construction. This confirms reproducibility; assessing stability would
# need a different seed or resampled data -- confirm intent.
set.seed(123)
kmeans_model2 <- kmeans(x = data[, 1:4], centers = 3, nstart = 50)
table(kmeans_model[["cluster"]], kmeans_model2[["cluster"]])
##
## 1 2 3
## 1 50 0 0
## 2 0 53 0
## 3 0 0 47
🔹 15. Insights de negocio
# Print the business takeaway verbatim into the report.
cat(
  "Los clusters identificados permiten segmentar entidades con características similares, facilitando estrategias diferenciadas."
)
## Los clusters identificados permiten segmentar entidades con características similares, facilitando estrategias diferenciadas.
🔹 16. Recomendaciones
# Print the closing recommendation verbatim into the report.
cat(
  "Se recomienda validar estabilidad y aplicar clustering en datasets reales con mayor dimensionalidad."
)
## Se recomienda validar estabilidad y aplicar clustering en datasets reales con mayor dimensionalidad.