# Cargar librerías necesarias
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Fix the RNG seed so the simulated data set is reproducible
set.seed(123)

# Simulate the customer data set.
# NOTE: the columns are drawn in the order they are listed; reordering the
# random draws would change every simulated value.
n_customers <- 10000
customer_data <- data.frame(
  # Sequential customer identifier
  CustomerID = seq_len(n_customers),
  # Customer age
  Age = rnorm(n = n_customers, mean = 35, sd = 12),
  # Gender
  Gender = sample(x = c("Male", "Female"), size = n_customers, replace = TRUE),
  # Annual income
  AnnualIncome = rnorm(n = n_customers, mean = 60000, sd = 15000),
  # Days since the last purchase
  Recency = rnorm(n = n_customers, mean = 40, sd = 15),
  # Purchase frequency
  Frequency = rpois(n = n_customers, lambda = 12),
  # Average purchase value
  Monetary = rnorm(n = n_customers, mean = 500, sd = 300),
  # Years as a customer
  Tenure = rnorm(n = n_customers, mean = 4, sd = 2),
  # Website visits
  Visits = rpois(n = n_customers, lambda = 8),
  # Days between purchases
  AvgPurchaseInterval = rnorm(n = n_customers, mean = 25, sd = 10),
  # Responses to campaigns
  ResponseToCampaigns = rpois(n = n_customers, lambda = 2),
  # Satisfaction score, rounded to one decimal place
  CustomerSatisfaction = round(
    runif(n = n_customers, min = 1, max = 10),
    digits = 1
  ),
  # Location type
  Location = sample(
    x = c("Urban", "Suburban", "Rural"),
    size = n_customers,
    replace = TRUE
  )
)

# Show the first rows of the simulated data
head(customer_data, n = 6L)
## CustomerID Age Gender AnnualIncome Recency Frequency Monetary Tenure
## 1 1 28.27429 Male 39692.27 28.07189 8 972.4347 5.279066
## 2 2 32.23787 Male 51309.34 22.97333 18 213.6055 5.138997
## 3 3 53.70450 Male 47084.34 48.70152 10 418.0091 2.536612
## 4 4 35.84610 Female 74590.18 47.77064 16 796.5744 3.238186
## 5 5 36.55145 Female 69287.19 43.11528 13 694.1195 1.385844
## 6 6 55.58078 Female 80781.69 32.74890 18 793.9590 4.557505
## Visits AvgPurchaseInterval ResponseToCampaigns CustomerSatisfaction Location
## 1 8 16.58392 3 6.7 Suburban
## 2 8 18.29595 1 2.2 Urban
## 3 5 25.16863 3 1.9 Suburban
## 4 9 26.51544 3 7.2 Urban
## 5 8 23.42637 1 9.2 Suburban
## 6 8 17.22560 1 9.8 Urban
# Standardize the numeric variables: keep only the numeric features, then
# centre each column and divide it by its standard deviation.
numeric_vars <- select(
  customer_data,
  Age, AnnualIncome, Recency, Frequency, Monetary, Tenure, Visits,
  AvgPurchaseInterval, ResponseToCampaigns, CustomerSatisfaction
)
customer_data_scaled <- scale(numeric_vars, center = TRUE, scale = TRUE)

# Check the structure of the standardized data
head(customer_data_scaled, n = 6L)
## Age AnnualIncome Recency Frequency Monetary Tenure
## [1,] -0.55886588 -1.3283325 -0.8005587 -1.1434074 1.5814716 0.6512785
## [2,] -0.22811679 -0.5582173 -1.1424735 1.7387448 -0.9609106 0.5815418
## [3,] 1.56321124 -0.8382999 0.5828865 -0.5669770 -0.2760764 -0.7141157
## [4,] 0.07297959 0.9851079 0.5204608 1.1623143 0.9922690 -0.3648209
## [5,] 0.13183918 0.6335639 0.2082675 0.2976687 0.6490041 -1.2870521
## [6,] 1.71978138 1.3955534 -0.4869134 1.7387448 0.9835065 0.2920326
## Visits AvgPurchaseInterval ResponseToCampaigns CustomerSatisfaction
## [1,] -0.007893637 -0.83528809 0.6968079 0.4866490
## [2,] -0.007893637 -0.66524380 -0.7101742 -1.2465411
## [3,] -1.065077195 0.01737416 0.6968079 -1.3620871
## [4,] 0.344500882 0.15114453 0.6968079 0.6792256
## [5,] -0.007893637 -0.15567325 -0.7101742 1.4495323
## [6,] -0.007893637 -0.77155494 -0.7101742 1.6806243
# Elbow method: plot the total within-cluster sum of squares against the
# number of clusters.
# Extra arguments to fviz_nbclust() are forwarded to the clustering function,
# so iter.max = 100 raises kmeans' default limit of 10 iterations; with the
# default, the fits stopped with "did not converge in 10 iterations" and the
# curve was built from unconverged solutions.
# NOTE(review): if the Hartigan-Wong "Quick-TRANSfer" warning persists, consider
# passing algorithm = "Lloyd" or "MacQueen" as well — confirm on a re-run.
fviz_nbclust(
  customer_data_scaled, kmeans,
  method = "wss",
  iter.max = 100
) +
  # Dashed reference line at the number of clusters used later (k = 5)
  geom_vline(xintercept = 5, linetype = 2) +
  labs(subtitle = "El método del codo")
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 500000)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations

# Average silhouette method: plot the mean silhouette width against the
# number of clusters.
# iter.max = 100 is forwarded to kmeans; the default of 10 iterations was not
# enough for these fits to converge ("did not converge in 10 iterations").
fviz_nbclust(
  customer_data_scaled, kmeans,
  method = "silhouette",
  iter.max = 100
) +
  labs(subtitle = "El método silhouette")
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 500000)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations

# Fit K-means with 5 clusters.
# The seed is reset so the 25 random starts are reproducible.
set.seed(123)
# iter.max = 100: the default of 10 iterations produced
# "did not converge in 10 iterations" warnings, i.e. some of the random starts
# were stopped before reaching a stable partition.
kmeans_result <- kmeans(
  customer_data_scaled,
  centers = 5,
  iter.max = 100,
  nstart = 25
)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 500000)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
# Attach each customer's cluster label to the original (unscaled) data set
customer_data[["Cluster"]] <- kmeans_result[["cluster"]]
# Show the first rows of the labelled data
head(customer_data, n = 6L)
## CustomerID Age Gender AnnualIncome Recency Frequency Monetary Tenure
## 1 1 28.27429 Male 39692.27 28.07189 8 972.4347 5.279066
## 2 2 32.23787 Male 51309.34 22.97333 18 213.6055 5.138997
## 3 3 53.70450 Male 47084.34 48.70152 10 418.0091 2.536612
## 4 4 35.84610 Female 74590.18 47.77064 16 796.5744 3.238186
## 5 5 36.55145 Female 69287.19 43.11528 13 694.1195 1.385844
## 6 6 55.58078 Female 80781.69 32.74890 18 793.9590 4.557505
## Visits AvgPurchaseInterval ResponseToCampaigns CustomerSatisfaction Location
## 1 8 16.58392 3 6.7 Suburban
## 2 8 18.29595 1 2.2 Urban
## 3 5 25.16863 3 1.9 Suburban
## 4 9 26.51544 3 7.2 Urban
## 5 8 23.42637 1 9.2 Suburban
## 6 8 17.22560 1 9.8 Urban
## Cluster
## 1 3
## 2 2
## 3 1
## 4 2
## 5 4
## 6 2
# Fit K-means with 5 clusters.
# NOTE(review): this chunk repeats an earlier K-means fit in this file;
# consider keeping only one copy.
# The seed is reset so the 25 random starts are reproducible.
set.seed(123)
# iter.max = 100: the default of 10 iterations produced
# "did not converge in 10 iterations" warnings, i.e. some of the random starts
# were stopped before reaching a stable partition.
kmeans_result <- kmeans(
  customer_data_scaled,
  centers = 5,
  iter.max = 100,
  nstart = 25
)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 500000)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
# Attach each customer's cluster label to the original (unscaled) data set
customer_data[["Cluster"]] <- kmeans_result[["cluster"]]
# Show the first rows of the labelled data
head(customer_data, n = 6L)
## CustomerID Age Gender AnnualIncome Recency Frequency Monetary Tenure
## 1 1 28.27429 Male 39692.27 28.07189 8 972.4347 5.279066
## 2 2 32.23787 Male 51309.34 22.97333 18 213.6055 5.138997
## 3 3 53.70450 Male 47084.34 48.70152 10 418.0091 2.536612
## 4 4 35.84610 Female 74590.18 47.77064 16 796.5744 3.238186
## 5 5 36.55145 Female 69287.19 43.11528 13 694.1195 1.385844
## 6 6 55.58078 Female 80781.69 32.74890 18 793.9590 4.557505
## Visits AvgPurchaseInterval ResponseToCampaigns CustomerSatisfaction Location
## 1 8 16.58392 3 6.7 Suburban
## 2 8 18.29595 1 2.2 Urban
## 3 5 25.16863 3 1.9 Suburban
## 4 9 26.51544 3 7.2 Urban
## 5 8 23.42637 1 9.2 Suburban
## 6 8 17.22560 1 9.8 Urban
## Cluster
## 1 3
## 2 2
## 3 1
## 4 2
## 5 4
## 6 2
# Plot the clusters: one point per customer, with a normal-theory ellipse
# drawn around each cluster.
fviz_cluster(
  object = kmeans_result,
  data = customer_data_scaled,
  geom = "point",
  ellipse.type = "norm"
) +
  theme_minimal() +
  labs(title = "Visualización de Clusters")

# Plot the clusters: one point per customer, with a normal-theory ellipse
# drawn around each cluster.
# NOTE(review): this chunk repeats the cluster plot drawn just before it.
fviz_cluster(
  object = kmeans_result,
  data = customer_data_scaled,
  geom = "point",
  ellipse.type = "norm"
) +
  theme_minimal() +
  labs(title = "Visualización de Clusters")

# Compute the per-cluster mean of every numeric feature.
# Only numeric columns are averaged: taking mean() of the character columns
# (Gender, Location) returned NA with a warning for every group. CustomerID is
# excluded as well because it is an identifier, not a measurement.
# group_by() is kept (rather than `.by`) so the rows stay sorted by Cluster.
cluster_summary <- customer_data %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric) & !CustomerID, mean))
## Warning: There were 10 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Gender = (function (x, ...) ...`.
## ℹ In group 1: `Cluster = 1`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 9 remaining warnings.
# Print the per-cluster summary table
print(cluster_summary)
## # A tibble: 5 × 14
## Cluster CustomerID Age Gender AnnualIncome Recency Frequency Monetary Tenure
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 5120. 36.7 NA 58271. 45.3 10.4 434. 4.11
## 2 2 5020. 32.8 NA 62192. 28.8 15.4 618. 4.03
## 3 3 4975. 34.4 NA 58432. 38.5 11.4 471. 4.13
## 4 4 4875. 34.4 NA 61499. 45.7 11.7 425. 4.24
## 5 5 5010. 36.2 NA 58081. 38.7 11.5 586. 3.27
## # ℹ 5 more variables: Visits <dbl>, AvgPurchaseInterval <dbl>,
## # ResponseToCampaigns <dbl>, CustomerSatisfaction <dbl>, Location <dbl>