Análisis de Clustering de Clientes

# Cargar librerías necesarias
library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(cluster)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Fix the RNG seed so the simulated customers are identical on every run
set.seed(123)

# Simulate the customer table. The columns are generated top to bottom, so
# their order also determines which random draws each column receives.
n_customers <- 10000
customer_data <- data.frame(
  # Sequential customer identifier
  CustomerID = seq_len(n_customers),
  # Customer age
  Age = rnorm(n_customers, mean = 35, sd = 12),
  # Gender
  Gender = sample(c("Male", "Female"), n_customers, replace = TRUE),
  # Annual income
  AnnualIncome = rnorm(n_customers, mean = 60000, sd = 15000),
  # Days since the last purchase
  Recency = rnorm(n_customers, mean = 40, sd = 15),
  # Purchase frequency
  Frequency = rpois(n_customers, lambda = 12),
  # Average purchase value
  Monetary = rnorm(n_customers, mean = 500, sd = 300),
  # Years as a customer
  Tenure = rnorm(n_customers, mean = 4, sd = 2),
  # Website visits
  Visits = rpois(n_customers, lambda = 8),
  # Days between purchases
  AvgPurchaseInterval = rnorm(n_customers, mean = 25, sd = 10),
  # Responses to campaigns
  ResponseToCampaigns = rpois(n_customers, lambda = 2),
  # Satisfaction score, rounded to one decimal
  CustomerSatisfaction = round(runif(n_customers, min = 1, max = 10), 1),
  # Location type
  Location = sample(c("Urban", "Suburban", "Rural"), n_customers, replace = TRUE)
)

# Preview the first rows of the simulated data
head(customer_data, n = 6)

##   CustomerID      Age Gender AnnualIncome  Recency Frequency Monetary   Tenure
## 1          1 28.27429   Male     39692.27 28.07189         8 972.4347 5.279066
## 2          2 32.23787   Male     51309.34 22.97333        18 213.6055 5.138997
## 3          3 53.70450   Male     47084.34 48.70152        10 418.0091 2.536612
## 4          4 35.84610 Female     74590.18 47.77064        16 796.5744 3.238186
## 5          5 36.55145 Female     69287.19 43.11528        13 694.1195 1.385844
## 6          6 55.58078 Female     80781.69 32.74890        18 793.9590 4.557505
##   Visits AvgPurchaseInterval ResponseToCampaigns CustomerSatisfaction Location
## 1      8            16.58392                   3                  6.7 Suburban
## 2      8            18.29595                   1                  2.2    Urban
## 3      5            25.16863                   3                  1.9 Suburban
## 4      9            26.51544                   3                  7.2    Urban
## 5      8            23.42637                   1                  9.2 Suburban
## 6      8            17.22560                   1                  9.8    Urban

# Keep only the numeric features and standardise each column
# (centred on its mean and divided by its standard deviation)
numeric_vars <- select(
  customer_data,
  Age, AnnualIncome, Recency, Frequency, Monetary,
  Tenure, Visits, AvgPurchaseInterval, ResponseToCampaigns,
  CustomerSatisfaction
)
customer_data_scaled <- scale(numeric_vars, center = TRUE, scale = TRUE)

# Inspect the first rows of the standardised matrix
head(customer_data_scaled, n = 6)

##              Age AnnualIncome    Recency  Frequency   Monetary     Tenure
## [1,] -0.55886588   -1.3283325 -0.8005587 -1.1434074  1.5814716  0.6512785
## [2,] -0.22811679   -0.5582173 -1.1424735  1.7387448 -0.9609106  0.5815418
## [3,]  1.56321124   -0.8382999  0.5828865 -0.5669770 -0.2760764 -0.7141157
## [4,]  0.07297959    0.9851079  0.5204608  1.1623143  0.9922690 -0.3648209
## [5,]  0.13183918    0.6335639  0.2082675  0.2976687  0.6490041 -1.2870521
## [6,]  1.71978138    1.3955534 -0.4869134  1.7387448  0.9835065  0.2920326
##            Visits AvgPurchaseInterval ResponseToCampaigns CustomerSatisfaction
## [1,] -0.007893637         -0.83528809           0.6968079            0.4866490
## [2,] -0.007893637         -0.66524380          -0.7101742           -1.2465411
## [3,] -1.065077195          0.01737416           0.6968079           -1.3620871
## [4,]  0.344500882          0.15114453           0.6968079            0.6792256
## [5,] -0.007893637         -0.15567325          -0.7101742            1.4495323
## [6,] -0.007893637         -0.77155494          -0.7101742            1.6806243

# Elbow method: plot the within-cluster sum of squares for each candidate
# number of clusters.
# iter.max is forwarded to kmeans() through `...`. With the default of 10
# iterations the fits raised "did not converge in 10 iterations" warnings,
# so the curve was drawn from unconverged solutions.
fviz_nbclust(customer_data_scaled, kmeans, method = "wss", iter.max = 100) +
  # Dashed vertical line marking the number of clusters chosen below
  geom_vline(xintercept = 5, linetype = 2) +
  labs(subtitle = "El método del codo")

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 500000)

## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations

# Average silhouette method: plot the mean silhouette width for each
# candidate number of clusters.
# iter.max is forwarded to kmeans() through `...`. With the default of 10
# iterations the fits raised "did not converge in 10 iterations" warnings,
# so the silhouette widths came from unconverged solutions.
fviz_nbclust(
  customer_data_scaled,
  kmeans,
  method = "silhouette",
  iter.max = 100
) +
  labs(subtitle = "El método silhouette")

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 500000)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations

# Fit k-means with 5 clusters.
# nstart = 25 keeps the best of 25 random initialisations. iter.max is raised
# from the default of 10 because the fit was stopping before convergence
# ("did not converge in 10 iterations").
# NOTE(review): the "Quick-TRANSfer stage steps exceeded" warning is specific
# to the default Hartigan-Wong algorithm; if it persists, consider
# algorithm = "MacQueen" or "Lloyd".
set.seed(123)
kmeans_result <- kmeans(
  customer_data_scaled,
  centers = 5,
  nstart = 25,
  iter.max = 100
)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 500000)

## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations

# Store each customer's cluster assignment as a new column of the original
# (unscaled) data
customer_data[["Cluster"]] <- kmeans_result[["cluster"]]

# Preview the first rows, now including the Cluster column
head(customer_data, n = 6)

##   CustomerID      Age Gender AnnualIncome  Recency Frequency Monetary   Tenure
## 1          1 28.27429   Male     39692.27 28.07189         8 972.4347 5.279066
## 2          2 32.23787   Male     51309.34 22.97333        18 213.6055 5.138997
## 3          3 53.70450   Male     47084.34 48.70152        10 418.0091 2.536612
## 4          4 35.84610 Female     74590.18 47.77064        16 796.5744 3.238186
## 5          5 36.55145 Female     69287.19 43.11528        13 694.1195 1.385844
## 6          6 55.58078 Female     80781.69 32.74890        18 793.9590 4.557505
##   Visits AvgPurchaseInterval ResponseToCampaigns CustomerSatisfaction Location
## 1      8            16.58392                   3                  6.7 Suburban
## 2      8            18.29595                   1                  2.2    Urban
## 3      5            25.16863                   3                  1.9 Suburban
## 4      9            26.51544                   3                  7.2    Urban
## 5      8            23.42637                   1                  9.2 Suburban
## 6      8            17.22560                   1                  9.8    Urban
##   Cluster
## 1       3
## 2       2
## 3       1
## 4       2
## 5       4
## 6       2

# Fit k-means with 5 clusters.
# NOTE(review): this chunk appears to repeat an earlier k-means step in this
# document; consider keeping only one of them.
# nstart = 25 keeps the best of 25 random initialisations. iter.max is raised
# from the default of 10 because the fit was stopping before convergence
# ("did not converge in 10 iterations").
set.seed(123)
kmeans_result <- kmeans(
  customer_data_scaled,
  centers = 5,
  nstart = 25,
  iter.max = 100
)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 500000)

## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations

# Store each customer's cluster assignment as a new column of the original
# (unscaled) data
customer_data[["Cluster"]] <- kmeans_result[["cluster"]]

# Preview the first rows, now including the Cluster column
head(customer_data, n = 6)

##   CustomerID      Age Gender AnnualIncome  Recency Frequency Monetary   Tenure
## 1          1 28.27429   Male     39692.27 28.07189         8 972.4347 5.279066
## 2          2 32.23787   Male     51309.34 22.97333        18 213.6055 5.138997
## 3          3 53.70450   Male     47084.34 48.70152        10 418.0091 2.536612
## 4          4 35.84610 Female     74590.18 47.77064        16 796.5744 3.238186
## 5          5 36.55145 Female     69287.19 43.11528        13 694.1195 1.385844
## 6          6 55.58078 Female     80781.69 32.74890        18 793.9590 4.557505
##   Visits AvgPurchaseInterval ResponseToCampaigns CustomerSatisfaction Location
## 1      8            16.58392                   3                  6.7 Suburban
## 2      8            18.29595                   1                  2.2    Urban
## 3      5            25.16863                   3                  1.9 Suburban
## 4      9            26.51544                   3                  7.2    Urban
## 5      8            23.42637                   1                  9.2 Suburban
## 6      8            17.22560                   1                  9.8    Urban
##   Cluster
## 1       3
## 2       2
## 3       1
## 4       2
## 5       4
## 6       2

# Plot the fitted clusters as points, with a normal-distribution ellipse
# drawn around each cluster
fviz_cluster(
  kmeans_result,
  data = customer_data_scaled,
  ellipse.type = "norm",
  geom = "point"
) +
  labs(title = "Visualización de Clusters") +
  theme_minimal()

# Plot the fitted clusters as points, with a normal-distribution ellipse
# drawn around each cluster
fviz_cluster(
  kmeans_result,
  data = customer_data_scaled,
  ellipse.type = "norm",
  geom = "point"
) +
  labs(title = "Visualización de Clusters") +
  theme_minimal()

# Compute the per-cluster mean of every numeric feature.
# Only numeric columns are averaged: applying mean() to the character columns
# (Gender, Location) returned NA with a warning for every cluster.
# CustomerID is dropped first because the mean of an identifier carries no
# information about the cluster.
cluster_summary <- customer_data %>%
  select(-CustomerID) %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), mean), .groups = "drop")

## Warning: There were 10 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Gender = (function (x, ...) ...`.
## ℹ In group 1: `Cluster = 1`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 9 remaining warnings.

# Display the per-cluster summary table
print(x = cluster_summary)

## # A tibble: 5 × 14
##   Cluster CustomerID   Age Gender AnnualIncome Recency Frequency Monetary Tenure
##     <int>      <dbl> <dbl>  <dbl>        <dbl>   <dbl>     <dbl>    <dbl>  <dbl>
## 1       1      5120.  36.7     NA       58271.    45.3      10.4     434.   4.11
## 2       2      5020.  32.8     NA       62192.    28.8      15.4     618.   4.03
## 3       3      4975.  34.4     NA       58432.    38.5      11.4     471.   4.13
## 4       4      4875.  34.4     NA       61499.    45.7      11.7     425.   4.24
## 5       5      5010.  36.2     NA       58081.    38.7      11.5     586.   3.27
## # ℹ 5 more variables: Visits <dbl>, AvgPurchaseInterval <dbl>,
## #   ResponseToCampaigns <dbl>, CustomerSatisfaction <dbl>, Location <dbl>

Análisis de Clustering de Clientes

Edan

2024-08-29