Descargar Librerías

library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(scales)
## Warning: package 'scales' was built under R version 4.4.3

Leer base de datos

df <- read_excel("C:/Users/rrobl/Downloads/supermarket.xlsx")
## Warning: Expecting numeric in A522063 / R522063C1: got 'A563185'
## Warning: Expecting numeric in A522064 / R522064C1: got 'A563186'
## Warning: Expecting numeric in A522065 / R522065C1: got 'A563187'
str(df)
## tibble [522,064 × 8] (S3: tbl_df/tbl/data.frame)
##  $ BillNo    : num [1:522064] 536365 536365 536365 536365 536365 ...
##  $ Itemname  : chr [1:522064] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity  : num [1:522064] 6 6 8 6 6 2 6 6 6 32 ...
##  $ Date      : POSIXct[1:522064], format: "2010-12-01" "2010-12-01" ...
##  $ Time      : POSIXct[1:522064], format: "1899-12-31 08:26:00" "1899-12-31 08:26:00" ...
##  $ Price     : num [1:522064] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID: num [1:522064] 17850 17850 17850 17850 17850 ...
##  $ Country   : chr [1:522064] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
summary(df)
##      BillNo         Itemname            Quantity       
##  Min.   :536365   Length:522064      Min.   :-9600.00  
##  1st Qu.:547892   Class :character   1st Qu.:    1.00  
##  Median :560603   Mode  :character   Median :    3.00  
##  Mean   :559951                      Mean   :   10.09  
##  3rd Qu.:571892                      3rd Qu.:   10.00  
##  Max.   :581587                      Max.   :80995.00  
##  NA's   :3                                             
##       Date                             Time                       
##  Min.   :2010-12-01 00:00:00.00   Min.   :1899-12-31 06:20:00.00  
##  1st Qu.:2011-03-28 00:00:00.00   1st Qu.:1899-12-31 11:48:00.00  
##  Median :2011-07-20 00:00:00.00   Median :1899-12-31 13:37:00.00  
##  Mean   :2011-07-03 23:15:13.16   Mean   :1899-12-31 13:36:07.61  
##  3rd Qu.:2011-10-19 00:00:00.00   3rd Qu.:1899-12-31 15:30:00.00  
##  Max.   :2011-12-09 00:00:00.00   Max.   :1899-12-31 20:18:00.00  
##                                                                   
##      Price              CustomerID       Country         
##  Min.   :-11062.060   Min.   :12346    Length:522064     
##  1st Qu.:     1.250   1st Qu.:13950    Class :character  
##  Median :     2.080   Median :15265    Mode  :character  
##  Mean   :     3.827   Mean   :15317                      
##  3rd Qu.:     4.130   3rd Qu.:16837                      
##  Max.   : 13541.330   Max.   :18287                      
##                       NA's   :134041

Limpiar base de datos

# Eliminar NA
df <- na.omit(df)

# Eliminar cantidades negativas (devoluciones)
df <- df %>% filter(Quantity > 0, Price > 0)

# Crear columna Total por línea
df <- df %>%
  mutate(Total = Quantity * Price)

Construir Variables para Clustering (por cliente)

df_cliente <- df %>%
  group_by(CustomerID) %>%
  summarise(
    Frecuencia = n_distinct(BillNo),
    Gasto = sum(Total)
  )

head(df_cliente)
## # A tibble: 6 × 3
##   CustomerID Frecuencia  Gasto
##        <dbl>      <int>  <dbl>
## 1      12346          1 77184.
## 2      12347          7  4310 
## 3      12349          1  1758.
## 4      12350          1   334.
## 5      12352          8  2506.
## 6      12353          1    89
summary(df_cliente)
##    CustomerID      Frecuencia          Gasto          
##  Min.   :12346   Min.   :  1.000   Min.   :     3.75  
##  1st Qu.:13832   1st Qu.:  1.000   1st Qu.:   306.72  
##  Median :15322   Median :  2.000   Median :   668.84  
##  Mean   :15316   Mean   :  4.227   Mean   :  1993.60  
##  3rd Qu.:16790   3rd Qu.:  5.000   3rd Qu.:  1652.79  
##  Max.   :18287   Max.   :209.000   Max.   :280206.02

Estadística Descriptiva

df_cliente %>%
  summarise(
    Frecuencia_promedio = mean(Frecuencia),
    Gasto_promedio = mean(Gasto),
    Frecuencia_sd = sd(Frecuencia),
    Gasto_sd = sd(Gasto)
  )
## # A tibble: 1 × 4
##   Frecuencia_promedio Gasto_promedio Frecuencia_sd Gasto_sd
##                 <dbl>          <dbl>         <dbl>    <dbl>
## 1                4.23          1994.          7.08    8589.

Escalar Datos

df_scaled <- scale(df_cliente[,c("Frecuencia","Gasto")])

Optimización del Número de Clusters

set.seed(123)

fviz_nbclust(df_scaled, kmeans, method = "wss")

fviz_nbclust(df_scaled, kmeans, method = "silhouette")

Crear 4 Clusters

set.seed(123)

k4 <- kmeans(df_scaled, centers = 4, nstart = 25)

df_cliente$cluster <- as.factor(k4$cluster)

table(df_cliente$cluster)
## 
##    1    2    3    4 
##   19  349    5 3923

Resumen por Cluster

cluster_summary <- df_cliente %>%
  group_by(cluster) %>%
  summarise(
    Frecuencia_prom = mean(Frecuencia),
    Gasto_prom = mean(Gasto),
    Clientes = n()
  )

cluster_summary
## # A tibble: 4 × 4
##   cluster Frecuencia_prom Gasto_prom Clientes
##   <fct>             <dbl>      <dbl>    <int>
## 1 1                 69.9      44816.       19
## 2 2                 16.2       8074.      349
## 3 3                 40.4     205560.        5
## 4 4                  2.80       986.     3923

Nombres Estratégicos a Clusters

df_cliente <- df_cliente %>%
  mutate(
    Nombre_Cluster = case_when(
      cluster == 1 ~ "Premium",
      cluster == 2 ~ "Frecuentes Bajo Gasto",
      cluster == 3 ~ "Ocasionales",
      cluster == 4 ~ "Alto Gasto Baja Frecuencia"
    )
  )

Tabla Clara y con Colores

cluster_summary %>%
  kable("html", caption = "Resumen de Clusters") %>%
  kable_styling(bootstrap_options = c("striped","hover")) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#2C3E50") %>%
  row_spec(1, background = "#AED6F1") %>%
  row_spec(2, background = "#ABEBC6") %>%
  row_spec(3, background = "#F9E79F") %>%
  row_spec(4, background = "#F5B7B1")
Resumen de Clusters
cluster Frecuencia_prom Gasto_prom Clientes
1 69.894737 44816.4768 19
2 16.206304 8074.0334 349
3 40.400000 205560.2280 5
4 2.797094 985.8207 3923

Gráfica Final

frecuencia_prom <- mean(df_cliente$Frecuencia)
gasto_prom <- mean(df_cliente$Gasto)

ggplot(df_cliente, aes(x = Frecuencia, y = Gasto, color = Nombre_Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  geom_vline(xintercept = frecuencia_prom,
             linetype = "dashed",
             color = "black",
             size = 1.2) +
  geom_hline(yintercept = gasto_prom,
             linetype = "dashed",
             color = "black",
             size = 1.2) +
  labs(title = "Segmentación de Clientes",
       x = "Frecuencia de Compra (→ Mayor frecuencia)",
       y = "Gasto Total (↑ Mayor gasto)",
       color = "Cluster") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Características y Recomendaciones por Cluster

Premium

Alta frecuencia Alto gasto

  • Estrategia: Programa VIP, descuentos exclusivos

Frecuentes Bajo Gasto

Alta frecuencia Bajo gasto

  • Estrategia: aumentar ticket promedio con bundles

Ocasionales

Baja frecuencia Bajo gasto

  • Estrategia: campañas de reactivación

Alto Gasto Baja Frecuencia

Baja frecuencia Alto gasto

  • Estrategia: incentivar visitas recurrentes

Conclusión Técnica

El clustering permitió segmentar clientes con base en comportamiento real de compra (frecuencia y gasto), generando segmentos accionables para estrategias de marketing y maximización de rentabilidad.

LS0tDQp0aXRsZTogIkFiYXJyb3RlcyINCmF1dGhvcjogIlJ1YmVuIFJvYmxlcyBBMDA4NDA2OTIiDQpkYXRlOiAiMjAyNi0wMi0yMyINCm91dHB1dDogDQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdG9jOiBUUlVFDQogICAgdG9jX2Zsb2F0OiBUUlVFDQogICAgY29kZV9kb3dubG9hZDogVFJVRQ0KICAgIHRoZW1lOiB5ZXRpDQotLS0NCg0KPGNlbnRlcj4hW10oaHR0cHM6Ly9pLnBpbmltZy5jb20vb3JpZ2luYWxzLzdlLzUwLzQ5LzdlNTA0OTRjZWI2NjdjMjg1ZjRjOWFmZWJiOTFhNTE3LmdpZik8L2NlbnRlcj4NCg0KIyBbRGVzY2FyZ2FyIExpYnJlcsOtYXNde3N0eWxlPSJjb2xvcjogYmx1ZTsifQ0KDQpgYGB7cn0NCmxpYnJhcnkocmVhZHhsKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkoY2x1c3RlcikNCmxpYnJhcnkoZmFjdG9leHRyYSkNCmxpYnJhcnkoa2FibGVFeHRyYSkNCmxpYnJhcnkobHVicmlkYXRlKQ0KbGlicmFyeShzY2FsZXMpDQoNCmBgYA0KDQojIFtMZWVyIGJhc2UgZGUgZGF0b3Nde3N0eWxlPSJjb2xvcjogYmx1ZTsifQ0KDQpgYGB7cn0NCmRmIDwtIHJlYWRfZXhjZWwoIkM6L1VzZXJzL3Jyb2JsL0Rvd25sb2Fkcy9zdXBlcm1hcmtldC54bHN4IikNCg0Kc3RyKGRmKQ0Kc3VtbWFyeShkZikNCg0KDQpgYGANCg0KIyBbTGltcGlhciBiYXNlIGRlIGRhdG9zXXtzdHlsZT0iY29sb3I6IGJsdWU7In0NCg0KYGBge3J9DQojIEVsaW1pbmFyIE5BDQpkZiA8LSBuYS5vbWl0KGRmKQ0KDQojIEVsaW1pbmFyIGNhbnRpZGFkZXMgbmVnYXRpdmFzIChkZXZvbHVjaW9uZXMpDQpkZiA8LSBkZiAlPiUgZmlsdGVyKFF1YW50aXR5ID4gMCwgUHJpY2UgPiAwKQ0KDQojIENyZWFyIGNvbHVtbmEgVG90YWwgcG9yIGzDrW5lYQ0KZGYgPC0gZGYgJT4lDQogIG11dGF0ZShUb3RhbCA9IFF1YW50aXR5ICogUHJpY2UpDQoNCmBgYA0KDQojIFtDb25zdHJ1aXIgVmFyaWFibGVzIHBhcmEgQ2x1c3RlcmluZyAocG9yIGNsaWVudGUpXXtzdHlsZT0iY29sb3I6IGJsdWU7In0NCg0KYGBge3J9DQpkZl9jbGllbnRlIDwtIGRmICU+JQ0KICBncm91cF9ieShDdXN0b21lcklEKSAlPiUNCiAgc3VtbWFyaXNlKA0KICAgIEZyZWN1ZW5jaWEgPSBuX2Rpc3RpbmN0KEJpbGxObyksDQogICAgR2FzdG8gPSBzdW0oVG90YWwpDQogICkNCg0KaGVhZChkZl9jbGllbnRlKQ0Kc3VtbWFyeShkZl9jbGllbnRlKQ0KDQpgYGANCg0KIyBbRXN0YWTDrXN0aWNhIERlc2NyaXB0aXZhXXtzdHlsZT0iY29sb3I6IGJsdWU7In0NCg0KYGBge3J9DQpkZl9jbGllbnRlICU+JQ0KICBzdW1tYXJpc2UoDQogICAgRnJlY3VlbmNpYV9wcm9tZWRpbyA9IG1lYW4oRnJlY3VlbmNpYSksDQogICAgR2FzdG9fcHJvbWVkaW8gPSBtZWFuKEdhc3RvKSwNCiAgICBGcmVjdWVuY2lhX3NkID0gc2QoRnJlY3VlbmNpYSksDQogICAgR2FzdG9fc2QgPSBzZChHYXN0bykNCiAgKQ0KDQpgYGANCg0KIyBbRXNjYWxhciBEYXRvc117c3R5bGU9ImNvbG9yOiBibHVlOyJ9DQoNCmBgYHtyfQ0KZGZfc2NhbGVkIDwtIHNjYWxlKGRmX2NsaWVudGVbLGMoIkZyZWN1ZW5jaWEiLCJHYXN0byIpXSkNCg0KYGBgDQoNCiMgW09wdGltaXphY2nDs24gZGVsIE7Dum1lcm8gZGUgQ2x1c3RlcnNde3N0eWxlPSJjb2xvcjogYmx1ZTsifQ0KDQpgYGB7cn0NCnNldC5zZWVkKDEyMykNCg0KZnZpel9uYmNsdXN0KGRmX3NjYWxlZCwga21lYW5zLCBtZXRob2QgPSAid3NzIikNCmZ2aXpfbmJjbHVzdChkZl9zY2FsZWQsIGttZWFucywgbWV0aG9kID0gInNpbGhvdWV0dGUiKQ0KDQpgYGANCg0KIyBbQ3JlYXIgNCBDbHVzdGVyc117c3R5bGU9ImNvbG9yOiBibHVlOyJ9DQoNCmBgYHtyfQ0Kc2V0LnNlZWQoMTIzKQ0KDQprNCA8LSBrbWVhbnMoZGZfc2NhbGVkLCBjZW50ZXJzID0gNCwgbnN0YXJ0ID0gMjUpDQoNCmRmX2NsaWVudGUkY2x1c3RlciA8LSBhcy5mYWN0b3IoazQkY2x1c3RlcikNCg0KdGFibGUoZGZfY2xpZW50ZSRjbHVzdGVyKQ0KDQpgYGANCg0KIyBbUmVzdW1lbiBwb3IgQ2x1c3Rlcl17c3R5bGU9ImNvbG9yOiBibHVlOyJ9DQoNCmBgYHtyfQ0KY2x1c3Rlcl9zdW1tYXJ5IDwtIGRmX2NsaWVudGUgJT4lDQogIGdyb3VwX2J5KGNsdXN0ZXIpICU+JQ0KICBzdW1tYXJpc2UoDQogICAgRnJlY3VlbmNpYV9wcm9tID0gbWVhbihGcmVjdWVuY2lhKSwNCiAgICBHYXN0b19wcm9tID0gbWVhbihHYXN0byksDQogICAgQ2xpZW50ZXMgPSBuKCkNCiAgKQ0KDQpjbHVzdGVyX3N1bW1hcnkNCg0KYGBgDQoNCiMgW05vbWJyZXMgRXN0cmF0w6lnaWNvcyBhIENsdXN0ZXJzXXtzdHlsZT0iY29sb3I6IGJsdWU7In0NCg0KYGBge3J9DQpkZl9jbGllbnRlIDwtIGRmX2NsaWVudGUgJT4lDQogIG11dGF0ZSgNCiAgICBOb21icmVfQ2x1c3RlciA9IGNhc2Vfd2hlbigNCiAgICAgIGNsdXN0ZXIgPT0gMSB+ICJQcmVtaXVtIiwNCiAgICAgIGNsdXN0ZXIgPT0gMiB+ICJGcmVjdWVudGVzIEJham8gR2FzdG8iLA0KICAgICAgY2x1c3RlciA9PSAzIH4gIk9jYXNpb25hbGVzIiwNCiAgICAgIGNsdXN0ZXIgPT0gNCB+ICJBbHRvIEdhc3RvIEJhamEgRnJlY3VlbmNpYSINCiAgICApDQogICkNCg0KDQpgYGANCg0KIyBbVGFibGEgQ2xhcmEgeSBjb24gQ29sb3Jlc117c3R5bGU9ImNvbG9yOiBibHVlOyJ9DQoNCmBgYHtyfQ0KY2x1c3Rlcl9zdW1tYXJ5ICU+JQ0KICBrYWJsZSgiaHRtbCIsIGNhcHRpb24gPSAiUmVzdW1lbiBkZSBDbHVzdGVycyIpICU+JQ0KICBrYWJsZV9zdHlsaW5nKGJvb3RzdHJhcF9vcHRpb25zID0gYygic3RyaXBlZCIsImhvdmVyIikpICU+JQ0KICByb3dfc3BlYygwLCBib2xkID0gVFJVRSwgY29sb3IgPSAid2hpdGUiLCBiYWNrZ3JvdW5kID0gIiMyQzNFNTAiKSAlPiUNCiAgcm93X3NwZWMoMSwgYmFja2dyb3VuZCA9ICIjQUVENkYxIikgJT4lDQogIHJvd19zcGVjKDIsIGJhY2tncm91bmQgPSAiI0FCRUJDNiIpICU+JQ0KICByb3dfc3BlYygzLCBiYWNrZ3JvdW5kID0gIiNGOUU3OUYiKSAlPiUNCiAgcm93X3NwZWMoNCwgYmFja2dyb3VuZCA9ICIjRjVCN0IxIikNCg0KYGBgDQoNCiMgW0dyw6FmaWNhIEZpbmFsXXtzdHlsZT0iY29sb3I6IGJsdWU7In0NCg0KYGBge3J9DQpmcmVjdWVuY2lhX3Byb20gPC0gbWVhbihkZl9jbGllbnRlJEZyZWN1ZW5jaWEpDQpnYXN0b19wcm9tIDwtIG1lYW4oZGZfY2xpZW50ZSRHYXN0bykNCg0KZ2dwbG90KGRmX2NsaWVudGUsIGFlcyh4ID0gRnJlY3VlbmNpYSwgeSA9IEdhc3RvLCBjb2xvciA9IE5vbWJyZV9DbHVzdGVyKSkgKw0KICBnZW9tX3BvaW50KHNpemUgPSAzLCBhbHBoYSA9IDAuNykgKw0KICBnZW9tX3ZsaW5lKHhpbnRlcmNlcHQgPSBmcmVjdWVuY2lhX3Byb20sDQogICAgICAgICAgICAgbGluZXR5cGUgPSAiZGFzaGVkIiwNCiAgICAgICAgICAgICBjb2xvciA9ICJibGFjayIsDQogICAgICAgICAgICAgc2l6ZSA9IDEuMikgKw0KICBnZW9tX2hsaW5lKHlpbnRlcmNlcHQgPSBnYXN0b19wcm9tLA0KICAgICAgICAgICAgIGxpbmV0eXBlID0gImRhc2hlZCIsDQogICAgICAgICAgICAgY29sb3IgPSAiYmxhY2siLA0KICAgICAgICAgICAgIHNpemUgPSAxLjIpICsNCiAgbGFicyh0aXRsZSA9ICJTZWdtZW50YWNpw7NuIGRlIENsaWVudGVzIiwNCiAgICAgICB4ID0gIkZyZWN1ZW5jaWEgZGUgQ29tcHJhICjihpIgTWF5b3IgZnJlY3VlbmNpYSkiLA0KICAgICAgIHkgPSAiR2FzdG8gVG90YWwgKOKGkSBNYXlvciBnYXN0bykiLA0KICAgICAgIGNvbG9yID0gIkNsdXN0ZXIiKSArDQogIHRoZW1lX21pbmltYWwoKQ0KDQpgYGANCg0KIyBbQ2FyYWN0ZXLDrXN0aWNhcyB5IFJlY29tZW5kYWNpb25lcyBwb3IgQ2x1c3Rlcl17c3R5bGU9ImNvbG9yOiBibHVlOyJ9DQoNCioqUHJlbWl1bSoqDQoNCkFsdGEgZnJlY3VlbmNpYSBBbHRvIGdhc3RvDQoNCi0gICBFc3RyYXRlZ2lhOiBQcm9ncmFtYSBWSVAsIGRlc2N1ZW50b3MgZXhjbHVzaXZvcw0KDQoqKkZyZWN1ZW50ZXMgQmFqbyBHYXN0byoqDQoNCkFsdGEgZnJlY3VlbmNpYSBCYWpvIGdhc3RvDQoNCi0gICBFc3RyYXRlZ2lhOiBhdW1lbnRhciB0aWNrZXQgcHJvbWVkaW8gY29uIGJ1bmRsZXMNCg0KKipPY2FzaW9uYWxlcyoqDQoNCkJhamEgZnJlY3VlbmNpYSBCYWpvIGdhc3RvDQoNCi0gICBFc3RyYXRlZ2lhOiBjYW1wYcOxYXMgZGUgcmVhY3RpdmFjacOzbg0KDQoqKkFsdG8gR2FzdG8gQmFqYSBGcmVjdWVuY2lhKioNCg0KQmFqYSBmcmVjdWVuY2lhIEFsdG8gZ2FzdG8NCg0KLSAgIEVzdHJhdGVnaWE6IGluY2VudGl2YXIgdmlzaXRhcyByZWN1cnJlbnRlcw0KDQojIFtDb25jbHVzacOzbiBUw6ljbmljYV17c3R5bGU9ImNvbG9yOiBibHVlOyJ9DQoNCkVsIGNsdXN0ZXJpbmcgcGVybWl0acOzIHNlZ21lbnRhciBjbGllbnRlcyBjb24gYmFzZSBlbiBjb21wb3J0YW1pZW50byByZWFsIGRlIGNvbXByYSAoZnJlY3VlbmNpYSB5IGdhc3RvKSwgZ2VuZXJhbmRvIHNlZ21lbnRvcyBhY2Npb25hYmxlcyBwYXJhIGVzdHJhdGVnaWFzIGRlIG1hcmtldGluZyB5IG1heGltaXphY2nDs24gZGUgcmVudGFiaWxpZGFkLg0K