library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.4.3
##
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
##
## Adjuntando el paquete: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(scales)
## Warning: package 'scales' was built under R version 4.4.3
df <- read_excel("C:/Users/rrobl/Downloads/supermarket.xlsx")
## Warning: Expecting numeric in A522063 / R522063C1: got 'A563185'
## Warning: Expecting numeric in A522064 / R522064C1: got 'A563186'
## Warning: Expecting numeric in A522065 / R522065C1: got 'A563187'
str(df)
## tibble [522,064 × 8] (S3: tbl_df/tbl/data.frame)
## $ BillNo : num [1:522064] 536365 536365 536365 536365 536365 ...
## $ Itemname : chr [1:522064] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : num [1:522064] 6 6 8 6 6 2 6 6 6 32 ...
## $ Date : POSIXct[1:522064], format: "2010-12-01" "2010-12-01" ...
## $ Time : POSIXct[1:522064], format: "1899-12-31 08:26:00" "1899-12-31 08:26:00" ...
## $ Price : num [1:522064] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID: num [1:522064] 17850 17850 17850 17850 17850 ...
## $ Country : chr [1:522064] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
summary(df)
## BillNo Itemname Quantity
## Min. :536365 Length:522064 Min. :-9600.00
## 1st Qu.:547892 Class :character 1st Qu.: 1.00
## Median :560603 Mode :character Median : 3.00
## Mean :559951 Mean : 10.09
## 3rd Qu.:571892 3rd Qu.: 10.00
## Max. :581587 Max. :80995.00
## NA's :3
## Date Time
## Min. :2010-12-01 00:00:00.00 Min. :1899-12-31 06:20:00.00
## 1st Qu.:2011-03-28 00:00:00.00 1st Qu.:1899-12-31 11:48:00.00
## Median :2011-07-20 00:00:00.00 Median :1899-12-31 13:37:00.00
## Mean :2011-07-03 23:15:13.16 Mean :1899-12-31 13:36:07.61
## 3rd Qu.:2011-10-19 00:00:00.00 3rd Qu.:1899-12-31 15:30:00.00
## Max. :2011-12-09 00:00:00.00 Max. :1899-12-31 20:18:00.00
##
## Price CustomerID Country
## Min. :-11062.060 Min. :12346 Length:522064
## 1st Qu.: 1.250 1st Qu.:13950 Class :character
## Median : 2.080 Median :15265 Mode :character
## Mean : 3.827 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. : 13541.330 Max. :18287
## NA's :134041
# Eliminar NA
df <- na.omit(df)
# Eliminar cantidades negativas (devoluciones)
df <- df %>% filter(Quantity > 0, Price > 0)
# Crear columna Total por línea
df <- df %>%
mutate(Total = Quantity * Price)
df_cliente <- df %>%
group_by(CustomerID) %>%
summarise(
Frecuencia = n_distinct(BillNo),
Gasto = sum(Total)
)
head(df_cliente)
## # A tibble: 6 × 3
## CustomerID Frecuencia Gasto
## <dbl> <int> <dbl>
## 1 12346 1 77184.
## 2 12347 7 4310
## 3 12349 1 1758.
## 4 12350 1 334.
## 5 12352 8 2506.
## 6 12353 1 89
summary(df_cliente)
## CustomerID Frecuencia Gasto
## Min. :12346 Min. : 1.000 Min. : 3.75
## 1st Qu.:13832 1st Qu.: 1.000 1st Qu.: 306.72
## Median :15322 Median : 2.000 Median : 668.84
## Mean :15316 Mean : 4.227 Mean : 1993.60
## 3rd Qu.:16790 3rd Qu.: 5.000 3rd Qu.: 1652.79
## Max. :18287 Max. :209.000 Max. :280206.02
df_cliente %>%
summarise(
Frecuencia_promedio = mean(Frecuencia),
Gasto_promedio = mean(Gasto),
Frecuencia_sd = sd(Frecuencia),
Gasto_sd = sd(Gasto)
)
## # A tibble: 1 × 4
## Frecuencia_promedio Gasto_promedio Frecuencia_sd Gasto_sd
## <dbl> <dbl> <dbl> <dbl>
## 1 4.23 1994. 7.08 8589.
df_scaled <- scale(df_cliente[,c("Frecuencia","Gasto")])
set.seed(123)
fviz_nbclust(df_scaled, kmeans, method = "wss")
fviz_nbclust(df_scaled, kmeans, method = "silhouette")
set.seed(123)
k4 <- kmeans(df_scaled, centers = 4, nstart = 25)
df_cliente$cluster <- as.factor(k4$cluster)
table(df_cliente$cluster)
##
## 1 2 3 4
## 19 349 5 3923
cluster_summary <- df_cliente %>%
group_by(cluster) %>%
summarise(
Frecuencia_prom = mean(Frecuencia),
Gasto_prom = mean(Gasto),
Clientes = n()
)
cluster_summary
## # A tibble: 4 × 4
## cluster Frecuencia_prom Gasto_prom Clientes
## <fct> <dbl> <dbl> <int>
## 1 1 69.9 44816. 19
## 2 2 16.2 8074. 349
## 3 3 40.4 205560. 5
## 4 4 2.80 986. 3923
df_cliente <- df_cliente %>%
mutate(
Nombre_Cluster = case_when(
cluster == 1 ~ "Premium",
cluster == 2 ~ "Frecuentes Bajo Gasto",
cluster == 3 ~ "Ocasionales",
cluster == 4 ~ "Alto Gasto Baja Frecuencia"
)
)
cluster_summary %>%
kable("html", caption = "Resumen de Clusters") %>%
kable_styling(bootstrap_options = c("striped","hover")) %>%
row_spec(0, bold = TRUE, color = "white", background = "#2C3E50") %>%
row_spec(1, background = "#AED6F1") %>%
row_spec(2, background = "#ABEBC6") %>%
row_spec(3, background = "#F9E79F") %>%
row_spec(4, background = "#F5B7B1")
| cluster | Frecuencia_prom | Gasto_prom | Clientes |
|---|---|---|---|
| 1 | 69.894737 | 44816.4768 | 19 |
| 2 | 16.206304 | 8074.0334 | 349 |
| 3 | 40.400000 | 205560.2280 | 5 |
| 4 | 2.797094 | 985.8207 | 3923 |
frecuencia_prom <- mean(df_cliente$Frecuencia)
gasto_prom <- mean(df_cliente$Gasto)
ggplot(df_cliente, aes(x = Frecuencia, y = Gasto, color = Nombre_Cluster)) +
geom_point(size = 3, alpha = 0.7) +
geom_vline(xintercept = frecuencia_prom,
linetype = "dashed",
color = "black",
size = 1.2) +
geom_hline(yintercept = gasto_prom,
linetype = "dashed",
color = "black",
size = 1.2) +
labs(title = "Segmentación de Clientes",
x = "Frecuencia de Compra (→ Mayor frecuencia)",
y = "Gasto Total (↑ Mayor gasto)",
color = "Cluster") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Premium
Alta frecuencia Alto gasto
Frecuentes Bajo Gasto
Alta frecuencia Bajo gasto
Ocasionales
Baja frecuencia Bajo gasto
Alto Gasto Baja Frecuencia
Baja frecuencia Alto gasto
El clustering permitió segmentar clientes con base en comportamiento real de compra (frecuencia y gasto), generando segmentos accionables para estrategias de marketing y maximización de rentabilidad.