library(readxl)
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(cluster)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
ruta <- "/Users/nataliamartinez/Desktop/supermarket.xlsx"
# Leer hoja "supermercado"
df <- read_excel(ruta, sheet = "supermercado")
## Warning: Expecting numeric in A522063 / R522063C1: got 'A563185'
## Warning: Expecting numeric in A522064 / R522064C1: got 'A563186'
## Warning: Expecting numeric in A522065 / R522065C1: got 'A563187'
df <- as.data.table(df)
head(df)
## BillNo Itemname Quantity Date
## <num> <char> <num> <POSc>
## 1: 536365 WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01
## 2: 536365 WHITE METAL LANTERN 6 2010-12-01
## 3: 536365 CREAM CUPID HEARTS COAT HANGER 8 2010-12-01
## 4: 536365 KNITTED UNION FLAG HOT WATER BOTTLE 6 2010-12-01
## 5: 536365 RED WOOLLY HOTTIE WHITE HEART. 6 2010-12-01
## 6: 536365 SET 7 BABUSHKA NESTING BOXES 2 2010-12-01
## Time Price CustomerID Country
## <POSc> <num> <num> <char>
## 1: 1899-12-31 08:26:00 2.55 17850 United Kingdom
## 2: 1899-12-31 08:26:00 3.39 17850 United Kingdom
## 3: 1899-12-31 08:26:00 2.75 17850 United Kingdom
## 4: 1899-12-31 08:26:00 3.39 17850 United Kingdom
## 5: 1899-12-31 08:26:00 3.39 17850 United Kingdom
## 6: 1899-12-31 08:26:00 7.65 17850 United Kingdom
summary(df)
## BillNo Itemname Quantity
## Min. :536365 Length:522064 Min. :-9600.00
## 1st Qu.:547892 Class :character 1st Qu.: 1.00
## Median :560603 Mode :character Median : 3.00
## Mean :559951 Mean : 10.09
## 3rd Qu.:571892 3rd Qu.: 10.00
## Max. :581587 Max. :80995.00
## NA's :3
## Date Time
## Min. :2010-12-01 00:00:00.00 Min. :1899-12-31 06:20:00.00
## 1st Qu.:2011-03-28 00:00:00.00 1st Qu.:1899-12-31 11:48:00.00
## Median :2011-07-20 00:00:00.00 Median :1899-12-31 13:37:00.00
## Mean :2011-07-03 23:15:13.16 Mean :1899-12-31 13:36:07.61
## 3rd Qu.:2011-10-19 00:00:00.00 3rd Qu.:1899-12-31 15:30:00.00
## Max. :2011-12-09 00:00:00.00 Max. :1899-12-31 20:18:00.00
##
## Price CustomerID Country
## Min. :-11062.060 Min. :12346 Length:522064
## 1st Qu.: 1.250 1st Qu.:13950 Class :character
## Median : 2.080 Median :15265 Mode :character
## Mean : 3.827 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. : 13541.330 Max. :18287
## NA's :134041
str(df)
## Classes 'data.table' and 'data.frame': 522064 obs. of 8 variables:
## $ BillNo : num 536365 536365 536365 536365 536365 ...
## $ Itemname : chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : num 6 6 8 6 6 2 6 6 6 32 ...
## $ Date : POSIXct, format: "2010-12-01" "2010-12-01" ...
## $ Time : POSIXct, format: "1899-12-31 08:26:00" "1899-12-31 08:26:00" ...
## $ Price : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID: num 17850 17850 17850 17850 17850 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Revisión rápida de nulos por columna
sapply(df, function(x) sum(is.na(x)))
## BillNo Itemname Quantity Date Time Price CustomerID
## 3 1455 0 0 0 0 134041
## Country
## 0
df_clean <- df %>%
filter(!is.na(CustomerID)) %>%
filter(Quantity > 0, Price > 0) %>%
mutate(
CustomerID = as.character(CustomerID),
Date = as.Date(Date),
Total = Quantity * Price
)
nrow(df); nrow(df_clean)
## [1] 522064
## [1] 387985
# 4.1 Limpieza básica de transacciones
df_clean <- df %>%
filter(!is.na(CustomerID)) %>%
filter(Quantity > 0, Price > 0) %>%
mutate(
CustomerID = as.character(CustomerID),
Date = as.Date(Date),
Total = Quantity * Price
)
# 4.2 Total por ticket (factura) por cliente
tickets <- df_clean %>%
group_by(CustomerID, BillNo) %>%
summarise(
ticket_total = sum(Total, na.rm = TRUE),
.groups = "drop"
)
# 4.3 Variables por cliente: X=Frecuencia, Y=Ticket promedio
clientes_xy <- tickets %>%
group_by(CustomerID) %>%
summarise(
Frecuencia = n_distinct(BillNo),
Ticket_Promedio = mean(ticket_total, na.rm = TRUE),
.groups = "drop"
)
# 4.4 Quitar outliers con IQR (en X y Y)
remove_outliers <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
x >= (Q1 - 1.5 * IQR_val) & x <= (Q3 + 1.5 * IQR_val)
}
clientes_xy_clean <- clientes_xy %>%
filter(
remove_outliers(Frecuencia),
remove_outliers(Ticket_Promedio)
)
cat("Clientes antes:", nrow(clientes_xy), "\n")
## Clientes antes: 4296
cat("Clientes después (sin outliers):", nrow(clientes_xy_clean), "\n")
## Clientes después (sin outliers): 3768
head(clientes_xy_clean)
## # A tibble: 6 × 3
## CustomerID Frecuencia Ticket_Promedio
## <chr> <int> <dbl>
## 1 12347 7 616.
## 2 12350 1 334.
## 3 12352 8 313.
## 4 12353 1 89
## 5 12355 1 459.
## 6 12358 2 584.
xy_scaled <- scale(clientes_xy_clean[, c("Frecuencia", "Ticket_Promedio")])
xy_scaled <- as.matrix(xy_scaled)
storage.mode(xy_scaled) <- "double"
##6.1 Gap Statistic (optimización)
set.seed(123)
optimizacion <- clusGap(
xy_scaled,
FUN = kmeans,
nstart = 25,
K.max = 10,
B = 50
)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
plot(optimizacion, xlab = "Número de clusters k", main = "Optimización (Gap Statistic)")
optimizacion$Tab
## logW E.logW gap SE.sim
## [1,] 7.386883 7.985122 0.5982387 0.005178882
## [2,] 7.112517 7.708169 0.5956522 0.006016135
## [3,] 6.874417 7.491469 0.6170521 0.004938352
## [4,] 6.723127 7.290705 0.5675780 0.005568678
## [5,] 6.597957 7.195403 0.5974462 0.005481583
## [6,] 6.507203 7.106547 0.5993444 0.005500824
## [7,] 6.431542 7.025572 0.5940306 0.005246450
## [8,] 6.369831 6.948125 0.5782942 0.005128150
## [9,] 6.305740 6.880772 0.5750320 0.006267827
## [10,] 6.236537 6.830631 0.5940941 0.005177314
set.seed(123)
k <- 3
clusters <- kmeans(xy_scaled, centers = k, nstart = 25)
clusters
## K-means clustering with 3 clusters of sizes 1002, 2089, 677
##
## Cluster means:
## Frecuencia Ticket_Promedio
## 1 -0.2829982 1.2046814
## 2 -0.4369689 -0.6160310
## 3 1.7671968 0.1178698
##
## Clustering vector:
## [1] 3 2 3 2 1 1 2 3 2 2 2 1 1 1 1 3 3 2 2 3 1 1 1 2 2 2 3 1 1 2 2 3 1 2 2 3 1
## [38] 1 2 1 2 1 1 2 2 2 1 1 2 2 1 2 3 1 1 1 1 3 1 1 1 1 3 1 1 3 3 1 1 2 3 1 2 2
## [75] 3 2 3 1 2 2 1 2 2 1 2 1 1 1 1 3 2 1 1 2 3 3 1 2 3 1 2 1 1 1 2 1 1 1 2 2 2
## [112] 2 3 2 2 2 1 2 3 1 2 1 2 1 2 2 3 2 1 1 2 3 2 2 1 2 2 3 2 3 3 3 2 2 1 2 1 3
## [149] 1 1 2 1 1 1 2 3 1 2 2 2 3 3 2 1 3 1 1 1 2 1 3 1 1 1 2 2 2 2 2 1 1 2 2 3 2
## [186] 1 2 2 1 3 2 2 3 3 2 1 1 3 2 1 2 2 1 2 2 3 1 3 1 1 1 1 2 2 3 2 2 2 1 3 3 2
## [223] 1 2 1 2 1 2 2 1 1 2 3 2 2 1 2 3 3 3 1 3 2 2 2 2 2 2 1 2 1 1 2 1 2 1 2 2 2
## [260] 1 2 2 2 2 1 3 1 3 2 3 2 2 2 1 2 1 2 2 1 1 3 2 2 1 2 2 3 1 2 3 1 1 2 1 1 3
## [297] 3 2 2 1 2 1 2 2 1 2 2 1 1 2 1 2 2 2 2 2 2 1 1 1 3 3 1 2 1 1 3 2 1 2 2 1 3
## [334] 2 2 2 3 2 1 2 1 2 2 1 3 3 3 1 3 1 2 3 2 3 2 3 1 2 2 2 2 1 2 2 1 2 1 2 1 2
## [371] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 2 2 3 1 3 2 1 1 1 2 2 1 2 2 2 1 1 2 2 3 1
## [408] 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 3 2 2 2 3 2 1 1 1 3 1 1 3
## [445] 3 2 2 1 3 2 2 1 2 2 1 1 3 3 1 2 1 2 2 2 2 2 1 1 2 1 2 1 2 2 1 1 1 2 2 1 2
## [482] 2 2 1 2 3 2 3 3 2 1 2 2 2 1 2 2 1 1 2 2 2 3 3 1 1 2 3 2 2 2 2 1 2 2 2 1 2
## [519] 3 2 1 2 2 2 2 1 2 1 3 2 2 2 1 2 2 2 1 1 2 3 3 1 2 2 2 2 2 1 2 2 1 2 2 1 1
## [556] 2 1 2 2 1 1 2 1 2 2 2 3 2 1 3 1 2 3 2 3 2 1 2 2 2 3 1 3 1 1 1 1 1 1 1 1 2
## [593] 2 2 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 2 1 2 3 1 2 2 1 2 3 2 1 3 1 2 2
## [630] 2 2 3 2 1 1 2 2 2 1 3 2 2 2 2 1 2 3 2 3 2 2 1 1 2 3 1 2 2 1 3 2 1 2 1 2 1
## [667] 2 1 2 2 1 1 3 2 3 2 2 2 1 1 2 2 3 1 2 2 3 2 2 3 1 2 2 2 2 1 2 3 1 2 2 2 2
## [704] 1 1 1 2 1 1 2 1 2 2 3 2 2 2 2 3 3 2 2 2 2 3 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2
## [741] 1 2 2 1 2 1 3 2 3 1 2 3 1 1 2 2 1 1 2 2 1 3 1 1 1 2 2 2 2 2 3 3 3 3 3 3 3
## [778] 2 2 1 2 1 2 1 2 3 2 1 3 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 3 2 2 1 2 2 3
## [815] 2 3 2 2 1 2 2 2 2 3 2 3 2 3 2 1 1 2 3 1 2 3 1 2 1 1 1 2 2 1 2 3 1 2 2 3 3
## [852] 2 3 1 2 3 3 2 2 1 2 1 3 1 1 2 2 2 2 3 3 2 2 1 1 2 1 1 2 2 3 1 2 2 3 1 2 2
## [889] 1 1 2 2 1 2 2 1 3 2 2 2 3 2 1 2 2 2 2 3 1 2 2 1 1 2 2 2 3 3 2 2 2 1 1 3 2
## [926] 1 3 2 2 3 1 2 2 2 1 2 1 3 2 2 1 2 2 2 2 1 2 1 3 2 2 1 2 1 3 2 2 1 2 1 2 1
## [963] 2 2 1 2 2 2 2 2 2 2 3 2 2 3 2 2 3 2 2 2 2 2 3 2 2 1 3 1 2 2 1 3 2 1 1 2 3
## [1000] 2 3 3 3 1 1 2 3 1 2 1 2 2 1 1 2 2 1 2 2 2 2 3 3 2 3 1 2 1 2 2 2 1 2 3 1 1
## [1037] 2 2 2 1 2 3 2 2 2 3 2 2 2 2 3 1 2 1 2 3 2 2 2 1 2 3 2 3 2 3 2 2 2 3 1 2 3
## [1074] 2 1 1 2 2 2 3 1 3 2 1 1 1 2 2 2 2 2 1 2 2 1 2 2 1 3 1 2 1 2 2 1 2 2 2 2 3
## [1111] 1 1 2 3 1 1 2 2 2 3 2 2 2 2 3 3 3 3 3 1 2 2 2 3 1 2 2 1 3 1 2 3 1 2 1 2 2
## [1148] 2 3 1 2 2 3 1 2 2 1 1 1 1 2 1 3 2 2 1 1 2 2 2 1 2 2 1 2 1 3 1 3 3 2 2 2 3
## [1185] 2 1 1 1 3 3 2 3 2 3 1 2 3 2 2 2 1 2 2 1 3 2 1 2 2 2 3 2 3 2 1 1 2 2 2 2 2
## [1222] 2 2 1 2 2 1 2 3 2 2 2 2 2 2 2 1 1 2 2 1 3 1 3 2 3 2 1 1 3 2 3 3 3 3 1 2 3
## [1259] 2 1 3 1 1 1 2 2 3 2 3 1 3 3 2 2 2 1 1 1 2 3 2 2 2 2 2 2 3 2 2 1 3 2 1 3 2
## [1296] 2 3 2 1 2 2 2 2 2 2 3 2 2 2 3 1 2 1 2 2 2 1 3 2 2 2 3 1 2 3 3 3 2 3 2 2 1
## [1333] 3 2 2 2 2 2 3 3 1 2 3 2 3 2 1 2 2 2 3 2 1 3 2 2 1 2 2 3 1 2 1 1 1 1 2 2 3
## [1370] 3 2 2 2 2 2 3 2 2 2 3 2 1 2 2 2 3 1 2 1 2 1 1 2 2 2 1 2 3 2 3 2 1 1 2 1 2
## [1407] 2 2 2 1 3 3 2 2 1 1 2 2 1 2 2 3 1 2 2 2 3 2 2 2 2 3 2 1 2 1 3 1 3 2 2 3 1
## [1444] 2 2 3 2 2 3 2 3 2 1 3 2 3 2 1 2 3 2 3 2 2 2 3 2 2 3 1 2 2 3 2 3 1 3 3 1 3
## [1481] 3 3 2 3 1 1 2 3 1 1 1 1 2 2 2 2 2 3 2 1 2 1 3 1 2 2 2 2 2 2 2 2 2 2 3 2 2
## [1518] 2 2 2 1 2 2 3 3 2 1 2 1 2 2 2 2 2 2 1 2 2 3 2 1 2 3 1 1 2 2 2 3 2 3 2 1 1
## [1555] 1 2 2 1 2 2 2 3 1 2 2 1 1 3 2 2 2 3 1 1 1 2 2 2 1 1 2 1 3 3 2 1 1 2 3 2 2
## [1592] 1 2 2 2 1 1 1 2 2 2 2 3 1 2 3 2 2 2 2 2 2 3 2 3 2 2 2 1 2 1 1 2 1 2 2 3 3
## [1629] 2 2 2 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 2 2 3 2 3 3 1 2 2 2 2 2 3 3 2 2 3 2 2
## [1666] 1 1 2 2 2 1 1 2 2 2 1 3 3 2 2 1 1 1 2 2 1 1 1 1 2 2 2 2 1 3 1 1 2 1 1 2 2
## [1703] 1 2 2 1 1 3 3 3 1 2 1 3 2 1 3 2 2 1 2 1 2 1 1 3 2 3 2 3 2 2 2 3 1 2 2 1 1
## [1740] 2 1 3 2 2 2 1 1 2 2 2 1 2 2 3 2 2 2 1 2 2 2 2 2 2 1 1 1 2 2 3 3 2 2 2 3 2
## [1777] 1 2 2 3 2 2 3 2 3 1 1 1 2 1 2 3 2 2 2 2 1 1 1 3 3 2 2 2 1 1 3 2 2 2 2 3 2
## [1814] 2 2 1 1 2 3 1 3 1 1 1 2 2 2 2 3 2 2 1 2 2 1 2 3 1 1 3 1 2 2 2 2 2 2 2 2 2
## [1851] 2 1 2 2 2 1 1 1 2 2 2 1 1 2 1 2 2 2 2 2 3 1 2 3 1 1 2 1 3 2 1 1 3 2 1 3 3
## [1888] 2 2 2 1 1 1 2 1 2 2 2 1 2 2 2 1 2 2 2 3 1 2 2 1 1 2 2 2 2 2 3 1 2 2 2 2 3
## [1925] 2 1 2 2 2 1 3 1 2 2 1 2 1 2 3 2 2 2 2 2 2 2 2 2 2 2 1 1 3 1 1 2 2 1 3 2 3
## [1962] 1 1 2 2 2 1 2 1 3 2 3 2 3 2 2 3 2 3 2 1 1 2 2 3 2 3 1 1 2 2 2 1 2 2 2 3 2
## [1999] 1 2 1 2 1 2 2 1 2 2 1 2 2 2 3 2 3 2 2 2 2 2 2 2 1 2 2 1 2 3 3 2 2 1 3 2 2
## [2036] 3 3 2 2 2 2 2 2 2 1 2 3 2 2 1 2 3 1 3 2 1 2 1 1 2 2 1 2 2 1 2 2 1 1 1 2 2
## [2073] 3 2 3 2 2 1 2 2 2 2 3 1 1 2 1 1 2 1 3 2 3 2 2 2 2 2 3 2 1 1 2 1 2 2 2 3 2
## [2110] 3 2 2 2 2 1 1 1 2 2 1 1 2 1 2 2 1 3 1 2 2 2 2 2 2 2 3 3 2 2 3 2 1 2 2 2 1
## [2147] 3 2 2 1 2 2 2 2 2 2 2 1 2 1 2 2 1 3 1 2 1 1 1 1 2 2 2 1 3 3 1 1 3 1 2 2 1
## [2184] 2 1 1 2 1 2 1 2 2 1 1 2 2 2 2 3 2 2 2 2 2 2 3 3 3 2 2 3 2 2 2 1 2 3 2 2 2
## [2221] 2 2 2 2 2 2 1 2 2 2 3 2 1 2 1 2 3 2 2 3 2 3 2 2 2 2 2 1 2 2 1 3 1 2 2 2 2
## [2258] 2 2 2 1 2 1 1 2 3 1 2 2 2 2 2 2 2 3 1 2 1 3 1 3 1 2 3 2 1 2 3 1 3 1 1 2 2
## [2295] 1 2 2 2 1 2 2 3 2 2 3 2 1 2 2 1 2 3 2 2 2 1 2 2 1 3 2 1 2 1 2 1 1 1 2 1 3
## [2332] 3 2 2 1 3 1 2 1 2 3 2 1 2 1 1 1 3 3 2 2 2 2 2 2 3 2 1 3 2 2 2 3 2 1 2 2 2
## [2369] 3 1 2 2 2 1 2 2 2 2 1 3 1 2 1 2 2 1 1 1 3 2 3 2 1 2 2 3 3 3 2 1 1 2 1 2 1
## [2406] 3 2 3 3 1 2 2 2 2 2 2 3 2 2 2 2 2 1 1 1 1 2 1 2 2 1 2 1 1 1 2 1 2 2 3 2 2
## [2443] 3 2 2 1 2 2 2 3 1 2 3 2 1 1 2 2 3 2 3 1 2 2 3 3 2 3 1 1 1 1 1 2 3 1 2 2 3
## [2480] 1 2 2 2 1 3 1 1 2 2 2 3 1 2 1 1 2 2 2 2 2 1 2 3 2 2 2 1 1 2 2 2 1 1 3 2 2
## [2517] 2 2 2 2 2 1 3 1 2 2 2 1 3 2 1 2 1 2 2 2 1 1 2 1 2 2 2 2 2 1 2 3 3 2 2 2 2
## [2554] 1 2 2 3 3 2 2 2 2 2 1 2 3 2 2 2 3 2 2 2 2 2 2 3 2 1 2 2 2 2 2 2 1 2 2 2 3
## [2591] 3 2 2 2 2 1 1 3 2 2 2 2 2 3 2 3 2 2 1 2 2 2 1 2 1 1 2 1 3 2 2 3 1 1 1 2 1
## [2628] 2 2 2 3 2 2 2 2 1 2 2 2 1 3 1 1 1 1 2 2 1 3 1 2 2 2 2 3 2 1 2 2 2 1 2 3 1
## [2665] 1 2 1 2 2 2 1 1 2 2 2 2 2 1 2 2 2 3 2 1 2 1 2 2 2 1 1 2 2 1 2 2 2 2 1 2 2
## [2702] 1 2 3 2 2 3 1 2 1 1 1 2 2 1 1 2 1 2 2 2 2 2 2 2 3 1 3 1 2 1 2 2 3 3 2 3 2
## [2739] 2 2 1 2 2 2 2 1 1 3 3 3 1 2 1 1 1 3 2 2 2 3 3 2 2 2 1 2 1 1 2 2 1 3 3 1 3
## [2776] 2 2 2 1 2 2 1 2 2 1 1 2 1 3 2 1 2 1 2 2 3 1 2 2 2 2 2 2 2 3 3 2 2 3 1 2 1
## [2813] 2 2 2 2 1 2 3 2 2 1 3 2 3 3 1 1 2 1 2 2 2 2 2 2 1 2 2 3 2 3 1 2 2 2 2 2 2
## [2850] 2 2 2 1 1 2 2 2 2 3 2 3 3 2 3 1 2 2 2 2 2 1 2 2 2 2 2 2 3 2 1 3 2 3 1 3 3
## [2887] 2 2 3 2 1 3 3 2 2 2 2 3 2 2 2 1 2 1 3 2 1 2 3 1 1 2 2 3 2 2 2 2 2 2 2 2 3
## [2924] 2 2 2 1 2 2 3 1 1 3 3 3 2 3 2 3 3 2 2 2 3 1 1 2 3 2 2 2 1 1 2 2 2 2 2 3 1
## [2961] 2 2 1 2 2 2 1 1 3 1 2 1 1 2 3 3 2 1 1 3 1 1 2 3 2 3 2 3 2 3 1 2 2 2 2 2 2
## [2998] 1 3 3 1 3 3 1 2 1 2 3 1 1 2 2 3 2 2 3 2 2 2 2 2 2 2 2 2 3 2 1 1 2 3 1 1 1
## [3035] 1 2 1 2 1 2 2 2 2 1 1 1 3 2 3 2 2 1 2 2 3 2 1 2 2 2 2 3 2 2 3 2 2 3 3 2 2
## [3072] 2 1 1 1 2 3 2 3 3 2 3 2 2 1 2 1 3 3 2 3 3 2 2 2 2 3 2 1 1 2 2 2 2 2 2 2 2
## [3109] 2 2 2 3 2 2 2 2 2 1 2 1 2 2 1 3 2 3 3 2 2 1 3 2 2 2 2 2 3 2 2 1 2 3 1 1 3
## [3146] 2 2 2 2 2 1 2 1 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 3 3 2 3 2 2 3 2 2 2 2 1
## [3183] 2 2 2 2 3 1 2 2 2 2 1 2 3 2 1 1 2 1 3 2 3 1 1 2 2 3 1 1 2 2 2 1 2 2 3 2 1
## [3220] 2 3 2 2 3 3 2 2 2 2 3 2 2 3 2 2 3 2 1 2 3 3 2 2 1 2 2 1 2 3 3 1 2 2 2 1 1
## [3257] 2 2 2 3 1 2 2 1 3 2 3 1 2 2 2 2 3 1 2 3 2 2 1 2 2 3 3 2 2 2 2 1 2 2 2 2 2
## [3294] 2 2 2 2 3 2 2 2 2 2 3 2 1 3 2 1 2 2 2 2 2 2 2 1 2 1 1 1 3 1 1 2 3 2 2 1 2
## [3331] 2 3 3 1 2 2 2 2 2 1 1 1 1 2 3 2 2 1 3 2 2 2 1 1 2 3 2 2 1 2 3 2 2 3 3 2 2
## [3368] 3 2 1 3 2 3 2 1 3 2 3 2 2 3 2 3 2 2 2 2 1 3 2 2 1 1 1 2 1 3 2 2 2 2 1 2 2
## [3405] 3 2 2 1 3 1 3 2 2 1 2 2 2 1 3 2 3 2 2 2 3 2 3 3 2 2 1 2 1 2 2 1 3 3 2 2 2
## [3442] 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 3 3 1 1 1 2 3 2 2 3 2 3 1 2 2 2 2 2 3 1 2 2
## [3479] 3 2 2 2 2 1 2 2 3 2 3 2 2 3 2 2 3 3 2 1 2 1 2 2 2 2 2 2 3 2 2 2 2 2 3 2 2
## [3516] 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 3 2 2 2 2 3 2 1 1 2 1 3 2 3 2 2 2 2 1 2 2
## [3553] 2 2 2 1 2 2 2 3 2 2 2 1 2 2 2 1 2 2 2 2 2 1 2 2 3 2 2 2 2 3 2 2 2 2 1 3 2
## [3590] 2 2 2 2 2 3 2 2 2 1 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 2 2 3 1 2 2 3 2 1 2 1 2
## [3627] 3 2 2 1 3 3 2 1 2 1 2 1 2 2 1 3 2 2 3 2 2 1 2 1 2 2 2 2 2 3 2 2 2 2 3 2 2
## [3664] 2 2 2 1 2 2 2 1 1 1 2 2 2 2 2 2 2 1 2 1 2 3 1 2 1 3 2 2 3 1 2 1 3 1 3 3 1
## [3701] 2 2 2 1 1 2 2 2 1 2 1 2 1 2 2 3 2 2 2 2 1 2 2 3 2 3 1 2 3 2 1 3 3 2 1 1 3
## [3738] 3 3 1 1 3 2 1 2 1 2 2 3 1 3 2 2 1 1 2 2 2 3 2 2 2 2 2 2 2 2 1
##
## Within cluster sum of squares by cluster:
## [1] 885.6786 954.5825 844.0239
## (between_SS / total_SS = 64.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(clusters, data = as.data.frame(xy_scaled)) +
labs(title = "Clusters (Frecuencia vs Ticket Promedio)")
clientes_xy_final <- clientes_xy_clean %>%
mutate(cluster = as.factor(clusters$cluster))
perfil <- clientes_xy_final %>%
group_by(cluster) %>%
summarise(
Clientes = n(),
Avg_Frecuencia = mean(Frecuencia),
Avg_TicketProm = mean(Ticket_Promedio),
.groups = "drop"
) %>%
arrange(cluster)
perfil
## # A tibble: 3 × 4
## cluster Clientes Avg_Frecuencia Avg_TicketProm
## <fct> <int> <dbl> <dbl>
## 1 1 1002 2.30 488.
## 2 2 2089 1.93 194.
## 3 3 677 7.18 313.
A partir del análisis de clustering, se identificaron tres segmentos de clientes diferenciados con base en su frecuencia de compra y ticket promedio.
Cluster 1: Estos clientes presentan un alto gasto por compra, aunque su frecuencia es moderada. Representan un segmento altamente rentable para el negocio.
Estrategia sugerida: - Programas VIP - Estrategias de upselling y cross-selling - Acciones de retención
Cluster 2: Es el grupo más grande, caracterizado por baja frecuencia y bajo gasto. Representa una oportunidad de crecimiento si se logra incrementar su engagement.
Estrategia sugerida: - Promociones y descuentos - Campañas de reactivación - Incentivos para aumentar la frecuencia
Cluster 3: Este segmento se distingue por su alta frecuencia de compra, aunque con un gasto moderado por transacción.
Estrategia sugerida: - Programas de lealtad - Recompensas por frecuencia - Estrategias para incrementar el ticket promedio (bundles, combos)