[HD] SING Supermarket Scene - Rosita

1 Paso 1. Instalar paquetes y llamar librerías

library(readxl)
library(data.table)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(cluster)
library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(ggplot2)

2 Paso 2. Obtener los datos

ruta <- "/Users/nataliamartinez/Desktop/supermarket.xlsx"

# Leer hoja "supermercado"
df <- read_excel(ruta, sheet = "supermercado")

## Warning: Expecting numeric in A522063 / R522063C1: got 'A563185'

## Warning: Expecting numeric in A522064 / R522064C1: got 'A563186'

## Warning: Expecting numeric in A522065 / R522065C1: got 'A563187'

df <- as.data.table(df)

head(df)

##    BillNo                            Itemname Quantity       Date
##     <num>                              <char>    <num>     <POSc>
## 1: 536365  WHITE HANGING HEART T-LIGHT HOLDER        6 2010-12-01
## 2: 536365                 WHITE METAL LANTERN        6 2010-12-01
## 3: 536365      CREAM CUPID HEARTS COAT HANGER        8 2010-12-01
## 4: 536365 KNITTED UNION FLAG HOT WATER BOTTLE        6 2010-12-01
## 5: 536365      RED WOOLLY HOTTIE WHITE HEART.        6 2010-12-01
## 6: 536365        SET 7 BABUSHKA NESTING BOXES        2 2010-12-01
##                   Time Price CustomerID        Country
##                 <POSc> <num>      <num>         <char>
## 1: 1899-12-31 08:26:00  2.55      17850 United Kingdom
## 2: 1899-12-31 08:26:00  3.39      17850 United Kingdom
## 3: 1899-12-31 08:26:00  2.75      17850 United Kingdom
## 4: 1899-12-31 08:26:00  3.39      17850 United Kingdom
## 5: 1899-12-31 08:26:00  3.39      17850 United Kingdom
## 6: 1899-12-31 08:26:00  7.65      17850 United Kingdom

3 Paso 3. Analizar la base de datos

summary(df)

##      BillNo         Itemname            Quantity       
##  Min.   :536365   Length:522064      Min.   :-9600.00  
##  1st Qu.:547892   Class :character   1st Qu.:    1.00  
##  Median :560603   Mode  :character   Median :    3.00  
##  Mean   :559951                      Mean   :   10.09  
##  3rd Qu.:571892                      3rd Qu.:   10.00  
##  Max.   :581587                      Max.   :80995.00  
##  NA's   :3                                             
##       Date                             Time                       
##  Min.   :2010-12-01 00:00:00.00   Min.   :1899-12-31 06:20:00.00  
##  1st Qu.:2011-03-28 00:00:00.00   1st Qu.:1899-12-31 11:48:00.00  
##  Median :2011-07-20 00:00:00.00   Median :1899-12-31 13:37:00.00  
##  Mean   :2011-07-03 23:15:13.16   Mean   :1899-12-31 13:36:07.61  
##  3rd Qu.:2011-10-19 00:00:00.00   3rd Qu.:1899-12-31 15:30:00.00  
##  Max.   :2011-12-09 00:00:00.00   Max.   :1899-12-31 20:18:00.00  
##                                                                   
##      Price              CustomerID       Country         
##  Min.   :-11062.060   Min.   :12346    Length:522064     
##  1st Qu.:     1.250   1st Qu.:13950    Class :character  
##  Median :     2.080   Median :15265    Mode  :character  
##  Mean   :     3.827   Mean   :15317                      
##  3rd Qu.:     4.130   3rd Qu.:16837                      
##  Max.   : 13541.330   Max.   :18287                      
##                       NA's   :134041

str(df)

## Classes 'data.table' and 'data.frame':   522064 obs. of  8 variables:
##  $ BillNo    : num  536365 536365 536365 536365 536365 ...
##  $ Itemname  : chr  "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity  : num  6 6 8 6 6 2 6 6 6 32 ...
##  $ Date      : POSIXct, format: "2010-12-01" "2010-12-01" ...
##  $ Time      : POSIXct, format: "1899-12-31 08:26:00" "1899-12-31 08:26:00" ...
##  $ Price     : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID: num  17850 17850 17850 17850 17850 ...
##  $ Country   : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
##  - attr(*, ".internal.selfref")=<externalptr>

# Revisión rápida de nulos por columna
sapply(df, function(x) sum(is.na(x)))

##     BillNo   Itemname   Quantity       Date       Time      Price CustomerID 
##          3       1455          0          0          0          0     134041 
##    Country 
##          0

4 Paso 4. Limpieza y preparación

df_clean <- df %>%
  filter(!is.na(CustomerID)) %>%
  filter(Quantity > 0, Price > 0) %>%
  mutate(
    CustomerID = as.character(CustomerID),
    Date = as.Date(Date),
    Total = Quantity * Price
  )
nrow(df); nrow(df_clean)

## [1] 522064

## [1] 387985

# 4.1 Limpieza básica de transacciones
df_clean <- df %>%
  filter(!is.na(CustomerID)) %>%
  filter(Quantity > 0, Price > 0) %>%
  mutate(
    CustomerID = as.character(CustomerID),
    Date = as.Date(Date),
    Total = Quantity * Price
  )

# 4.2 Total por ticket (factura) por cliente
tickets <- df_clean %>%
  group_by(CustomerID, BillNo) %>%
  summarise(
    ticket_total = sum(Total, na.rm = TRUE),
    .groups = "drop"
  )

# 4.3 Variables por cliente: X=Frecuencia, Y=Ticket promedio
clientes_xy <- tickets %>%
  group_by(CustomerID) %>%
  summarise(
    Frecuencia = n_distinct(BillNo),
    Ticket_Promedio = mean(ticket_total, na.rm = TRUE),
    .groups = "drop"
  )

# 4.4 Quitar outliers con IQR (en X y Y)
remove_outliers <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  x >= (Q1 - 1.5 * IQR_val) & x <= (Q3 + 1.5 * IQR_val)
}

clientes_xy_clean <- clientes_xy %>%
  filter(
    remove_outliers(Frecuencia),
    remove_outliers(Ticket_Promedio)
  )

cat("Clientes antes:", nrow(clientes_xy), "\n")

## Clientes antes: 4296

cat("Clientes después (sin outliers):", nrow(clientes_xy_clean), "\n")

## Clientes después (sin outliers): 3768

head(clientes_xy_clean)

## # A tibble: 6 × 3
##   CustomerID Frecuencia Ticket_Promedio
##   <chr>           <int>           <dbl>
## 1 12347               7            616.
## 2 12350               1            334.
## 3 12352               8            313.
## 4 12353               1             89 
## 5 12355               1            459.
## 6 12358               2            584.

5 Paso 5. Escalar los datos

xy_scaled <- scale(clientes_xy_clean[, c("Frecuencia", "Ticket_Promedio")])
xy_scaled <- as.matrix(xy_scaled)
storage.mode(xy_scaled) <- "double"

6 Paso 6. Determinar número de grupos

##6.1 Gap Statistic (optimización)

set.seed(123)

optimizacion <- clusGap(
  xy_scaled,
  FUN = kmeans,
  nstart = 25,
  K.max = 10,
  B = 50
)

## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations

plot(optimizacion, xlab = "Número de clusters k", main = "Optimización (Gap Statistic)")

optimizacion$Tab

##           logW   E.logW       gap      SE.sim
##  [1,] 7.386883 7.985122 0.5982387 0.005178882
##  [2,] 7.112517 7.708169 0.5956522 0.006016135
##  [3,] 6.874417 7.491469 0.6170521 0.004938352
##  [4,] 6.723127 7.290705 0.5675780 0.005568678
##  [5,] 6.597957 7.195403 0.5974462 0.005481583
##  [6,] 6.507203 7.106547 0.5993444 0.005500824
##  [7,] 6.431542 7.025572 0.5940306 0.005246450
##  [8,] 6.369831 6.948125 0.5782942 0.005128150
##  [9,] 6.305740 6.880772 0.5750320 0.006267827
## [10,] 6.236537 6.830631 0.5940941 0.005177314

7 Paso 7. Generar los grupos

set.seed(123)
k <- 3
clusters <- kmeans(xy_scaled, centers = k, nstart = 25)
clusters

## K-means clustering with 3 clusters of sizes 1002, 2089, 677
## 
## Cluster means:
##   Frecuencia Ticket_Promedio
## 1 -0.2829982       1.2046814
## 2 -0.4369689      -0.6160310
## 3  1.7671968       0.1178698
## 
## Clustering vector:
##    [1] 3 2 3 2 1 1 2 3 2 2 2 1 1 1 1 3 3 2 2 3 1 1 1 2 2 2 3 1 1 2 2 3 1 2 2 3 1
##   [38] 1 2 1 2 1 1 2 2 2 1 1 2 2 1 2 3 1 1 1 1 3 1 1 1 1 3 1 1 3 3 1 1 2 3 1 2 2
##   [75] 3 2 3 1 2 2 1 2 2 1 2 1 1 1 1 3 2 1 1 2 3 3 1 2 3 1 2 1 1 1 2 1 1 1 2 2 2
##  [112] 2 3 2 2 2 1 2 3 1 2 1 2 1 2 2 3 2 1 1 2 3 2 2 1 2 2 3 2 3 3 3 2 2 1 2 1 3
##  [149] 1 1 2 1 1 1 2 3 1 2 2 2 3 3 2 1 3 1 1 1 2 1 3 1 1 1 2 2 2 2 2 1 1 2 2 3 2
##  [186] 1 2 2 1 3 2 2 3 3 2 1 1 3 2 1 2 2 1 2 2 3 1 3 1 1 1 1 2 2 3 2 2 2 1 3 3 2
##  [223] 1 2 1 2 1 2 2 1 1 2 3 2 2 1 2 3 3 3 1 3 2 2 2 2 2 2 1 2 1 1 2 1 2 1 2 2 2
##  [260] 1 2 2 2 2 1 3 1 3 2 3 2 2 2 1 2 1 2 2 1 1 3 2 2 1 2 2 3 1 2 3 1 1 2 1 1 3
##  [297] 3 2 2 1 2 1 2 2 1 2 2 1 1 2 1 2 2 2 2 2 2 1 1 1 3 3 1 2 1 1 3 2 1 2 2 1 3
##  [334] 2 2 2 3 2 1 2 1 2 2 1 3 3 3 1 3 1 2 3 2 3 2 3 1 2 2 2 2 1 2 2 1 2 1 2 1 2
##  [371] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 2 2 3 1 3 2 1 1 1 2 2 1 2 2 2 1 1 2 2 3 1
##  [408] 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 3 2 2 2 3 2 1 1 1 3 1 1 3
##  [445] 3 2 2 1 3 2 2 1 2 2 1 1 3 3 1 2 1 2 2 2 2 2 1 1 2 1 2 1 2 2 1 1 1 2 2 1 2
##  [482] 2 2 1 2 3 2 3 3 2 1 2 2 2 1 2 2 1 1 2 2 2 3 3 1 1 2 3 2 2 2 2 1 2 2 2 1 2
##  [519] 3 2 1 2 2 2 2 1 2 1 3 2 2 2 1 2 2 2 1 1 2 3 3 1 2 2 2 2 2 1 2 2 1 2 2 1 1
##  [556] 2 1 2 2 1 1 2 1 2 2 2 3 2 1 3 1 2 3 2 3 2 1 2 2 2 3 1 3 1 1 1 1 1 1 1 1 2
##  [593] 2 2 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 2 1 2 3 1 2 2 1 2 3 2 1 3 1 2 2
##  [630] 2 2 3 2 1 1 2 2 2 1 3 2 2 2 2 1 2 3 2 3 2 2 1 1 2 3 1 2 2 1 3 2 1 2 1 2 1
##  [667] 2 1 2 2 1 1 3 2 3 2 2 2 1 1 2 2 3 1 2 2 3 2 2 3 1 2 2 2 2 1 2 3 1 2 2 2 2
##  [704] 1 1 1 2 1 1 2 1 2 2 3 2 2 2 2 3 3 2 2 2 2 3 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2
##  [741] 1 2 2 1 2 1 3 2 3 1 2 3 1 1 2 2 1 1 2 2 1 3 1 1 1 2 2 2 2 2 3 3 3 3 3 3 3
##  [778] 2 2 1 2 1 2 1 2 3 2 1 3 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 3 2 2 1 2 2 3
##  [815] 2 3 2 2 1 2 2 2 2 3 2 3 2 3 2 1 1 2 3 1 2 3 1 2 1 1 1 2 2 1 2 3 1 2 2 3 3
##  [852] 2 3 1 2 3 3 2 2 1 2 1 3 1 1 2 2 2 2 3 3 2 2 1 1 2 1 1 2 2 3 1 2 2 3 1 2 2
##  [889] 1 1 2 2 1 2 2 1 3 2 2 2 3 2 1 2 2 2 2 3 1 2 2 1 1 2 2 2 3 3 2 2 2 1 1 3 2
##  [926] 1 3 2 2 3 1 2 2 2 1 2 1 3 2 2 1 2 2 2 2 1 2 1 3 2 2 1 2 1 3 2 2 1 2 1 2 1
##  [963] 2 2 1 2 2 2 2 2 2 2 3 2 2 3 2 2 3 2 2 2 2 2 3 2 2 1 3 1 2 2 1 3 2 1 1 2 3
## [1000] 2 3 3 3 1 1 2 3 1 2 1 2 2 1 1 2 2 1 2 2 2 2 3 3 2 3 1 2 1 2 2 2 1 2 3 1 1
## [1037] 2 2 2 1 2 3 2 2 2 3 2 2 2 2 3 1 2 1 2 3 2 2 2 1 2 3 2 3 2 3 2 2 2 3 1 2 3
## [1074] 2 1 1 2 2 2 3 1 3 2 1 1 1 2 2 2 2 2 1 2 2 1 2 2 1 3 1 2 1 2 2 1 2 2 2 2 3
## [1111] 1 1 2 3 1 1 2 2 2 3 2 2 2 2 3 3 3 3 3 1 2 2 2 3 1 2 2 1 3 1 2 3 1 2 1 2 2
## [1148] 2 3 1 2 2 3 1 2 2 1 1 1 1 2 1 3 2 2 1 1 2 2 2 1 2 2 1 2 1 3 1 3 3 2 2 2 3
## [1185] 2 1 1 1 3 3 2 3 2 3 1 2 3 2 2 2 1 2 2 1 3 2 1 2 2 2 3 2 3 2 1 1 2 2 2 2 2
## [1222] 2 2 1 2 2 1 2 3 2 2 2 2 2 2 2 1 1 2 2 1 3 1 3 2 3 2 1 1 3 2 3 3 3 3 1 2 3
## [1259] 2 1 3 1 1 1 2 2 3 2 3 1 3 3 2 2 2 1 1 1 2 3 2 2 2 2 2 2 3 2 2 1 3 2 1 3 2
## [1296] 2 3 2 1 2 2 2 2 2 2 3 2 2 2 3 1 2 1 2 2 2 1 3 2 2 2 3 1 2 3 3 3 2 3 2 2 1
## [1333] 3 2 2 2 2 2 3 3 1 2 3 2 3 2 1 2 2 2 3 2 1 3 2 2 1 2 2 3 1 2 1 1 1 1 2 2 3
## [1370] 3 2 2 2 2 2 3 2 2 2 3 2 1 2 2 2 3 1 2 1 2 1 1 2 2 2 1 2 3 2 3 2 1 1 2 1 2
## [1407] 2 2 2 1 3 3 2 2 1 1 2 2 1 2 2 3 1 2 2 2 3 2 2 2 2 3 2 1 2 1 3 1 3 2 2 3 1
## [1444] 2 2 3 2 2 3 2 3 2 1 3 2 3 2 1 2 3 2 3 2 2 2 3 2 2 3 1 2 2 3 2 3 1 3 3 1 3
## [1481] 3 3 2 3 1 1 2 3 1 1 1 1 2 2 2 2 2 3 2 1 2 1 3 1 2 2 2 2 2 2 2 2 2 2 3 2 2
## [1518] 2 2 2 1 2 2 3 3 2 1 2 1 2 2 2 2 2 2 1 2 2 3 2 1 2 3 1 1 2 2 2 3 2 3 2 1 1
## [1555] 1 2 2 1 2 2 2 3 1 2 2 1 1 3 2 2 2 3 1 1 1 2 2 2 1 1 2 1 3 3 2 1 1 2 3 2 2
## [1592] 1 2 2 2 1 1 1 2 2 2 2 3 1 2 3 2 2 2 2 2 2 3 2 3 2 2 2 1 2 1 1 2 1 2 2 3 3
## [1629] 2 2 2 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 2 2 3 2 3 3 1 2 2 2 2 2 3 3 2 2 3 2 2
## [1666] 1 1 2 2 2 1 1 2 2 2 1 3 3 2 2 1 1 1 2 2 1 1 1 1 2 2 2 2 1 3 1 1 2 1 1 2 2
## [1703] 1 2 2 1 1 3 3 3 1 2 1 3 2 1 3 2 2 1 2 1 2 1 1 3 2 3 2 3 2 2 2 3 1 2 2 1 1
## [1740] 2 1 3 2 2 2 1 1 2 2 2 1 2 2 3 2 2 2 1 2 2 2 2 2 2 1 1 1 2 2 3 3 2 2 2 3 2
## [1777] 1 2 2 3 2 2 3 2 3 1 1 1 2 1 2 3 2 2 2 2 1 1 1 3 3 2 2 2 1 1 3 2 2 2 2 3 2
## [1814] 2 2 1 1 2 3 1 3 1 1 1 2 2 2 2 3 2 2 1 2 2 1 2 3 1 1 3 1 2 2 2 2 2 2 2 2 2
## [1851] 2 1 2 2 2 1 1 1 2 2 2 1 1 2 1 2 2 2 2 2 3 1 2 3 1 1 2 1 3 2 1 1 3 2 1 3 3
## [1888] 2 2 2 1 1 1 2 1 2 2 2 1 2 2 2 1 2 2 2 3 1 2 2 1 1 2 2 2 2 2 3 1 2 2 2 2 3
## [1925] 2 1 2 2 2 1 3 1 2 2 1 2 1 2 3 2 2 2 2 2 2 2 2 2 2 2 1 1 3 1 1 2 2 1 3 2 3
## [1962] 1 1 2 2 2 1 2 1 3 2 3 2 3 2 2 3 2 3 2 1 1 2 2 3 2 3 1 1 2 2 2 1 2 2 2 3 2
## [1999] 1 2 1 2 1 2 2 1 2 2 1 2 2 2 3 2 3 2 2 2 2 2 2 2 1 2 2 1 2 3 3 2 2 1 3 2 2
## [2036] 3 3 2 2 2 2 2 2 2 1 2 3 2 2 1 2 3 1 3 2 1 2 1 1 2 2 1 2 2 1 2 2 1 1 1 2 2
## [2073] 3 2 3 2 2 1 2 2 2 2 3 1 1 2 1 1 2 1 3 2 3 2 2 2 2 2 3 2 1 1 2 1 2 2 2 3 2
## [2110] 3 2 2 2 2 1 1 1 2 2 1 1 2 1 2 2 1 3 1 2 2 2 2 2 2 2 3 3 2 2 3 2 1 2 2 2 1
## [2147] 3 2 2 1 2 2 2 2 2 2 2 1 2 1 2 2 1 3 1 2 1 1 1 1 2 2 2 1 3 3 1 1 3 1 2 2 1
## [2184] 2 1 1 2 1 2 1 2 2 1 1 2 2 2 2 3 2 2 2 2 2 2 3 3 3 2 2 3 2 2 2 1 2 3 2 2 2
## [2221] 2 2 2 2 2 2 1 2 2 2 3 2 1 2 1 2 3 2 2 3 2 3 2 2 2 2 2 1 2 2 1 3 1 2 2 2 2
## [2258] 2 2 2 1 2 1 1 2 3 1 2 2 2 2 2 2 2 3 1 2 1 3 1 3 1 2 3 2 1 2 3 1 3 1 1 2 2
## [2295] 1 2 2 2 1 2 2 3 2 2 3 2 1 2 2 1 2 3 2 2 2 1 2 2 1 3 2 1 2 1 2 1 1 1 2 1 3
## [2332] 3 2 2 1 3 1 2 1 2 3 2 1 2 1 1 1 3 3 2 2 2 2 2 2 3 2 1 3 2 2 2 3 2 1 2 2 2
## [2369] 3 1 2 2 2 1 2 2 2 2 1 3 1 2 1 2 2 1 1 1 3 2 3 2 1 2 2 3 3 3 2 1 1 2 1 2 1
## [2406] 3 2 3 3 1 2 2 2 2 2 2 3 2 2 2 2 2 1 1 1 1 2 1 2 2 1 2 1 1 1 2 1 2 2 3 2 2
## [2443] 3 2 2 1 2 2 2 3 1 2 3 2 1 1 2 2 3 2 3 1 2 2 3 3 2 3 1 1 1 1 1 2 3 1 2 2 3
## [2480] 1 2 2 2 1 3 1 1 2 2 2 3 1 2 1 1 2 2 2 2 2 1 2 3 2 2 2 1 1 2 2 2 1 1 3 2 2
## [2517] 2 2 2 2 2 1 3 1 2 2 2 1 3 2 1 2 1 2 2 2 1 1 2 1 2 2 2 2 2 1 2 3 3 2 2 2 2
## [2554] 1 2 2 3 3 2 2 2 2 2 1 2 3 2 2 2 3 2 2 2 2 2 2 3 2 1 2 2 2 2 2 2 1 2 2 2 3
## [2591] 3 2 2 2 2 1 1 3 2 2 2 2 2 3 2 3 2 2 1 2 2 2 1 2 1 1 2 1 3 2 2 3 1 1 1 2 1
## [2628] 2 2 2 3 2 2 2 2 1 2 2 2 1 3 1 1 1 1 2 2 1 3 1 2 2 2 2 3 2 1 2 2 2 1 2 3 1
## [2665] 1 2 1 2 2 2 1 1 2 2 2 2 2 1 2 2 2 3 2 1 2 1 2 2 2 1 1 2 2 1 2 2 2 2 1 2 2
## [2702] 1 2 3 2 2 3 1 2 1 1 1 2 2 1 1 2 1 2 2 2 2 2 2 2 3 1 3 1 2 1 2 2 3 3 2 3 2
## [2739] 2 2 1 2 2 2 2 1 1 3 3 3 1 2 1 1 1 3 2 2 2 3 3 2 2 2 1 2 1 1 2 2 1 3 3 1 3
## [2776] 2 2 2 1 2 2 1 2 2 1 1 2 1 3 2 1 2 1 2 2 3 1 2 2 2 2 2 2 2 3 3 2 2 3 1 2 1
## [2813] 2 2 2 2 1 2 3 2 2 1 3 2 3 3 1 1 2 1 2 2 2 2 2 2 1 2 2 3 2 3 1 2 2 2 2 2 2
## [2850] 2 2 2 1 1 2 2 2 2 3 2 3 3 2 3 1 2 2 2 2 2 1 2 2 2 2 2 2 3 2 1 3 2 3 1 3 3
## [2887] 2 2 3 2 1 3 3 2 2 2 2 3 2 2 2 1 2 1 3 2 1 2 3 1 1 2 2 3 2 2 2 2 2 2 2 2 3
## [2924] 2 2 2 1 2 2 3 1 1 3 3 3 2 3 2 3 3 2 2 2 3 1 1 2 3 2 2 2 1 1 2 2 2 2 2 3 1
## [2961] 2 2 1 2 2 2 1 1 3 1 2 1 1 2 3 3 2 1 1 3 1 1 2 3 2 3 2 3 2 3 1 2 2 2 2 2 2
## [2998] 1 3 3 1 3 3 1 2 1 2 3 1 1 2 2 3 2 2 3 2 2 2 2 2 2 2 2 2 3 2 1 1 2 3 1 1 1
## [3035] 1 2 1 2 1 2 2 2 2 1 1 1 3 2 3 2 2 1 2 2 3 2 1 2 2 2 2 3 2 2 3 2 2 3 3 2 2
## [3072] 2 1 1 1 2 3 2 3 3 2 3 2 2 1 2 1 3 3 2 3 3 2 2 2 2 3 2 1 1 2 2 2 2 2 2 2 2
## [3109] 2 2 2 3 2 2 2 2 2 1 2 1 2 2 1 3 2 3 3 2 2 1 3 2 2 2 2 2 3 2 2 1 2 3 1 1 3
## [3146] 2 2 2 2 2 1 2 1 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 3 3 2 3 2 2 3 2 2 2 2 1
## [3183] 2 2 2 2 3 1 2 2 2 2 1 2 3 2 1 1 2 1 3 2 3 1 1 2 2 3 1 1 2 2 2 1 2 2 3 2 1
## [3220] 2 3 2 2 3 3 2 2 2 2 3 2 2 3 2 2 3 2 1 2 3 3 2 2 1 2 2 1 2 3 3 1 2 2 2 1 1
## [3257] 2 2 2 3 1 2 2 1 3 2 3 1 2 2 2 2 3 1 2 3 2 2 1 2 2 3 3 2 2 2 2 1 2 2 2 2 2
## [3294] 2 2 2 2 3 2 2 2 2 2 3 2 1 3 2 1 2 2 2 2 2 2 2 1 2 1 1 1 3 1 1 2 3 2 2 1 2
## [3331] 2 3 3 1 2 2 2 2 2 1 1 1 1 2 3 2 2 1 3 2 2 2 1 1 2 3 2 2 1 2 3 2 2 3 3 2 2
## [3368] 3 2 1 3 2 3 2 1 3 2 3 2 2 3 2 3 2 2 2 2 1 3 2 2 1 1 1 2 1 3 2 2 2 2 1 2 2
## [3405] 3 2 2 1 3 1 3 2 2 1 2 2 2 1 3 2 3 2 2 2 3 2 3 3 2 2 1 2 1 2 2 1 3 3 2 2 2
## [3442] 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 3 3 1 1 1 2 3 2 2 3 2 3 1 2 2 2 2 2 3 1 2 2
## [3479] 3 2 2 2 2 1 2 2 3 2 3 2 2 3 2 2 3 3 2 1 2 1 2 2 2 2 2 2 3 2 2 2 2 2 3 2 2
## [3516] 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 3 2 2 2 2 3 2 1 1 2 1 3 2 3 2 2 2 2 1 2 2
## [3553] 2 2 2 1 2 2 2 3 2 2 2 1 2 2 2 1 2 2 2 2 2 1 2 2 3 2 2 2 2 3 2 2 2 2 1 3 2
## [3590] 2 2 2 2 2 3 2 2 2 1 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 2 2 3 1 2 2 3 2 1 2 1 2
## [3627] 3 2 2 1 3 3 2 1 2 1 2 1 2 2 1 3 2 2 3 2 2 1 2 1 2 2 2 2 2 3 2 2 2 2 3 2 2
## [3664] 2 2 2 1 2 2 2 1 1 1 2 2 2 2 2 2 2 1 2 1 2 3 1 2 1 3 2 2 3 1 2 1 3 1 3 3 1
## [3701] 2 2 2 1 1 2 2 2 1 2 1 2 1 2 2 3 2 2 2 2 1 2 2 3 2 3 1 2 3 2 1 3 3 2 1 1 3
## [3738] 3 3 1 1 3 2 1 2 1 2 2 3 1 3 2 2 1 1 2 2 2 3 2 2 2 2 2 2 2 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 885.6786 954.5825 844.0239
##  (between_SS / total_SS =  64.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

8 Paso 8. Graficar los grupos

fviz_cluster(clusters, data = as.data.frame(xy_scaled)) +
  labs(title = "Clusters (Frecuencia vs Ticket Promedio)")

9 Paso 9. Agregar clusters a la base final

clientes_xy_final <- clientes_xy_clean %>%
  mutate(cluster = as.factor(clusters$cluster))

10 Paso 10. Perfiles

perfil <- clientes_xy_final %>%
  group_by(cluster) %>%
  summarise(
    Clientes = n(),
    Avg_Frecuencia = mean(Frecuencia),
    Avg_TicketProm = mean(Ticket_Promedio),
    .groups = "drop"
  ) %>%
  arrange(cluster)

perfil

## # A tibble: 3 × 4
##   cluster Clientes Avg_Frecuencia Avg_TicketProm
##   <fct>      <int>          <dbl>          <dbl>
## 1 1           1002           2.30           488.
## 2 2           2089           1.93           194.
## 3 3            677           7.18           313.

11 Paso 11. Conclusiones

11.1 Segmentación de Clientes

A partir del análisis de clustering, se identificaron tres segmentos de clientes diferenciados con base en su frecuencia de compra y ticket promedio.

Cluster 1: Estos clientes presentan un alto gasto por compra, aunque su frecuencia es moderada. Representan un segmento altamente rentable para el negocio.

Estrategia sugerida: - Programas VIP - Estrategias de upselling y cross-selling - Acciones de retención

Cluster 2: Es el grupo más grande, caracterizado por baja frecuencia y bajo gasto. Representa una oportunidad de crecimiento si se logra incrementar su engagement.

Estrategia sugerida: - Promociones y descuentos - Campañas de reactivación - Incentivos para aumentar la frecuencia

Cluster 3: Este segmento se distingue por su alta frecuencia de compra, aunque con un gasto moderado por transacción.

Estrategia sugerida: - Programas de lealtad - Recompensas por frecuencia - Estrategias para incrementar el ticket promedio (bundles, combos)

SuperMarket

Natalia Martinez Dominguez/Team 5

2026-02-23