1. Librerías

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(BTYD)
## Loading required package: hypergeo
## Loading required package: optimx
library(ggplot2)

2. Datos

set.seed(123)
transactions <- data.frame(
  cliente_id = sample(1:500, 3000, replace = TRUE),
  fecha = sample(seq(as.Date("2022-01-01"), as.Date("2025-01-01"), by="day"), 3000, replace = TRUE),
  monto = runif(3000, 10, 500)
)
transactions <- transactions %>% arrange(cliente_id, fecha)

3. RFM

end_date <- as.Date("2025-01-01")

rfm <- transactions %>%
  group_by(cliente_id) %>%
  summarise(
    frequency = n() - 1,
    recency = as.numeric(max(fecha) - min(fecha)),
    T = as.numeric(end_date - min(fecha)),
    monetary = mean(monto)
  )

4. Preparación BTYD (manteniendo cliente_id)

rfm_btyd <- rfm %>%
  select(cliente_id, frequency, recency, T) %>%
  rename(
    x = frequency,
    t.x = recency,
    T.cal = T
  ) %>%
  filter(
    x > 0,
    t.x > 0,
    T.cal > 0,
    t.x < T.cal
  ) %>%
  mutate(
    t.x = t.x / 365,
    T.cal = T.cal / 365
  ) %>%
  drop_na() %>%
  as.data.frame()

5. Modelo BG/NBD

bg_model <- bgnbd.EstimateParameters(
  rfm_btyd[, c("x","t.x","T.cal")]
)

6. Predicción de transacciones

rfm_btyd$pred_transacciones <- bgnbd.ConditionalExpectedTransactions(
  bg_model,
  t = 12,
  rfm_btyd$x,
  rfm_btyd$t.x,
  rfm_btyd$T.cal
)

7. Modelo monetario (regresión)

rfm_gg <- rfm %>%
  filter(frequency > 0, monetary > 0)

model_monetary <- lm(monetary ~ frequency, data = rfm_gg)
rfm_gg$pred_monetary <- predict(model_monetary, rfm_gg)

8. Integración modelos (JOIN)

rfm_final <- rfm_gg %>%
  left_join(
    rfm_btyd %>% select(cliente_id, pred_transacciones),
    by = "cliente_id"
  )

9. CLV

rfm_final <- rfm_final %>%
  mutate(
    CLV = pred_transacciones * pred_monetary
  )

10. Distribución CLV

ggplot(rfm_final, aes(CLV)) + 
  geom_histogram(bins = 40, fill = "steelblue", color = "black") +
  theme_minimal()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

11. Segmentación

q90 <- quantile(rfm_final$CLV, 0.90, na.rm = TRUE)
q70 <- quantile(rfm_final$CLV, 0.70, na.rm = TRUE)
q40 <- quantile(rfm_final$CLV, 0.40, na.rm = TRUE)

rfm_final <- rfm_final %>%
  mutate(
    segmento = case_when(
      CLV > q90 ~ "Top 10%",
      CLV > q70 ~ "High",
      CLV > q40 ~ "Medium",
      TRUE ~ "Low"
    )
  )

12. Top clientes

rfm_final %>% 
  arrange(desc(CLV)) %>% 
  slice_head(n = 10)
## # A tibble: 10 × 9
##    cliente_id frequency recency     T monetary pred_monetary pred_transacciones
##         <int>     <dbl>   <dbl> <dbl>    <dbl>         <dbl>              <dbl>
##  1        115        12    1043  1069     204.          248.               24.3
##  2        318        11     860   865     291.          249.               22.3
##  3        271        11    1027  1072     314.          249.               22.3
##  4        229        11     946  1037     192.          249.               22.3
##  5        471        11    1045  1094     239.          249.               22.3
##  6        424        11     988  1084     249.          249.               22.3
##  7         77        10     764   820     304.          250.               20.2
##  8        477        10     813   889     226.          250.               20.2
##  9        374        10     808   917     242.          250.               20.2
## 10        251        10     995  1018     201.          250.               20.2
## # ℹ 2 more variables: CLV <dbl>, segmento <chr>

13. Boxplot

ggplot(rfm_final, aes(x = segmento, y = CLV, fill = segmento)) + 
  geom_boxplot() +
  scale_y_log10() +
  theme_minimal()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

14. Insights

top_value <- round(sum(rfm_final$CLV[rfm_final$segmento == "Top 10%"]) / sum(rfm_final$CLV) * 100, 2)

cat(paste0(
  "El segmento Top 10% concentra aproximadamente el ", top_value, "% del CLV total. ",
  "Se recomienda enfocar estrategias de retención y personalización."
))
## El segmento Top 10% concentra aproximadamente el NA% del CLV total. Se recomienda enfocar estrategias de retención y personalización.