# =========================
# Packages
# =========================
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(ggrepel)
# =========================
# Data: completejourney only
# =========================
USE_FULL <- TRUE
coupons <- completejourney::coupons %>% clean_names()
coupon_redemptions <- completejourney::coupon_redemptions %>% clean_names()
products <- completejourney::products %>% clean_names()
transactions <- if (USE_FULL) {
completejourney::get_transactions() %>% clean_names()
} else {
completejourney::transactions_sample %>% clean_names()
}
# =========================
# Prepare basic fields
# =========================
transactions <- transactions %>%
mutate(
transaction_date = as_date(transaction_timestamp),
paid = pmax(sales_value - coalesce(coupon_disc, 0), 0) # Consumer paid price
)
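# Optional sanity check: how often does the pmax() floor actually kick in,
# i.e. how many lines have a coupon discount larger than the recorded sales_value?
transactions %>%
  summarise(
    n_lines       = n(),
    n_floored     = sum(sales_value - coalesce(coupon_disc, 0) < 0, na.rm = TRUE),
    share_floored = mean(sales_value - coalesce(coupon_disc, 0) < 0, na.rm = TRUE)
  )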
coupon_redemptions <- coupon_redemptions %>%
mutate(redemption_date = as_date(redemption_date))
# =========================
# Step 1) Build safe joins
# - coupons: coupon-to-product mapping (deduplicated)
# - transactions: pre-aggregate to household × product × date
# =========================
coupons_map <- coupons %>%
distinct(campaign_id, coupon_upc, product_id)
tx_day <- transactions %>%
group_by(household_id, product_id, transaction_date) %>%
summarise(
quantity = sum(quantity, na.rm = TRUE),
sales_value = sum(sales_value, na.rm = TRUE),
retail_disc = sum(retail_disc, na.rm = TRUE),
coupon_disc = sum(coupon_disc, na.rm = TRUE),
coupon_match_disc = sum(coupon_match_disc, na.rm = TRUE),
    paid = sum(paid, na.rm = TRUE),  # line-level paid computed above; referencing sales_value/coupon_disc here would pick up the just-summed columns instead of the raw ones
.groups = "drop"
)
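# Optional sanity check: the pre-aggregation should leave exactly one row per
# household × product × date, so later joins cannot fan out on this side.
stopifnot(
  nrow(tx_day) ==
    nrow(distinct(tx_day, household_id, product_id, transaction_date))
)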
# =========================
# Step 2) Identify anchor redemption lines
# - match same household, same product, same day
# - if multiple matches: pick line with highest coupon_disc, then sales_value
# =========================
redeemed_candidates <- coupon_redemptions %>%
  # a coupon UPC can cover many eligible products (and a product many coupons),
  # so declare the many-to-many relationship explicitly rather than rely on the warning
  inner_join(coupons_map,
             by = c("campaign_id", "coupon_upc"),
             relationship = "many-to-many") %>%
  inner_join(
    tx_day %>% select(household_id, product_id, transaction_date, quantity,
                      sales_value, coupon_disc, coupon_match_disc, paid),
    by = c("household_id",
           "product_id" = "product_id",
           "redemption_date" = "transaction_date")
  )
redeemed_lines <- redeemed_candidates %>%
mutate(score1 = coalesce(coupon_disc, 0),
score2 = coalesce(sales_value, 0)) %>%
group_by(household_id, campaign_id, coupon_upc, redemption_date) %>%
slice_max(order_by = score1, n = 1, with_ties = TRUE) %>%
slice_max(order_by = score2, n = 1, with_ties = FALSE) %>%
ungroup() %>%
select(household_id, redemption_date, campaign_id, coupon_upc,
product_id, quantity, sales_value, coupon_disc, coupon_match_disc, paid)
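# Optional sanity check: the two-stage tie-break should leave exactly one
# transaction line per household × campaign × coupon × redemption date.
stopifnot(
  nrow(redeemed_lines) ==
    nrow(distinct(redeemed_lines, household_id, campaign_id, coupon_upc, redemption_date))
)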
# =========================
# Step 3) Attach product metadata and anchor category
# =========================
redeemed_lines <- redeemed_lines %>%
left_join(
products %>% select(product_id, product_category, brand),
by = "product_id"
) %>%
filter(!is.na(product_category))
anchor_tbl <- redeemed_lines %>%
group_by(household_id, product_category) %>%
summarise(anchor_date = min(redemption_date), .groups = "drop")
# =========================
# Step 4) Extract Before/After windows
# - Before = anchor_date - 28 days through anchor_date - 1 day
# - After  = anchor_date through anchor_date + 7 days
# =========================
tx_with_meta <- transactions %>%
left_join(products %>% select(product_id, product_category, brand), by = "product_id") %>%
filter(!is.na(product_category), !is.na(brand)) %>%
select(household_id, transaction_date, product_category, brand, sales_value, paid)
before_after_long <- anchor_tbl %>%
inner_join(tx_with_meta, by = c("household_id", "product_category")) %>%
mutate(
period = case_when(
transaction_date >= anchor_date - days(28) & transaction_date <= anchor_date - days(1) ~ "before",
transaction_date >= anchor_date & transaction_date <= anchor_date + days(7) ~ "after",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(period))
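# Optional peek: confirm both windows are populated after restricting to the
# -28..-1 / 0..+7 day ranges.
before_after_long %>% count(period)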
# =========================
# Step 5) Compute private label share
# =========================
share_by_hh_cat_period <- before_after_long %>%
mutate(
brand_lc = stringr::str_to_lower(as.character(brand)),
is_private = case_when(
brand_lc %in% c("private", "private label", "store brand", "prvt") ~ TRUE,
brand_lc %in% c("national") ~ FALSE,
TRUE ~ FALSE
)
) %>%
group_by(household_id, product_category, period) %>%
summarise(
paid_total = sum(paid, na.rm = TRUE),
paid_private = sum(if_else(is_private, paid, 0), na.rm = TRUE),
.groups = "drop_last"
) %>%
mutate(
share_private = if_else(paid_total > 0, paid_private / paid_total, NA_real_)
) %>%
ungroup()
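# Optional sanity check: wherever the share is defined it must be a proportion in
# [0, 1], since paid >= 0 implies paid_private can never exceed paid_total.
stopifnot(all(between(share_by_hh_cat_period$share_private, 0, 1), na.rm = TRUE))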
# =========================
# Step 6) Pair Before/After and compute delta
# =========================
delta_by_hh_cat <- share_by_hh_cat_period %>%
select(household_id, product_category, period, share_private) %>%
pivot_wider(names_from = period, values_from = share_private, values_fill = NA_real_) %>%
filter(!is.na(before), !is.na(after)) %>%
mutate(delta_share = after - before)
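# Optional sanity check: the before/after change is a difference of two proportions,
# so it must lie in [-1, 1].
stopifnot(all(abs(delta_by_hh_cat$delta_share) <= 1))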
category_summary <- delta_by_hh_cat %>%
group_by(product_category) %>%
summarise(
n_households = n(),
mean_delta = mean(delta_share, na.rm = TRUE),
median_delta = median(delta_share, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(mean_delta))
print(category_summary, n = 20)
## # A tibble: 122 × 4
##    product_category            n_households mean_delta median_delta
##    <chr>                              <int>      <dbl>        <dbl>
##  1 FROZEN MEAT                            1       0.75         0.75
##  2 POTATOES                               7      0.338       0.0537
##  3 COFFEE                                 3      0.333            0
##  4 CONDIMENTS/SAUCES                      6      0.302        0.134
##  5 BAKED BREAD/BUNS/ROLLS                14      0.193       0.0378
##  6 FRZN VEGETABLE/VEG DSH                12      0.149            0
##  7 DRY BN/VEG/POTATO/RICE                 4      0.139        0.123
##  8 LUNCHMEAT                              7      0.122            0
##  9 DRY NOODLES/PASTA                      6      0.106            0
## 10 MOLASSES/SYRUP/PANCAKE MIXS            2     0.0906       0.0906
## 11 PORK                                   9     0.0765            0
## 12 TOMATOES                               1     0.0627       0.0627
## 13 REFRIGERATED                           1     0.0537       0.0537
## 14 SOFT DRINKS                           28     0.0180            0
## 15 FROZEN PIZZA                          30     0.0165            0
## 16 SALAD MIX                             16     0.0153            0
## 17 BATH TISSUES                          16    0.00888            0
## 18 BEEF                                  39    0.00861            0
## 19 ADULT INCONTINENCE                     1          0            0
## 20 ANTACIDS                               3          0            0
## # ℹ 102 more rows
# =========================
# Step 7) Slopegraph input
# =========================
slope_input <- share_by_hh_cat_period %>%
group_by(product_category, period) %>%
summarise(share_private_avg = mean(share_private, na.rm = TRUE), .groups = "drop") %>%
mutate(period = factor(period, levels = c("before","after"))) %>%
arrange(product_category, period)
# =========================
# Step 8) Visualization (Slopegraph)
# =========================
topN <- 12
top_cats <- category_summary %>% slice_max(n_households, n = topN, with_ties = FALSE) %>% pull(product_category)  # with_ties = FALSE keeps exactly topN categories
plot_df <- slope_input %>%
filter(product_category %in% top_cats) %>%
group_by(product_category) %>%
mutate(delta = share_private_avg[period=="after"] - share_private_avg[period=="before"]) %>%
ungroup()
ggplot(plot_df, aes(x = period, y = share_private_avg, group = product_category)) +
geom_line(aes(color = delta), linewidth = 1.1, alpha = 0.9) +
geom_point(size = 2.2) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_color_gradient2(low = "#b2182b", mid = "grey50", high = "#2166ac", midpoint = 0,
                        labels = scales::percent_format(accuracy = 1),
                        name = "Δ Private\nShare") +
labs(
title = "Do coupons trigger brand switching?",
subtitle = "Private label share before (-28~-1 days) vs. after (0~+7 days) coupon redemption\nTop 12 categories by household sample size",
x = NULL, y = "Average Private Label Share",
caption = "Data: completejourney (coupon_redemptions × coupons × products × transactions)\nDefinition: paid = sales_value − coupon_disc"
) +
theme_minimal(base_size = 12) +
theme(
panel.grid.minor = element_blank(),
legend.position = "right",
plot.title = element_text(face = "bold"),
axis.text.x = element_text(face = "bold")
) +
ggrepel::geom_text_repel(
data = plot_df %>% filter(period == "after") %>%
slice_max(delta, n = 3),
aes(label = paste0(product_category, " (", scales::percent(share_private_avg, accuracy = 1), ")")),
nudge_x = 0.15, direction = "y", hjust = 0, segment.alpha = 0.3, size = 3.3, show.legend = FALSE
)

# ========================
# Plot #2: Basket Size by Promotion Presence
# ========================
library(dplyr)
library(tidyr)
library(ggplot2)
library(janitor)
PROMO_FULL <- TRUE
# ---- 1) Promotions: build with a guaranteed is_promoted ----
promotions_raw <- if (PROMO_FULL) {
completejourney::get_promotions()
} else {
completejourney::promotions_sample
}
promotions_tbl <- promotions_raw %>%
clean_names() %>%
mutate(
product_id = as.character(product_id),
store_id = as.character(store_id),
week = as.integer(week),
display_location = as.character(display_location),
mailer_location = as.character(mailer_location),
# NOT promoted only when shelf-only (display 0/A) AND not on mailer (0)
is_promoted = if_else(
display_location %in% c("0","A") & mailer_location == "0",
FALSE, TRUE, missing = FALSE
)
) %>%
  # collapse to one row per product–store–week so the transactions join below
  # cannot duplicate lines if a key carries both promoted and non-promoted records
  group_by(product_id, store_id, week) %>%
  summarise(is_promoted = any(is_promoted), .groups = "drop")
# sanity check (optional)
# stopifnot("is_promoted" %in% names(promotions_tbl))
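# Optional sanity check: each product–store–week key should appear exactly once,
# so the left_join below cannot duplicate transaction lines.
stopifnot(
  nrow(promotions_tbl) ==
    nrow(distinct(promotions_tbl, product_id, store_id, week))
)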
# ---- 2) Transactions: ensure key types match ----
tx_keys <- transactions %>%
mutate(
product_id = as.character(product_id),
store_id = as.character(store_id),
week = as.integer(week)
) %>%
select(basket_id, store_id, week, product_id, quantity, sales_value)
# ---- 3) Join and flag ----
tx_flagged <- tx_keys %>%
left_join(promotions_tbl, by = c("product_id","store_id","week")) %>%
mutate(is_promoted = if_else(is.na(is_promoted), FALSE, is_promoted))
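# Optional sanity check: flagging promotions must not change the number of
# transaction lines; if it does, promotions_tbl is not unique on its keys.
stopifnot(nrow(tx_flagged) == nrow(tx_keys))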
# ---- 4) Basket-level aggregation ----
basket_stats <- tx_flagged %>%
group_by(basket_id) %>%
summarise(
basket_items = sum(quantity, na.rm = TRUE),
basket_spend = sum(sales_value, na.rm = TRUE),
any_promoted = any(is_promoted),
.groups = "drop"
)
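# Optional peek: group sizes behind the violins below.
basket_stats %>% count(any_promoted)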
plot_long <- basket_stats %>%
pivot_longer(cols = c(basket_items, basket_spend),
names_to = "metric", values_to = "value") %>%
mutate(
any_promoted = if_else(any_promoted, "With promotion", "No promotion"),
metric = recode(metric,
basket_items = "Items per basket",
basket_spend = "Spend per basket ($)")
)
# ---- 5) Figure ----
ggplot(plot_long, aes(x = any_promoted, y = value, fill = any_promoted)) +
geom_violin(trim = TRUE, alpha = 0.7, linewidth = 0.2) +
geom_boxplot(width = 0.18, outlier.shape = 21, alpha = 0.6) +
stat_summary(fun = median, geom = "point", shape = 23, size = 3,
fill = "white", color = "black") +
facet_wrap(~ metric, scales = "free_y") +
scale_fill_manual(values = c("#a6bddb", "#2b8cbe"), guide = "none") +
labs(
title = "Do promotions enlarge baskets?",
subtitle = "Distribution of basket size and spend by presence of any promoted line\n(Transactions × Promotions; product–store–week matching)",
x = NULL, y = NULL,
caption = "Promotions collapsed to unique product–store–week. Violin = distribution; box = IQR & median."
) +
theme_minimal(base_size = 12) +
theme(
panel.grid.minor = element_blank(),
strip.text = element_text(face = "bold"),
axis.text.x = element_text(face = "bold")
)

# ========================
# Plot #3: Multi-SKU coupon concentration
# ========================
library(dplyr)
library(tidyr)
library(ggplot2)
library(janitor)
# ---- Coupon → Product pool (eligible SKUs per coupon) ----
coupon_pool <- coupons %>%
clean_names() %>%
distinct(campaign_id, coupon_upc, product_id) %>%
group_by(campaign_id, coupon_upc) %>%
summarise(pool_size = n_distinct(product_id), .groups = "drop")
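# Optional peek: how large are the eligible SKU pools? Gives a feel for how common
# multi-SKU coupons are before plotting.
summary(coupon_pool$pool_size)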
# ---- Redeemed product on the redemption day (realized choice) ----
# Pre-aggregate to avoid duplicate lines (household × product × date)
tx_day <- transactions %>%
group_by(household_id, product_id, transaction_date) %>%
summarise(
qty = sum(quantity, na.rm = TRUE),
paid = sum(pmax(sales_value - coalesce(coupon_disc, 0), 0), na.rm = TRUE),
.groups = "drop"
)
redeemed_candidates <- coupon_redemptions %>%
  clean_names() %>%
  mutate(redemption_date = lubridate::as_date(redemption_date)) %>%
  inner_join(
    coupons %>% clean_names() %>% distinct(campaign_id, coupon_upc, product_id),
    by = c("campaign_id", "coupon_upc"),
    relationship = "many-to-many"  # the coupon-to-product mapping is many-to-many by design
  ) %>%
  inner_join(
    tx_day %>% select(household_id, product_id, transaction_date, qty, paid),
    by = c("household_id",
           "product_id" = "product_id",
           "redemption_date" = "transaction_date")
  )
# Pick one realized product per redemption:
# keep the line with the highest paid amount (with_ties = FALSE breaks ties by row order)
redeemed_realized <- redeemed_candidates %>%
mutate(score = coalesce(paid, 0)) %>%
group_by(household_id, campaign_id, coupon_upc, redemption_date) %>%
slice_max(order_by = score, n = 1, with_ties = FALSE) %>%
ungroup() %>%
select(household_id, campaign_id, coupon_upc, redemption_date, product_id)
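# Optional sanity check: exactly one realized product should remain per
# household × campaign × coupon × redemption date after the tie-break.
stopifnot(
  nrow(redeemed_realized) ==
    nrow(distinct(redeemed_realized, household_id, campaign_id, coupon_upc, redemption_date))
)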
# Count realized choices per coupon
coupon_choice_counts <- redeemed_realized %>%
count(campaign_id, coupon_upc, product_id, name = "chosen_n")
# ---- Concentration metrics per coupon ----
conc_tbl <- coupon_choice_counts %>%
group_by(campaign_id, coupon_upc) %>%
summarise(
total_redemptions = sum(chosen_n),
top1_chosen = max(chosen_n),
top1_share = top1_chosen / total_redemptions,
.groups = "drop"
) %>%
inner_join(coupon_pool, by = c("campaign_id", "coupon_upc"))
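# Optional sanity check: Top-1 share is a proportion of redemptions, so it must
# lie in (0, 1].
stopifnot(all(conc_tbl$top1_share > 0 & conc_tbl$top1_share <= 1))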
# (Optional) label with the category/brand of the Top-1 product
top1_product <- coupon_choice_counts %>%
group_by(campaign_id, coupon_upc) %>%
slice_max(order_by = chosen_n, n = 1, with_ties = FALSE) %>%
ungroup() %>%
left_join(products %>% select(product_id, product_category, brand),
by = "product_id")
conc_plot_df <- conc_tbl %>%
left_join(top1_product %>% select(campaign_id, coupon_upc, product_category, brand),
by = c("campaign_id", "coupon_upc")) %>%
mutate(
    # breaks start at 0 so single-SKU coupons fall in "≤5" instead of NA
    # (bucket is unused in the plot below but kept for optional faceting)
    pool_size_bucket = cut(pool_size,
                           breaks = c(0, 5, 10, 25, 50, 100, Inf),
                           right = TRUE,
                           labels = c("≤5","6–10","11–25","26–50","51–100",">100"))
)
# ---- Visualization ----
ggplot(conc_plot_df, aes(x = pool_size, y = top1_share)) +
geom_point(aes(size = total_redemptions), alpha = 0.45) +
geom_smooth(method = "loess", se = FALSE, linewidth = 1, color = "#2b8cbe") +
scale_x_log10(labels = scales::comma_format()) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
scale_size_continuous(name = "Total redemptions") +
labs(
title = "Multi-SKU coupons: how concentrated are actual choices?",
subtitle = "Each point is a coupon: x = eligible SKU pool size (log scale), y = Top-1 chosen share among redemptions\nJoined tables: coupon_redemptions × coupons × transactions × products",
x = "Eligible SKU pool size (log10 scale)",
y = "Top-1 chosen share",
caption = "Top-1 share = max(product redemption count)/total redemptions per coupon.\nBubble size = total redemptions."
) +
theme_minimal(base_size = 12) +
theme(
panel.grid.minor = element_blank(),
plot.title = element_text(face = "bold")
)
## `geom_smooth()` using formula = 'y ~ x'
