Week5 Lab Sanghyun Hyun

# =========================
# Packages
# =========================
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(completejourney)

## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.

library(ggrepel)

# =========================
# Data: completejourney only
# =========================
USE_FULL <- TRUE  

coupons            <- completejourney::coupons            %>% clean_names()
coupon_redemptions <- completejourney::coupon_redemptions %>% clean_names()
products           <- completejourney::products           %>% clean_names()

transactions <- if (USE_FULL) {
  completejourney::get_transactions() %>% clean_names()
} else {
  completejourney::transactions_sample %>% clean_names()
}

# =========================
# Prepare basic fields
# =========================
transactions <- transactions %>%
  mutate(
    transaction_date = as_date(transaction_timestamp),
    paid = pmax(sales_value - coalesce(coupon_disc, 0), 0)  # Consumer paid price
  )

coupon_redemptions <- coupon_redemptions %>%
  mutate(redemption_date = as_date(redemption_date))

# =========================
# Step 1) Build safe joins
# - coupons: coupon-to-product mapping (deduplicated)
# - transactions: pre-aggregate to household × product × date
# =========================
coupons_map <- coupons %>% 
  distinct(campaign_id, coupon_upc, product_id)

tx_day <- transactions %>%
  group_by(household_id, product_id, transaction_date) %>%
  summarise(
    quantity           = sum(quantity, na.rm = TRUE),
    sales_value        = sum(sales_value, na.rm = TRUE),
    retail_disc        = sum(retail_disc, na.rm = TRUE),
    coupon_disc        = sum(coupon_disc, na.rm = TRUE),
    coupon_match_disc  = sum(coupon_match_disc, na.rm = TRUE),
    paid               = sum(pmax(sales_value - coalesce(coupon_disc, 0), 0), na.rm = TRUE),
    .groups = "drop"
  )

# =========================
# Step 2) Identify anchor redemption lines
# - match same household, same product, same day
# - if multiple matches: pick line with highest coupon_disc, then sales_value
# =========================
redeemed_candidates <- coupon_redemptions %>%
  inner_join(coupons_map, by = c("campaign_id", "coupon_upc")) %>%
  inner_join(
    tx_day %>% select(household_id, product_id, transaction_date, quantity,
                      sales_value, coupon_disc, coupon_match_disc, paid),
    by = c("household_id",
           "product_id" = "product_id",
           "redemption_date" = "transaction_date")
  )

## Warning in inner_join(., coupons_map, by = c("campaign_id", "coupon_upc")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 90524 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

redeemed_lines <- redeemed_candidates %>%
  mutate(score1 = coalesce(coupon_disc, 0),
         score2 = coalesce(sales_value, 0)) %>%
  group_by(household_id, campaign_id, coupon_upc, redemption_date) %>%
  slice_max(order_by = score1, n = 1, with_ties = TRUE) %>%
  slice_max(order_by = score2, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(household_id, redemption_date, campaign_id, coupon_upc,
         product_id, quantity, sales_value, coupon_disc, coupon_match_disc, paid)

# =========================
# Step 3) Attach product metadata and anchor category
# =========================
redeemed_lines <- redeemed_lines %>%
  left_join(
    products %>% select(product_id, product_category, brand),
    by = "product_id"
  ) %>%
  filter(!is.na(product_category))

anchor_tbl <- redeemed_lines %>%
  group_by(household_id, product_category) %>%
  summarise(anchor_date = min(redemption_date), .groups = "drop")

# =========================
# Step 4) Extract Before/After windows
# - Before = anchor_date -28 ~ -1 days
# - After  = anchor_date ~ +7 days
# =========================
tx_with_meta <- transactions %>%
  left_join(products %>% select(product_id, product_category, brand), by = "product_id") %>%
  filter(!is.na(product_category), !is.na(brand)) %>%
  select(household_id, transaction_date, product_category, brand, sales_value, paid)

before_after_long <- anchor_tbl %>%
  inner_join(tx_with_meta, by = c("household_id", "product_category")) %>%
  mutate(
    period = case_when(
      transaction_date >= anchor_date - days(28) & transaction_date <= anchor_date - days(1) ~ "before",
      transaction_date >= anchor_date & transaction_date <= anchor_date + days(7)           ~ "after",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(period))

# =========================
# Step 5) Compute private label share
# =========================
share_by_hh_cat_period <- before_after_long %>%
  mutate(
    brand_lc   = stringr::str_to_lower(as.character(brand)),
    is_private = case_when(
      brand_lc %in% c("private", "private label", "store brand", "prvt") ~ TRUE,
      brand_lc %in% c("national") ~ FALSE,
      TRUE ~ FALSE
    )
  ) %>%
  group_by(household_id, product_category, period) %>%
  summarise(
    paid_total   = sum(paid, na.rm = TRUE),
    paid_private = sum(if_else(is_private, paid, 0), na.rm = TRUE),
    .groups = "drop_last"
  ) %>%
  mutate(
    share_private = if_else(paid_total > 0, paid_private / paid_total, NA_real_)
  ) %>%
  ungroup()

# =========================
# Step 6) Pair Before/After and compute delta
# =========================
delta_by_hh_cat <- share_by_hh_cat_period %>%
  select(household_id, product_category, period, share_private) %>%
  pivot_wider(names_from = period, values_from = share_private, values_fill = NA_real_) %>%
  filter(!is.na(before), !is.na(after)) %>%
  mutate(delta_share = after - before)

category_summary <- delta_by_hh_cat %>%
  group_by(product_category) %>%
  summarise(
    n_households = n(),
    mean_delta   = mean(delta_share, na.rm = TRUE),
    median_delta = median(delta_share, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_delta))

print(category_summary, n = 20)

## # A tibble: 122 × 4
##    product_category            n_households mean_delta median_delta
##    <chr>                              <int>      <dbl>        <dbl>
##  1 FROZEN MEAT                            1    0.75          0.75  
##  2 POTATOES                               7    0.338         0.0537
##  3 COFFEE                                 3    0.333         0     
##  4 CONDIMENTS/SAUCES                      6    0.302         0.134 
##  5 BAKED BREAD/BUNS/ROLLS                14    0.193         0.0378
##  6 FRZN VEGETABLE/VEG DSH                12    0.149         0     
##  7 DRY BN/VEG/POTATO/RICE                 4    0.139         0.123 
##  8 LUNCHMEAT                              7    0.122         0     
##  9 DRY NOODLES/PASTA                      6    0.106         0     
## 10 MOLASSES/SYRUP/PANCAKE MIXS            2    0.0906        0.0906
## 11 PORK                                   9    0.0765        0     
## 12 TOMATOES                               1    0.0627        0.0627
## 13 REFRIGERATED                           1    0.0537        0.0537
## 14 SOFT DRINKS                           28    0.0180        0     
## 15 FROZEN PIZZA                          30    0.0165        0     
## 16 SALAD MIX                             16    0.0153        0     
## 17 BATH TISSUES                          16    0.00888       0     
## 18 BEEF                                  39    0.00861       0     
## 19 ADULT INCONTINENCE                     1    0             0     
## 20 ANTACIDS                               3    0             0     
## # ℹ 102 more rows

# =========================
# Step 7) Slopegraph input
# =========================
slope_input <- share_by_hh_cat_period %>%
  group_by(product_category, period) %>%
  summarise(share_private_avg = mean(share_private, na.rm = TRUE), .groups = "drop") %>%
  mutate(period = factor(period, levels = c("before","after"))) %>%
  arrange(product_category, period)

# =========================
# Step 8) Visualization (Slopegraph)
# =========================
topN <- 12
top_cats <- category_summary %>% slice_max(n_households, n = topN) %>% pull(product_category)

plot_df <- slope_input %>%
  filter(product_category %in% top_cats) %>%
  group_by(product_category) %>%
  mutate(delta = share_private_avg[period=="after"] - share_private_avg[period=="before"]) %>%
  ungroup()

ggplot(plot_df, aes(x = period, y = share_private_avg, group = product_category)) +
  geom_line(aes(color = delta), linewidth = 1.1, alpha = 0.9) +
  geom_point(size = 2.2) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_color_gradient2(low = "#b2182b", mid = "grey50", high = "#2166ac", midpoint = 0,
                        name = "Δ Private\nShare (pp)") +
  labs(
    title = "Do coupons trigger brand switching?",
    subtitle = "Private label share before (-28~-1 days) vs. after (0~+7 days) coupon redemption\nTop 12 categories by household sample size",
    x = NULL, y = "Average Private Label Share",
    caption = "Data: completejourney (coupon_redemptions × coupons × products × transactions)\nDefinition: paid = sales_value − coupon_disc"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.minor = element_blank(),
    legend.position = "right",
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(face = "bold")
  ) +
  ggrepel::geom_text_repel(
    data = plot_df %>% filter(period == "after") %>%
      slice_max(delta, n = 3),
    aes(label = paste0(product_category, " (", scales::percent(share_private_avg, accuracy = 1), ")")),
    nudge_x = 0.15, direction = "y", hjust = 0, segment.alpha = 0.3, size = 3.3, show.legend = FALSE
  )

# ========================
# Plot #2: Basket Size by Promotion Presence
# ========================
library(dplyr)
library(tidyr)
library(ggplot2)
library(janitor)

PROMO_FULL <- TRUE  

# ---- 1) Promotions: build with a guaranteed is_promoted ----
promotions_raw <- if (PROMO_FULL) {
  completejourney::get_promotions()
} else {
  completejourney::promotions_sample
}

promotions_tbl <- promotions_raw %>%
  clean_names() %>%
  mutate(
    product_id      = as.character(product_id),
    store_id        = as.character(store_id),
    week            = as.integer(week),
    display_location = as.character(display_location),
    mailer_location  = as.character(mailer_location),
    # NOT promoted only when shelf-only (display 0/A) AND not on mailer (0)
    is_promoted = if_else(
      display_location %in% c("0","A") & mailer_location == "0",
      FALSE, TRUE, missing = FALSE
    )
  ) %>%
  # keep only what we need AFTER creating is_promoted
  distinct(product_id, store_id, week, is_promoted)

# sanity check (optional)
# stopifnot("is_promoted" %in% names(promotions_tbl))

# ---- 2) Transactions: ensure key types match ----
tx_keys <- transactions %>%
  mutate(
    product_id = as.character(product_id),
    store_id   = as.character(store_id),
    week       = as.integer(week)
  ) %>%
  select(basket_id, store_id, week, product_id, quantity, sales_value)

# ---- 3) Join and flag ----
tx_flagged <- tx_keys %>%
  left_join(promotions_tbl, by = c("product_id","store_id","week")) %>%
  mutate(is_promoted = if_else(is.na(is_promoted), FALSE, is_promoted))

# ---- 4) Basket-level aggregation ----
basket_stats <- tx_flagged %>%
  group_by(basket_id) %>%
  summarise(
    basket_items = sum(quantity, na.rm = TRUE),
    basket_spend = sum(sales_value, na.rm = TRUE),
    any_promoted = any(is_promoted),
    .groups = "drop"
  )

plot_long <- basket_stats %>%
  pivot_longer(cols = c(basket_items, basket_spend),
               names_to = "metric", values_to = "value") %>%
  mutate(
    any_promoted = if_else(any_promoted, "With promotion", "No promotion"),
    metric = recode(metric,
                    basket_items = "Items per basket",
                    basket_spend = "Spend per basket ($)")
  )

# ---- 5) Figure ----
ggplot(plot_long, aes(x = any_promoted, y = value, fill = any_promoted)) +
  geom_violin(trim = TRUE, alpha = 0.7, linewidth = 0.2) +
  geom_boxplot(width = 0.18, outlier.shape = 21, alpha = 0.6) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 3,
               fill = "white", color = "black") +
  facet_wrap(~ metric, scales = "free_y") +
  scale_fill_manual(values = c("#a6bddb", "#2b8cbe"), guide = "none") +
  labs(
    title = "Do promotions enlarge baskets?",
    subtitle = "Distribution of basket size and spend by presence of any promoted line\n(Transactions × Promotions; product–store–week matching)",
    x = NULL, y = NULL,
    caption = "Promotions collapsed to unique product–store–week. Violin = distribution; box = IQR & median."
  ) +
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.minor = element_blank(),
    strip.text = element_text(face = "bold"),
    axis.text.x = element_text(face = "bold")
  )

# ========================
# Plot #3: Multi-SKU coupon concentration
# ========================

library(dplyr)
library(tidyr)
library(ggplot2)
library(janitor)

# ---- Coupon → Product pool (eligible SKUs per coupon) ----
coupon_pool <- coupons %>%
  clean_names() %>%
  distinct(campaign_id, coupon_upc, product_id) %>%
  group_by(campaign_id, coupon_upc) %>%
  summarise(pool_size = n_distinct(product_id), .groups = "drop")

# ---- Redeemed product on the redemption day (realized choice) ----
# Pre-aggregate to avoid duplicate lines (household × product × date)
tx_day <- transactions %>%
  group_by(household_id, product_id, transaction_date) %>%
  summarise(
    qty  = sum(quantity, na.rm = TRUE),
    paid = sum(pmax(sales_value - coalesce(coupon_disc, 0), 0), na.rm = TRUE),
    .groups = "drop"
  )

redeemed_candidates <- coupon_redemptions %>%
  clean_names() %>%
  mutate(redemption_date = lubridate::as_date(redemption_date)) %>%
  inner_join(
    coupons %>% clean_names() %>% distinct(campaign_id, coupon_upc, product_id),
    by = c("campaign_id", "coupon_upc")
  ) %>%
  inner_join(
    tx_day %>% select(household_id, product_id, transaction_date, qty, paid),
    by = c("household_id",
           "product_id" = "product_id",
           "redemption_date" = "transaction_date")
  )

## Warning in inner_join(., coupons %>% clean_names() %>% distinct(campaign_id, : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 90524 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Pick one realized product per redemption:
# 1) highest paid (ties broken deterministically by first)
redeemed_realized <- redeemed_candidates %>%
  mutate(score = coalesce(paid, 0)) %>%
  group_by(household_id, campaign_id, coupon_upc, redemption_date) %>%
  slice_max(order_by = score, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(household_id, campaign_id, coupon_upc, redemption_date, product_id)

# Count realized choices per coupon
coupon_choice_counts <- redeemed_realized %>%
  count(campaign_id, coupon_upc, product_id, name = "chosen_n")

# ---- Concentration metrics per coupon ----
conc_tbl <- coupon_choice_counts %>%
  group_by(campaign_id, coupon_upc) %>%
  summarise(
    total_redemptions = sum(chosen_n),
    top1_chosen       = max(chosen_n),
    top1_share        = top1_chosen / total_redemptions,
    .groups = "drop"
  ) %>%
  inner_join(coupon_pool, by = c("campaign_id", "coupon_upc"))

# (Optional) label with the category/brand of the Top-1 product
top1_product <- coupon_choice_counts %>%
  group_by(campaign_id, coupon_upc) %>%
  slice_max(order_by = chosen_n, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  left_join(products %>% select(product_id, product_category, brand),
            by = "product_id")

conc_plot_df <- conc_tbl %>%
  left_join(top1_product %>% select(campaign_id, coupon_upc, product_category, brand),
            by = c("campaign_id", "coupon_upc")) %>%
  mutate(
    pool_size_bucket = cut(pool_size,
                           breaks = c(1, 5, 10, 25, 50, 100, Inf),
                           right = TRUE,
                           labels = c("≤5","6–10","11–25","26–50","51–100",">100"))
  )

# ---- Visualization ----
ggplot(conc_plot_df, aes(x = pool_size, y = top1_share)) +
  geom_point(aes(size = total_redemptions), alpha = 0.45) +
  geom_smooth(method = "loess", se = FALSE, linewidth = 1, color = "#2b8cbe") +
  scale_x_log10(labels = scales::comma_format()) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_size_continuous(name = "Total redemptions") +
  labs(
    title = "Multi-SKU coupons: how concentrated are actual choices?",
    subtitle = "Each point is a coupon: x = eligible SKU pool size (log scale), y = Top-1 chosen share among redemptions\nJoined tables: coupon_redemptions × coupons × transactions × products",
    x = "Eligible SKU pool size (log10 scale)",
    y = "Top-1 chosen share",
    caption = "Top-1 share = max(product redemption count)/total redemptions per coupon.\nBubble size = total redemptions."
  ) +
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold")
  )

## `geom_smooth()` using formula = 'y ~ x'