1 Introduction

Business question. Are health‑oriented products (e.g., organic, low‑fat, gluten‑free) growing faster among specific demographic segments, and do promotions/coupons amplify this growth—suggesting segments and timings for targeted expansion?

Why it matters. If we can identify who is shifting into health products—and when offers move the needle—we can target promotions, optimize assortment, and increase basket share of high‑margin categories.

Datasets used. Transactions, products, demographics, promotions, coupons, coupon redemptions, campaigns.

2 Libraries

library(tidyverse)      
library(lubridate)    
library(janitor)      
library(scales)       
library(glue)          
library(gt)             
library(patchwork)      

library(completejourney)

3 Import & audit

trans <- get_transactions()
prods <- products
house <- demographics
promos <- get_promotions()
cpn <- coupons
redeem <- coupon_redemptions
camps <- campaigns
camp_desc <- campaign_descriptions

# Clean names
trans <- clean_names(trans)
prods <- clean_names(prods)
house <- clean_names(house)
promos <- clean_names(promos)
cpn <- clean_names(cpn)
redeem <- clean_names(redeem)
camps <- clean_names(camps)
camp_desc <- clean_names(camp_desc)

# Timestamp + calendar fields
trans <- trans %>%
  mutate(
    transaction_timestamp = as.POSIXct(transaction_timestamp, tz = "UTC"),
    date  = as.Date(transaction_timestamp),
    week  = lubridate::floor_date(date, "week", week_start = 1),
    month = lubridate::floor_date(date, "month"),
    dow   = wday(date, label = TRUE, abbr = TRUE, week_start = 1)
  )

by_time <- if (identical(params$time_granularity, "week")) "week" else "month"

# Quick ranges & sizes
list(
  rows = list(
    transactions = nrow(trans), products = nrow(prods), households = nrow(house),
    promotions = nrow(promos), coupons = nrow(cpn), redemptions = nrow(redeem)
  ),
  date_range = trans %>% summarise(min = min(date), max = max(date))
)

## $rows
## $rows$transactions
## [1] 1469307
## 
## $rows$products
## [1] 92331
## 
## $rows$households
## [1] 801
## 
## $rows$promotions
## [1] 20940529
## 
## $rows$coupons
## [1] 116204
## 
## $rows$redemptions
## [1] 2102
## 
## 
## $date_range
## # A tibble: 1 × 2
##   min        max       
##   <date>     <date>    
## 1 2017-01-01 2018-01-01

4 Define health-oriented items

# Consolidate candidate text fields
prods2 <- prods %>%
  mutate(
    across(everything(), ~.x),
    text = paste(
      coalesce(brand, ""),
      coalesce(product_category, ""),
      coalesce(product_type, ""),
      coalesce(department, ""),
      sep = " "
    ) %>% str_to_lower()
  )

is_health_regex <- params$health_regex %>% tolower()

# Health flag via regex + optional taxonomy nudge
prods2 <- prods2 %>%
  mutate(
    is_health = str_detect(text, is_health_regex) |
      department %in% c("PRODUCE", "NATURAL/ORGANIC", "HEALTH & WELLNESS")
  )

# Keep a dictionary
health_dict <- prods2 %>%
  dplyr::select(dplyr::any_of(c(
    "product_id", "department", "brand",
    "product_category", "product_type",
    "is_health"
  )))

Reproducibility. Our is_health definition is transparent and editable via params$health_regex and the department whitelist.

5 Core join & enrich

# Join transactions to products & households
trx <- trans %>%
  left_join(health_dict, by = "product_id") %>%
  left_join(house, by = "household_id") %>%
  mutate(
    spend = (quantity * (retail_disc + coupon_disc + coupon_match_disc) + sales_value) %>% as.numeric(),
    spend = if_else(is.na(spend) | spend == 0, sales_value, spend),
    price_paid = if_else(quantity > 0, spend/quantity, NA_real_),
    level = dplyr::case_when(
  params$basket_level == "department" ~ department,
  params$basket_level %in% c("commodity", "product_category") ~ product_category,
  params$basket_level %in% c("sub_commodity", "product_type") ~ product_type,
  TRUE ~ product_category
)
)

# Segment features (coarse bands)
trx <- trx %>%
  mutate(
    income_band = fct_collapse(as.factor(income),
      Low = c("Under 15K","15-24K","25-34K"),
      Mid = c("35-49K","50-74K"),
      High = c("75-99K","100-124K","125-149K","150K+"),
      other_level = "Unknown"
    ),
    age_band = fct_collapse(as.factor(age),
      "<35" = c("19-24","25-34"),
      "35-54" = c("35-44","45-54"),
      "55+" = c("55-64","65+"),
      other_level = "Unknown"
    ),
    hh_size_band = case_when(
      is.na(household_size) ~ "Unknown",
      household_size <= 2 ~ "1-2",
      household_size <= 4 ~ "3-4",
      TRUE ~ "5+"
    ),
    has_kids = if_else(!is.na(kids_count) & kids_count > 0, "Kids in HH", "No kids")
  )

6 Health share by segment over time

# step 1: sum at is_health level
agg_core <- trx %>%
  group_by(.data[[by_time]], income_band, age_band, hh_size_band, has_kids, is_health) %>%
  summarise(revenue = sum(sales_value, na.rm = TRUE), .groups = "drop")

# step 2: pivot to get health/nonhealth side by side
health_ts <- agg_core %>%
  tidyr::pivot_wider(
    names_from = is_health, values_from = revenue,
    names_prefix = "is_health_", values_fill = 0
  ) %>%
  mutate(
    health_rev    = is_health_TRUE,
    nonhealth_rev = is_health_FALSE,
    total_rev     = health_rev + nonhealth_rev,
    health_share  = if_else(total_rev > 0, health_rev / total_rev, NA_real_)
  ) %>%
  select(.data[[by_time]], income_band, age_band, hh_size_band, has_kids,
         health_rev, nonhealth_rev, total_rev, health_share)

p_income <- health_ts %>%
  group_by(.data[[by_time]], income_band) %>%
  summarise(health_share = mean(health_share, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = .data[[by_time]], y = health_share, color = income_band)) +
  geom_line() +
  scale_y_continuous(labels = percent_format()) +
  labs(title = "Health-oriented share over time by income band", x = NULL, y = "Health share of revenue") +
  theme(legend.position = "bottom")

p_kids <- health_ts %>%
  group_by(.data[[by_time]], has_kids) %>%
  summarise(health_share = mean(health_share, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = .data[[by_time]], y = health_share, color = has_kids)) +
  geom_line() +
  scale_y_continuous(labels = percent_format()) +
  labs(title = "Health-oriented share over time by household kids", x = NULL, y = "Health share of revenue") +
  theme(legend.position = "bottom")

p_income / p_kids

6.1 Rank segments by growth

# Simple linear trend slope of health_share ~ time index per segment
seg_ts <- health_ts %>%
  drop_na(health_share) %>%
  group_by(income_band, age_band, hh_size_band, has_kids) %>%
  arrange(.data[[by_time]]) %>%
  mutate(t = row_number())

trend_tbl <- seg_ts %>%
  do({
    m <- lm(health_share ~ t, data = .)
    tibble(
      n_periods = nrow(.),
      avg_share = mean(.$health_share, na.rm = TRUE),
      slope_per_period = coef(m)[["t"]],
      r2 = summary(m)$r.squared
    )
  }) %>%
  ungroup() %>%
  arrange(desc(slope_per_period))

gt(head(trend_tbl, 12)) %>%
  fmt_percent(columns = c(avg_share, slope_per_period), decimals = 2) %>%
  tab_header(title = "Fastest-growing segments by health share trend")

income_band	age_band	hh_size_band	has_kids	n_periods	avg_share	slope_per_period	r2
Fastest-growing segments by health share trend
High	<35	1-2	Kids in HH	13	4.31%	0.42%	0.0977656794
Low	<35	1-2	No kids	54	8.50%	0.10%	0.0608576191
Low	55+	3-4	Kids in HH	53	13.04%	0.08%	0.0393267742
Low	55+	1-2	Kids in HH	53	5.47%	0.07%	0.0701190631
High	55+	3-4	Kids in HH	52	9.09%	0.04%	0.0136796745
Low	<35	1-2	Kids in HH	52	6.12%	0.03%	0.0276530882
Mid	<35	1-2	No kids	54	8.12%	0.02%	0.0773661887
Unknown	35-54	1-2	Kids in HH	52	15.21%	0.02%	0.0028603619
Low	35-54	5+	Kids in HH	53	6.97%	0.02%	0.0097252381
Unknown	<35	1-2	No kids	53	10.36%	0.02%	0.0115194863
Unknown	<35	3-4	Kids in HH	53	10.32%	0.01%	0.0010856287
Mid	<35	1-2	Kids in HH	52	6.20%	0.01%	0.0005427847

7 Promotions

# Helper: check columns + coerce flags safely
has_cols <- function(df, cols) all(cols %in% names(df))
flagify <- function(x) { if (!is.null(x)) (!is.na(x) & x != "") else rep(FALSE, nrow(promos)) }

# Build trx_prom depending on promos schema
if (has_cols(promos, c("start_date", "end_date"))) {
  promos2 <- promos %>%
    mutate(
      start_date = as.Date(start_date),
      end_date   = as.Date(end_date),
      # handle string/NA indicators for display/mailer
      display = if ("display" %in% names(.)) flagify(display) else FALSE,
      mailer  = if ("mailer"  %in% names(.)) flagify(mailer)  else FALSE
    ) %>%
    select(product_id, store_id, start_date, end_date, display, mailer)

  if (isTRUE(params$promo_join_by_store)) {
    trx_prom <- trx %>%
      left_join(promos2, by = c("product_id","store_id")) %>%
      mutate(on_promo = !is.na(start_date) & date >= start_date & date <= end_date & (display | mailer))
  } else {
    promos3 <- promos2 %>% select(product_id, start_date, end_date, display, mailer) %>% distinct()
    trx_prom <- trx %>%
      left_join(promos3, by = "product_id") %>%
      mutate(on_promo = !is.na(start_date) & date >= start_date & date <= end_date & (display | mailer))
  }

} else if (has_cols(promos, c("display_location","mailer_location","week"))) {
  promos2 <- promos %>%
    mutate(
      week    = as.Date(week),
      display = flagify(display_location),
      mailer  = flagify(mailer_location)
    ) %>%
    select(product_id, store_id, week, display, mailer)

  if (isTRUE(params$promo_join_by_store)) {
    trx_prom <- trx %>%
      left_join(promos2, by = c("product_id","store_id","week")) %>%
      mutate(on_promo = coalesce(display | mailer, FALSE))
  } else {
    promos3 <- promos2 %>% select(product_id, week, display, mailer) %>% distinct()
    trx_prom <- trx %>%
      left_join(promos3, by = c("product_id","week")) %>%
      mutate(on_promo = coalesce(display | mailer, FALSE))
  }

} else {
  stop("Unknown promotions schema. Columns found: ", paste(names(promos), collapse = ", "))
}

# --- Summaries & viz (works for either schema) ------------------------------
promo_effect <- trx_prom %>%
  group_by(on_promo, is_health) %>%
  summarise(
    revenue = sum(sales_value, na.rm = TRUE),
    qty     = sum(quantity, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(on_promo) %>%
  mutate(share = revenue / sum(revenue))

gt(promo_effect) %>% tab_header(title = "Promo vs. non-promo revenue & share (overall)")

is_health	revenue	qty	share
Promo vs. non-promo revenue & share (overall)
FALSE
FALSE	4207551.64	152718359	9.154733e-01
TRUE	388315.37	204708	8.448913e-02
NA	172.57	41	3.754754e-05

# Segment-specific: weekend emphasis
promo_seg <- trx_prom %>%
  mutate(is_weekend = dow %in% c("Sat","Sun")) %>%
  group_by(is_weekend, income_band, on_promo) %>%
  summarise(
    health_rev = sum(if_else(is_health, sales_value, 0), na.rm = TRUE),
    total_rev  = sum(sales_value, na.rm = TRUE),
    .groups    = "drop"
  ) %>%
  mutate(health_share = if_else(total_rev > 0, health_rev / total_rev, NA_real_))

p_promo <- promo_seg %>%
  ggplot(aes(x = income_band, y = health_share, fill = on_promo)) +
  geom_col(position = position_dodge()) +
  facet_wrap(~is_weekend) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(title = "Health share by income band: promo vs. non-promo (weekday/weekend)",
       x = NULL, y = "Health share") +
  theme(legend.position = "bottom")

p_promo

8 Coupons: redeemers, pre/post change, and segment effects

# Identify coupons tied to health products
cpn2 <- cpn %>% left_join(health_dict, by = "product_id") %>% mutate(is_health_coupon = is_health %in% TRUE)

# First redemption date per household
first_red <- redeem %>%
  left_join(cpn2 %>% select(coupon_upc, is_health_coupon), by = "coupon_upc") %>%
  filter(is_health_coupon %in% TRUE) %>%
  mutate(redemption_date = as.Date(redemption_date)) %>%
  group_by(household_id) %>%
  summarise(first_health_coupon = min(redemption_date), .groups = "drop")

# Compute household-level health share before/after first redemption (4-week windows)
trx_hh <- trx %>% select(household_id, date, sales_value, is_health)

prepost <- first_red %>%
  left_join(trx_hh, by = "household_id") %>%
  mutate(
    rel = as.integer(date - first_health_coupon),
    window = case_when(rel >= -28 & rel < 0 ~ "Pre (−4w)", rel >= 0 & rel <= 28 ~ "Post (+4w)", TRUE ~ NA_character_)
  ) %>%
  filter(!is.na(window)) %>%
  group_by(household_id, window) %>%
  summarise(health_rev = sum(if_else(is_health, sales_value, 0), na.rm=TRUE),
            total_rev = sum(sales_value, na.rm=TRUE), .groups = "drop") %>%
  mutate(health_share = if_else(total_rev>0, health_rev/total_rev, NA_real_))

prepost_summary <- prepost %>%
  group_by(window) %>%
  summarise(avg_health_share = mean(health_share, na.rm = TRUE), n = n_distinct(household_id), .groups = "drop")

gt(prepost_summary) %>%
  fmt_percent(columns = avg_health_share, decimals = 2) %>%
  tab_header(title = "Health coupon redeemers: avg health share pre vs post (4-week windows)")

window	avg_health_share	n
Health coupon redeemers: avg health share pre vs post (4-week windows)
Post (+4w)	9.04%	267
Pre (−4w)	9.09%	265

# Compare redeemers vs non-redeemers within income segments (overall period)
redeemer_flag <- first_red %>% mutate(redeemer = TRUE)

comp_seg <- trx %>%
  left_join(redeemer_flag, by = "household_id") %>%
  mutate(redeemer = coalesce(redeemer, FALSE)) %>%
  group_by(income_band, redeemer, household_id) %>%
  summarise(health_rev = sum(if_else(is_health, sales_value, 0), na.rm=TRUE), total_rev = sum(sales_value, na.rm=TRUE), .groups = "drop") %>%
  mutate(health_share = if_else(total_rev>0, health_rev/total_rev, NA_real_)) %>%
  group_by(income_band, redeemer) %>%
  summarise(avg_health_share = mean(health_share, na.rm=TRUE), n_hh = n_distinct(household_id), .groups = "drop")

gt(comp_seg) %>% fmt_percent(columns = avg_health_share, decimals = 2) %>% tab_header(title = "Redeemers vs non-redeemers by income band")

income_band	redeemer	avg_health_share	n_hh
Redeemers vs non-redeemers by income band
Low	FALSE	7.89%	176
Low	TRUE	7.57%	36
Mid	FALSE	8.14%	264
Mid	TRUE	8.58%	100
High	FALSE	9.72%	120
High	TRUE	9.89%	48
Unknown	FALSE	11.92%	38
Unknown	TRUE	12.64%	19
NA	FALSE	8.40%	1604
NA	TRUE	8.29%	64

9 Basket composition when a health anchor is present

library(rlang)

# figure out the basket (trip) id column name
basket_col <- dplyr::case_when(
  "basket_id"       %in% names(trx) ~ "basket_id",
  "transaction_id"  %in% names(trx) ~ "transaction_id",
  TRUE ~ NA_character_
)

if (is.na(basket_col)) {
  stop("No basket-level identifier found. Looked for 'basket_id' or 'transaction_id' in trx.")
}

# make sure is_health exists and is logical
trx <- trx %>% mutate(is_health = coalesce(is_health, FALSE))

# items at basket level
basket_items <- trx %>%
  dplyr::select(!!sym(basket_col), item = level, is_health) %>%
  distinct()

# mark baskets that contain at least one health item
health_baskets <- basket_items %>%
  group_by(!!sym(basket_col)) %>%
  summarise(has_health = any(is_health), .groups = "drop")

# attach-rate by category when basket has any health item vs not
attach_tbl <- basket_items %>%
  left_join(health_baskets, by = basket_col) %>%
  group_by(item, has_health) %>%
  summarise(baskets = n_distinct(!!sym(basket_col)), .groups = "drop") %>%
  group_by(item) %>%
  mutate(attach_rate = baskets / sum(baskets)) %>%
  tidyr::pivot_wider(
    names_from = has_health,
    values_from = attach_rate,
    names_prefix = "has_health_",
    values_fill = 0
  ) %>%
  mutate(delta = has_health_TRUE - has_health_FALSE) %>%
  arrange(desc(delta))

gt(head(attach_tbl, 15)) %>%
  tab_header(title = "Categories that over-index when health items are in the basket")

baskets	has_health_FALSE	has_health_TRUE	delta
Categories that over-index when health items are in the basket
PROD-WHS SALES
3	0	1.0000000	1.0000000
PRODUCE
50756	0	1.0000000	1.0000000
FROZEN GROCERY
141	0	0.9929577	0.9929577
TRAVEL & LEISURE
413	0	0.8787234	0.8787234
CHEF SHOPPE
511	0	0.8720137	0.8720137
SEAFOOD
1498	0	0.7520080	0.7520080
MEAT
22411	0	0.7452447	0.7452447
NUTRITION
7211	0	0.7321555	0.7321555
SEAFOOD-PCKGD
3688	0	0.7242734	0.7242734
GM MERCH EXP
28	0	0.7179487	0.7179487
MEAT-PCKGD
21320	0	0.6471785	0.6471785
NA
2803	0	0.6231659	0.6231659
DELI
12351	0	0.6175809	0.6175809
POSTAL CENTER
3	0	0.6000000	0.6000000
PHOTO & VIDEO
10	0	0.5882353	0.5882353

10 Pairwise co-occurrence

items_df <- trx %>%
  dplyr::select(basket_id_var = !!sym(basket_col), item = level) %>%
  filter(!is.na(basket_id_var), !is.na(item)) %>%
  distinct()

n_baskets <- n_distinct(items_df$basket_id_var)

item_ct <- items_df %>% count(item, name = "n_item")

basket_pairs <- items_df %>%
  group_by(basket_id_var) %>%
  summarise(items = list(sort(unique(item))), .groups = "drop") %>%
  mutate(pairs = purrr::map(items, function(x) {
    if (length(x) < 2) return(tibble(lhs = character(), rhs = character()))
    m <- t(combn(x, 2))
    tibble(lhs = m[, 1], rhs = m[, 2])
  })) %>%
  select(-items) %>%
  tidyr::unnest(pairs)

pair_ct <- basket_pairs %>% count(lhs, rhs, name = "n_pair")

# directional rules both ways
rules_tbl <- pair_ct %>%
  left_join(item_ct %>% rename(lhs = item, n_lhs = n_item), by = "lhs") %>%
  left_join(item_ct %>% rename(rhs = item, n_rhs = n_item), by = "rhs") %>%
  mutate(
    support       = n_pair / n_baskets,
    conf_lhs_rhs  = n_pair / n_lhs,
    base_rhs      = n_rhs / n_baskets,
    lift_lhs_rhs  = conf_lhs_rhs / base_rhs
  ) %>%
  transmute(lhs, rhs, n_pair, support, confidence = conf_lhs_rhs, lift = lift_lhs_rhs) %>%
  bind_rows(
    pair_ct %>% rename(lhs = rhs, rhs = lhs) %>%
      left_join(item_ct %>% rename(lhs = item, n_lhs = n_item), by = "lhs") %>%
      left_join(item_ct %>% rename(rhs = item, n_rhs = n_item), by = "rhs") %>%
      mutate(
        support       = n_pair / n_baskets,
        conf_lhs_rhs  = n_pair / n_lhs,
        base_rhs      = n_rhs / n_baskets,
        lift_lhs_rhs  = conf_lhs_rhs / base_rhs
      ) %>%
      transmute(lhs, rhs, n_pair, support, confidence = conf_lhs_rhs, lift = lift_lhs_rhs)
  ) %>%
  arrange(desc(lift))

# screen by your params
opp <- rules_tbl %>%
  filter(support >= params$min_support, lift >= params$min_lift) %>%
  distinct(lhs, rhs, .keep_all = TRUE)

gt(head(opp, 20)) %>%
  fmt_percent(columns = c(support, confidence), decimals = 1) %>%
  tab_header(title = "Top co-occurring pairs by lift (no arules)")

lhs	rhs	n_pair	support	confidence	lift
Top co-occurring pairs by lift (no arules)
SEAFOOD	SEAFOOD-PCKGD	472	0.3%	23.7%	7.244823
SEAFOOD-PCKGD	SEAFOOD	472	0.3%	9.3%	7.244823
CHEF SHOPPE	MEAT	352	0.2%	60.1%	3.109899
MEAT	CHEF SHOPPE	352	0.2%	1.2%	3.109899
NUTRITION	SEAFOOD	389	0.2%	3.9%	3.086965
SEAFOOD	NUTRITION	389	0.2%	19.5%	3.086965
PRODUCE	TRAVEL & LEISURE	405	0.3%	0.8%	2.643220
TRAVEL & LEISURE	PRODUCE	405	0.3%	86.2%	2.643220
PRODUCE	CHEF SHOPPE	498	0.3%	1.0%	2.606801
CHEF SHOPPE	PRODUCE	498	0.3%	85.0%	2.606801
MEAT-PCKGD	SEAFOOD-PCKGD	2799	1.8%	8.5%	2.597855
SEAFOOD-PCKGD	MEAT-PCKGD	2799	1.8%	55.0%	2.597855
SEAFOOD-PCKGD	MEAT	2531	1.6%	49.7%	2.573386
MEAT	SEAFOOD-PCKGD	2531	1.6%	8.4%	2.573386
SEAFOOD	MEAT	938	0.6%	47.1%	2.437893
MEAT	SEAFOOD	938	0.6%	3.1%	2.437893
NUTRITION	SEAFOOD-PCKGD	764	0.5%	7.8%	2.371791
SEAFOOD-PCKGD	NUTRITION	764	0.5%	15.0%	2.371791
MEAT	MEAT-PCKGD	14603	9.4%	48.6%	2.294986
MEAT-PCKGD	MEAT	14603	9.4%	44.3%	2.294986

11 Executive summary & recommendations

fmt_pct <- function(x, d = 2) ifelse(is.na(x), "—", percent(x, accuracy = 1/10^d))
fmt_pp  <- function(x, d = 2) ifelse(is.na(x), "—", paste0(round(100*x, d), " pp"))

# --- Top 2–3 segments (from trend_tbl) ---
trend_ok <- exists("trend_tbl") && is.data.frame(trend_tbl) && nrow(trend_tbl) > 0
top_txt <- if (trend_ok) {
  top3 <- trend_tbl %>%
    mutate(Segment = paste(income_band, age_band, hh_size_band, has_kids, sep = " / ")) %>%
    arrange(desc(slope_per_period)) %>%
    slice_head(n = 3)
  paste(glue("{top3$Segment} (slope ≈ {round(100*top3$slope_per_period, 2)}%-pts/period)"), collapse = "; ")
} else {
  "See the Fastest-growing segments table above."
}

# --- Promo effect (prefer promo_effect; fallback to promo_seg) ---
promo_line <- {
  if (exists("promo_effect") && nrow(promo_effect) > 0) {
    pe <- promo_effect %>%
      group_by(on_promo, is_health) %>%
      summarise(revenue = sum(revenue, na.rm = TRUE), .groups = "drop") %>%
      group_by(on_promo) %>%
      mutate(share = revenue / sum(revenue)) %>%
      ungroup() %>%
      filter(is_health) %>%
      mutate(on_promo = if_else(on_promo, "Promo", "No promo")) %>%
      tidyr::pivot_wider(names_from = on_promo, values_from = share)
    hs_promo <- pe$Promo %||% NA_real_
    hs_nop   <- pe$`No promo` %||% NA_real_
    if (is.na(hs_promo) || is.na(hs_nop)) {
      "- **What moves them?** Use the weekday/weekend + segment facets in *Promotions* to time offers (no valid on/off promo comparison in aggregate)."
    } else {
      glue("- **What moves them?** Health share **on promo** {fmt_pct(hs_promo)} vs **no promo** {fmt_pct(hs_nop)} (Δ {fmt_pp(hs_promo - hs_nop)}). ",
           "Use the weekday/weekend + segment facets in *Promotions* to time offers.")
    }
  } else if (exists("promo_seg") && nrow(promo_seg) > 0) {
    # fallback: summarize weekend vs weekday lift among on_promo only
    ps <- promo_seg %>%
      filter(on_promo %in% TRUE) %>%
      summarise(weekend = mean(health_share[dplyr::coalesce(is_weekend, FALSE)], na.rm = TRUE),
                weekday = mean(health_share[!dplyr::coalesce(is_weekend, FALSE)], na.rm = TRUE))
    if (is.finite(ps$weekend) && is.finite(ps$weekday)) {
      glue("- **What moves them?** On-promo **weekend** health share {fmt_pct(ps$weekend)} vs **weekday** {fmt_pct(ps$weekday)} (Δ {fmt_pp(ps$weekend - ps$weekday)}). ",
           "Prioritize Fri–Sun windows if weekend lift persists.")
    } else {
      "- **What moves them?** Use the *Promotions* section to identify timing/segment lift patterns."
    }
  } else {
    "- **What moves them?** Use the *Promotions* section to identify timing/segment lift patterns."
  }
}

# --- Coupons pre/post (from prepost_summary) ---
coupon_line <- {
  if (exists("prepost_summary") && nrow(prepost_summary) > 0) {
    post <- prepost_summary %>% filter(str_detect(window, "Post")) %>% pull(avg_health_share) %>% {.[1]}
    pre  <- prepost_summary %>% filter(str_detect(window, "Pre"))  %>% pull(avg_health_share) %>% {.[1]}
    if (is.na(post) || is.na(pre)) {
      "- **Coupons.** See the pre/post table in *Coupons*; if missing, re-run that chunk."
    } else {
      glue("- **Coupons.** Health-coupon redeemers average {fmt_pct(post)} **post** vs {fmt_pct(pre)} **pre** (±4w; Δ {fmt_pp(post - pre)}). ",
           "Broad couponing shows limited mix shift—favor targeting/bundles.")
    }
  } else {
    "- **Coupons.** See the pre/post table in *Coupons*; if missing, re-run that chunk."
  }
}

# --- Scope (context) ---
scope_line <- tryCatch({
  date_min <- suppressWarnings(min(trans$date, na.rm = TRUE)); date_max <- suppressWarnings(max(trans$date, na.rm = TRUE))
  n_txn <- tryCatch(nrow(trans), error = function(e) NA_integer_)
  n_prod <- tryCatch(nrow(prods), error = function(e) NA_integer_)
  n_hh   <- tryCatch(nrow(house), error = function(e) NA_integer_)
  glue("- **Scope.** {format(date_min, '%Y-%m-%d')} to {format(date_max, '%Y-%m-%d')}; ",
       "transactions: {comma(n_txn)}, products: {comma(n_prod)}, households: {comma(n_hh)}.")
}, error = function(e) NULL)

# --- Compose markdown and emit as asis output ---
md <- paste0(
  "**Executive summary & recommendations**\n\n",
  glue("- **Who’s growing the fastest?** {top_txt}."), "\n\n",
  promo_line, "\n\n",
  coupon_line, "\n\n",
  "- **What to do (targeted plan):** 1) Target the top-growth segments with **Fri–Sun** promos on health staples; ",
  "2) Replace broad coupons with **bundled offers** (e.g., healthy breakfast basket) and track **basket health-share**; ",
  "3) Stagger tests by segment and daypart; scale winners.\n\n",
  "- **Limitations.** Text tagging & taxonomy granularity may misclassify edge items; trend ≠ causation; ",
  "pre/post can mask heterogeneity. Run randomized A/B or geo tests (household × week) for incremental lift.\n\n",
  if (!is.null(scope_line)) paste0(scope_line, "\n") else ""
)

knitr::asis_output(md)

Executive summary & recommendations

Who’s growing the fastest? High / <35 / 1-2 / Kids in HH (slope ≈ 0.42%-pts/period); Low / <35 / 1-2 / No kids (slope ≈ 0.1%-pts/period); Low / 55+ / 3-4 / Kids in HH (slope ≈ 0.08%-pts/period).
What moves them? Use the weekday/weekend + segment facets in Promotions to time offers (no valid on/off promo comparison in aggregate).
Coupons. Health-coupon redeemers average 9.04% post vs 9.09% pre (±4w; Δ -0.06 pp). Broad couponing shows limited mix shift—favor targeting/bundles.
What to do (targeted plan): 1) Target the top-growth segments with Fri–Sun promos on health staples; 2) Replace broad coupons with bundled offers (e.g., healthy breakfast basket) and track basket health-share; 3) Stagger tests by segment and daypart; scale winners.
Limitations. Text tagging & taxonomy granularity may misclassify edge items; trend ≠ causation; pre/post can mask heterogeneity. Run randomized A/B or geo tests (household × week) for incremental lift.
Scope. 2017-01-01 to 2018-01-01; transactions: 1,469,307, products: 92,331, households: 801.

if (exists("trend_tbl") && is.data.frame(trend_tbl) && nrow(trend_tbl) > 0) {
  top3_tbl <- trend_tbl %>%
    mutate(Segment = paste(income_band, age_band, hh_size_band, has_kids, sep = " / ")) %>%
    arrange(desc(slope_per_period)) %>%
    slice_head(n = 3) %>%
    transmute(
      `Segment (income / age / HH size / kids)` = Segment,
      `Avg health share` = scales::percent(avg_share, accuracy = 0.01),
      `Trend slope (per period)` = paste0(round(100*slope_per_period, 2), "%-pts"),
      `R²` = round(r2, 2),
      Periods = n_periods
    )

  # Use kable for a minimal, readable table
  knitr::kable(top3_tbl, caption = "Top segments by health-share growth", align = c("l","r","r","r","r"))
} else {
  cat("_Top segments table unavailable (run the growth-metrics chunk to build `trend_tbl`)._\n")
}

Top segments by health-share growth
Segment (income / age / HH size / kids)	Avg health share	Trend slope (per period)	R²	Periods
High / <35 / 1-2 / Kids in HH	4.31%	0.42%-pts	0.10	13
Low / <35 / 1-2 / No kids	8.50%	0.1%-pts	0.06	54
Low / 55+ / 3-4 / Kids in HH	13.04%	0.08%-pts	0.04	53

12 Appendix: helpers

fmt_pct <- function(x, d=1) scales::percent(x, accuracy = 1/10^d)
fmt_dol <- function(x) scales::dollar(x, accuracy = 1)

Final Project

Tynan Sander

2025-10-06