# ---- Packages ----
req <- c("tidyverse","lubridate","scales","forcats","completejourney","tidyr","readr")
to_install <- req[!sapply(req, requireNamespace, quietly = TRUE)]
if (length(to_install)) install.packages(to_install)
invisible(lapply(req, library, character.only = TRUE))
# ---- Windows UTF-8 safety ----
try(Sys.setlocale(category = "LC_CTYPE", locale = "English_United States.utf8"), silent = TRUE)
## [1] "English_United States.utf8"
options(encoding = "native.enc")
knitr::opts_chunk$set(dev = "png")
# ---- Data (samples for fast knit) ----
transactions <- transactions_sample
promotions <- promotions_sample
products <- products
demographics <- demographics
# ---- Helpers ----
fmt_dollars <- function(x) scales::dollar(x, accuracy = 1)
fmt_pct <- function(x) scales::percent(x, accuracy = 0.1)
ascii <- function(s) iconv(s, from = "", to = "ASCII//TRANSLIT")
set.seed(42)
Plot 1 - Top 5 spend categories within each income bracket
inc_cat_spend <- transactions %>%
inner_join(products, by = "product_id") %>%
inner_join(demographics, by = "household_id") %>%
filter(!is.na(income), !is.na(product_category)) %>%
group_by(income, product_category) %>%
summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
group_by(income) %>%
slice_max(order_by = total_sales, n = 5, with_ties = FALSE) %>%
ungroup() %>%
arrange(income, desc(total_sales))
ggplot(inc_cat_spend,
aes(x = total_sales, y = fct_reorder(product_category, total_sales), fill = income)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ income, scales = "free_y") +
scale_x_continuous(labels = fmt_dollars) +
labs(
title = ascii("Household Spend Differs by Income Bracket"),
subtitle = ascii("Top 5 product categories by total sales value within each income group (Complete Journey sample data)"),
x = ascii("Total spend (USD)"),
y = ascii("Product category"),
caption = ascii("Data: transactions_sample + products + demographics. Bars show aggregated sales_value per income bracket.")
) +
theme_minimal(base_size = 12)

Plot 2 - Do displays lift sales? (category-level lift when displayed
vs not)
# 1) Weekly sales per product x store x week
trx_week <- transactions %>%
group_by(product_id, store_id, week) %>%
summarise(sales = sum(sales_value, na.rm = TRUE), .groups = "drop")
# 2) Promo marker per product x store x week (factor-safe conversion)
promo_flag <- promotions %>%
mutate(
display_num = suppressWarnings(readr::parse_number(as.character(display_location))),
display_num = replace_na(display_num, 0)
) %>%
group_by(product_id, store_id, week) %>%
summarise(any_display = as.integer(any(display_num > 0, na.rm = TRUE)), .groups = "drop")
# 3) Left-join so weeks without a promo row become "No Display"
trx_promo <- trx_week %>%
left_join(promo_flag, by = c("product_id","store_id","week")) %>%
mutate(on_display = if_else(replace_na(any_display, 0) > 0, "Display", "No Display"))
# 4) Category totals by display vs no display
promo_sales <- trx_promo %>%
inner_join(products %>% select(product_id, product_category), by = "product_id") %>%
group_by(product_category, on_display) %>%
summarise(category_sales = sum(sales, na.rm = TRUE), .groups = "drop") %>%
complete(product_category,
on_display = c("No Display","Display"),
fill = list(category_sales = 0))
# 5) Relative lift and top 15
lift_by_cat <- promo_sales %>%
pivot_wider(names_from = on_display, values_from = category_sales, values_fill = 0) %>%
mutate(
base = `No Display`,
disp = `Display`,
lift_pct = if_else(base > 0, (disp - base) / base, NA_real_)
) %>%
filter(!is.na(lift_pct)) %>%
arrange(desc(lift_pct)) %>%
slice_head(n = 15)
ggplot(lift_by_cat, aes(x = lift_pct, y = forcats::fct_reorder(product_category, lift_pct))) +
geom_segment(aes(x = 0, xend = lift_pct, y = product_category, yend = product_category)) +
geom_point(size = 3) +
scale_x_continuous(labels = fmt_pct) +
labs(
title = ascii("Displays Are Associated With Higher Category Sales"),
subtitle = ascii("Top 15 categories by relative sales lift when displayed vs not"),
x = ascii("Relative lift"),
y = ascii("Product category"),
caption = ascii("Data: transactions + promotions + products (Complete Journey)")
) +
theme_minimal(base_size = 12)

Plot 3 - When do bigger baskets happen? (avg basket dollars by
household comp x day of week)
basket_level <- transactions %>%
group_by(household_id, basket_id) %>%
summarise(
transaction_timestamp = min(transaction_timestamp, na.rm = TRUE),
basket_sales = sum(sales_value, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(wday = lubridate::wday(transaction_timestamp, label = TRUE, abbr = TRUE, week_start = 1)) %>%
inner_join(demographics %>% select(household_id, household_comp), by = "household_id") %>%
filter(!is.na(household_comp))
heat_basket <- basket_level %>%
group_by(household_comp, wday) %>%
summarise(avg_basket = mean(basket_sales, na.rm = TRUE), n = n(), .groups = "drop")
ggplot(heat_basket, aes(x = wday, y = fct_reorder(household_comp, -avg_basket), fill = avg_basket)) +
geom_tile(color = "white") +
geom_text(aes(label = fmt_dollars(avg_basket)), size = 3) +
scale_fill_gradient(low = "#f0f9e8", high = "#08589e", guide = "none") +
labs(
title = ascii("Bigger Baskets Cluster On Weekends And Differ By Household Composition"),
subtitle = ascii("Average basket spend by household composition x day of week (labels show average dollars per basket)"),
x = ascii("Day of week"),
y = ascii("Household composition"),
caption = ascii("Data: transactions_sample + demographics. Basket = sum(sales_value) per basket_id.")
) +
theme_minimal(base_size = 12)
