# ---- Packages ----
req <- c("tidyverse","lubridate","scales","forcats","completejourney","tidyr","readr")
to_install <- req[!sapply(req, requireNamespace, quietly = TRUE)]
if (length(to_install)) install.packages(to_install)
invisible(lapply(req, library, character.only = TRUE))

# ---- Windows UTF-8 safety ----
try(Sys.setlocale(category = "LC_CTYPE", locale = "English_United States.utf8"), silent = TRUE)
## [1] "English_United States.utf8"
options(encoding = "native.enc")
knitr::opts_chunk$set(dev = "png")

# ---- Data (samples for fast knit) ----
transactions <- transactions_sample
promotions   <- promotions_sample
products     <- products
demographics <- demographics

# ---- Helpers ----
fmt_dollars <- function(x) scales::dollar(x, accuracy = 1)
fmt_pct     <- function(x) scales::percent(x, accuracy = 0.1)
ascii <- function(s) iconv(s, from = "", to = "ASCII//TRANSLIT")
set.seed(42)

Plot 1 - Top 5 spend categories within each income bracket

inc_cat_spend <- transactions %>%
  inner_join(products, by = "product_id") %>%
  inner_join(demographics, by = "household_id") %>%
  filter(!is.na(income), !is.na(product_category)) %>%
  group_by(income, product_category) %>%
  summarise(total_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  group_by(income) %>%
  slice_max(order_by = total_sales, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(income, desc(total_sales))

ggplot(inc_cat_spend,
       aes(x = total_sales, y = fct_reorder(product_category, total_sales), fill = income)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ income, scales = "free_y") +
  scale_x_continuous(labels = fmt_dollars) +
  labs(
    title    = ascii("Household Spend Differs by Income Bracket"),
    subtitle = ascii("Top 5 product categories by total sales value within each income group (Complete Journey sample data)"),
    x        = ascii("Total spend (USD)"),
    y        = ascii("Product category"),
    caption  = ascii("Data: transactions_sample + products + demographics. Bars show aggregated sales_value per income bracket.")
  ) +
  theme_minimal(base_size = 12)

Plot 2 - Do displays lift sales? (category-level lift when displayed vs not)

# 1) Weekly sales per product x store x week
trx_week <- transactions %>%
  group_by(product_id, store_id, week) %>%
  summarise(sales = sum(sales_value, na.rm = TRUE), .groups = "drop")

# 2) Promo marker per product x store x week (factor-safe conversion)
promo_flag <- promotions %>%
  mutate(
    display_num = suppressWarnings(readr::parse_number(as.character(display_location))),
    display_num = replace_na(display_num, 0)
  ) %>%
  group_by(product_id, store_id, week) %>%
  summarise(any_display = as.integer(any(display_num > 0, na.rm = TRUE)), .groups = "drop")

# 3) Left-join so weeks without a promo row become "No Display"
trx_promo <- trx_week %>%
  left_join(promo_flag, by = c("product_id","store_id","week")) %>%
  mutate(on_display = if_else(replace_na(any_display, 0) > 0, "Display", "No Display"))

# 4) Category totals by display vs no display
promo_sales <- trx_promo %>%
  inner_join(products %>% select(product_id, product_category), by = "product_id") %>%
  group_by(product_category, on_display) %>%
  summarise(category_sales = sum(sales, na.rm = TRUE), .groups = "drop") %>%
  complete(product_category,
           on_display = c("No Display","Display"),
           fill = list(category_sales = 0))

# 5) Relative lift and top 15
lift_by_cat <- promo_sales %>%
  pivot_wider(names_from = on_display, values_from = category_sales, values_fill = 0) %>%
  mutate(
    base = `No Display`,
    disp = `Display`,
    lift_pct = if_else(base > 0, (disp - base) / base, NA_real_)
  ) %>%
  filter(!is.na(lift_pct)) %>%
  arrange(desc(lift_pct)) %>%
  slice_head(n = 15)

ggplot(lift_by_cat, aes(x = lift_pct, y = forcats::fct_reorder(product_category, lift_pct))) +
  geom_segment(aes(x = 0, xend = lift_pct, y = product_category, yend = product_category)) +
  geom_point(size = 3) +
  scale_x_continuous(labels = fmt_pct) +
  labs(
    title    = ascii("Displays Are Associated With Higher Category Sales"),
    subtitle = ascii("Top 15 categories by relative sales lift when displayed vs not"),
    x        = ascii("Relative lift"),
    y        = ascii("Product category"),
    caption  = ascii("Data: transactions + promotions + products (Complete Journey)")
  ) +
  theme_minimal(base_size = 12)

Plot 3 - When do bigger baskets happen? (avg basket dollars by household comp x day of week)

basket_level <- transactions %>%
  group_by(household_id, basket_id) %>%
  summarise(
    transaction_timestamp = min(transaction_timestamp, na.rm = TRUE),
    basket_sales = sum(sales_value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(wday = lubridate::wday(transaction_timestamp, label = TRUE, abbr = TRUE, week_start = 1)) %>%
  inner_join(demographics %>% select(household_id, household_comp), by = "household_id") %>%
  filter(!is.na(household_comp))

heat_basket <- basket_level %>%
  group_by(household_comp, wday) %>%
  summarise(avg_basket = mean(basket_sales, na.rm = TRUE), n = n(), .groups = "drop")

ggplot(heat_basket, aes(x = wday, y = fct_reorder(household_comp, -avg_basket), fill = avg_basket)) +
  geom_tile(color = "white") +
  geom_text(aes(label = fmt_dollars(avg_basket)), size = 3) +
  scale_fill_gradient(low = "#f0f9e8", high = "#08589e", guide = "none") +
  labs(
    title    = ascii("Bigger Baskets Cluster On Weekends And Differ By Household Composition"),
    subtitle = ascii("Average basket spend by household composition x day of week (labels show average dollars per basket)"),
    x        = ascii("Day of week"),
    y        = ascii("Household composition"),
    caption  = ascii("Data: transactions_sample + demographics. Basket = sum(sales_value) per basket_id.")
  ) +
  theme_minimal(base_size = 12)