# Load all required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(forcats)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(lubridate)
library(ggalluvial)
library(tidytext)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(grid)
library(stringr)
library(ggrepel)
library(ggnewscale)
library(patchwork)
# Load datasets
transactions <- get_transactions()
products <- products
demographics <- demographics
coupons <- coupons
campaigns <- campaigns
coupon_redemptions <- coupon_redemptions
# ---- Normalize products columns ----
cat_candidates <- intersect(c("category", "product_category", "commodity", "subcategory"), names(products))
if (length(cat_candidates) == 0) {
stop("No category-like column found in `products`. Available columns: ", paste(names(products), collapse = ", "))
}
cat_col <- cat_candidates[1]
products_dim <- products %>%
setNames(replace(names(.), names(.) == cat_col, "category")) %>%
dplyr::select(product_id,
dplyr::any_of(c("department")), # department exists in all CRAN versions
dplyr::any_of(c("brand")),
category)
knitr::opts_chunk$set(fig.width = 16, fig.height = 10, dpi = 200)
# SEASONALITY: monthly spend per department
# 1) Build monthly spend and drop January
monthly_dept <- transactions %>%
mutate(
date = as.Date(transaction_timestamp),
month = lubridate::floor_date(date, "month"),
sales = coalesce(sales_value, 0),
mon_num = lubridate::month(month)
) %>%
filter(mon_num != 1) %>% # <-- remove January
inner_join(products_dim %>% select(product_id, department), by = "product_id") %>%
filter(!is.na(department), !is.na(month)) %>%
group_by(department, month) %>%
summarise(sales = sum(sales), .groups = "drop") %>%
mutate(
mon = lubridate::month(month, label = TRUE, abbr = TRUE),
# Ensure months are ordered Feb..Dec after removing Jan
mon = factor(as.character(mon), levels = month.abb[2:12])
)
# 2) Choose departments to display
top_depts <- monthly_dept %>%
group_by(department) %>% summarise(total_sales = sum(sales), .groups = "drop") %>%
slice_max(total_sales, n = 4, with_ties = FALSE) %>%
pull(department)
heat <- monthly_dept %>%
filter(department %in% top_depts) %>%
mutate(department = forcats::fct_reorder(department, -sales, .fun = sum))
# 3) Fixed brand colors (edit as desired)
dept_color_map <- c(
"GROCERY" = "#1f77b4", # blue
"GROCERIES" = "#1f77b4",
"DRUG GM" = "#2ca02c", # green
"DRUG" = "#2ca02c",
"PRODUCE" = "#ff7f0e", # orange
"DELI" = "#9467bd" # purple
)
fallback_pairs <- list(
c("#f1eef6", "#756bb1"),
c("#fee0d2", "#e6550d"),
c("#e5f5e0", "#31a354"),
c("#deebf7", "#3182bd")
)
get_low_high <- function(dname, idx) {
key <- toupper(dname)
hi <- dept_color_map[key]
if (is.na(hi)) {
pair <- fallback_pairs[[ ((idx - 1) %% length(fallback_pairs)) + 1 ]]
list(low = pair[1], high = pair[2])
} else {
list(low = "#ffffff", high = unname(hi)) # white -> brand color
}
}
# 4) One panel per department (legend next to its own row), sqrt for stronger contrast
make_dept_panel <- function(dname, idx) {
d <- dplyr::filter(heat, department == dname)
cols <- get_low_high(dname, idx)
ggplot(d, aes(x = mon, y = department, fill = sales)) +
geom_tile(width = 0.95, height = 0.9) +
scale_fill_gradient(
low = cols$low, high = cols$high,
labels = scales::dollar, name = paste0(dname, " monthly spend"),
trans = "sqrt", oob = scales::squish
) +
labs(x = "Month", y = NULL) +
theme_minimal(base_size = 16) +
theme(
panel.grid = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_text(face = "bold"),
legend.position = "right",
legend.title = element_text(size = 12),
legend.text = element_text(size = 11),
plot.margin = margin(t = 2, r = 10, b = 2, l = 2)
)
}
dept_levels <- levels(forcats::fct_drop(heat$department))
panels <- Map(make_dept_panel, dept_levels, seq_along(dept_levels))
library(patchwork)
wrap_plots(panels, ncol = 1, guides = "keep") +
plot_annotation(
title = "Seasonality by Department: Actual Monthly Spend (January removed)",
subtitle = "Each row is a department with its own color ramp and legend. January is excluded to prevent zero-spend from compressing the scale.",
caption = "Data: completejourney | Joins: transactions + products | Metric: sum of sales_value per department per month (Feb–Dec)",
theme = theme(
plot.title = element_text(face = "bold", size = 24),
plot.subtitle = element_text(size = 14, margin = margin(b = 10)),
plot.caption = element_text(size = 11)
)
)

# WEEKDAY VS WEEKEND: Basket value by household size (with legend and labels)
# Build basket-level spend and weekend flag
baskets <- transactions %>%
mutate(
sales = coalesce(sales_value, 0),
ts = suppressWarnings(lubridate::ymd_hms(transaction_timestamp))
) %>%
group_by(basket_id, household_id) %>%
summarise(
basket_sales = sum(sales),
first_ts = suppressWarnings(min(ts)),
.groups = "drop"
) %>%
filter(!is.na(first_ts)) %>%
mutate(
wday_abbr = as.character(lubridate::wday(first_ts, label = TRUE, abbr = TRUE)),
day_type = if_else(wday_abbr %in% c("Sat","Sun"), "Weekend", "Weekday")
) %>%
inner_join(demographics %>% select(household_id, household_size), by = "household_id") %>%
filter(!is.na(household_size))
# Average basket by household_size x day_type
avg_by_day <- baskets %>%
group_by(household_size, day_type) %>%
summarise(avg_basket = mean(basket_sales, na.rm = TRUE), .groups = "drop") %>%
mutate(household_size = factor(household_size, ordered = TRUE))
# Wide and gap (for ordering), then long for explicit endpoint colors/legend
avg_wide <- avg_by_day %>%
tidyr::pivot_wider(names_from = day_type, values_from = avg_basket) %>%
mutate(gap = Weekend - Weekday)
avg_long <- avg_wide %>%
select(household_size, Weekday, Weekend) %>%
tidyr::pivot_longer(cols = c("Weekday","Weekend"),
names_to = "End", values_to = "Value")
# Dumbbell: segment per row, colored endpoints with legend
max_x <- max(avg_long$Value, na.rm = TRUE)
ggplot() +
geom_segment(
data = avg_wide,
aes(x = Weekday, xend = Weekend, y = household_size, yend = household_size),
linewidth = 3, color = "grey85", na.rm = TRUE
) +
# endpoints
geom_point(
data = avg_long,
aes(x = Value, y = household_size, color = End),
size = 4, na.rm = TRUE
) +
# gap label on the right
geom_text(
data = avg_wide,
aes(x = pmax(Weekday, Weekend, na.rm = TRUE),
y = household_size,
label = ifelse(is.finite(gap), scales::dollar(gap), "")),
nudge_x = 0.02 * ifelse(is.finite(max_x), max_x, 0),
size = 4, color = "grey20", na.rm = TRUE
) +
scale_color_manual(values = c(Weekday = "#1f77b4", Weekend = "#ff7f0e"),
name = "Endpoint") +
scale_x_continuous(labels = scales::dollar) +
labs(
title = "Weekend Baskets Are Larger For Most Household Sizes",
subtitle = "Dumbbell shows average basket value (USD) by household size on Weekdays (blue, left) and Weekends (orange, right). Right-side labels show Weekend minus Weekday.",
x = "Average Basket Value (USD)", y = "Household Size",
caption = "Data: completejourney | Joins: transactions + demographics | Basket = sum of line item sales_value per basket_id"
) +
theme_minimal(base_size = 16) +
theme(
panel.grid.major.y = element_blank(),
plot.title = element_text(face = "bold", size = 24),
plot.subtitle = element_text(size = 14, margin = margin(b = 10)),
plot.caption = element_text(size = 11),
legend.position = "top",
legend.title = element_text(size = 12),
legend.text = element_text(size = 11)
)

# ---- Top-5 Categories by Income (excluding coupons/misc) ----
excluded_departments <- c(
"COUPON/MISC ITEMS", "COUPONS", "MISC", "MISCELLANEOUS", "UNKNOWN"
)
excluded_category_regex <- stringr::regex(
"\\bcoupon\\b|\\bmisc\\b|miscellaneous|unknown", ignore_case = TRUE
)
df_income_topcat <- transactions %>%
mutate(sales = coalesce(sales_value, 0)) %>%
inner_join(products_dim %>% select(product_id, category, department), by = "product_id") %>%
inner_join(demographics %>% select(household_id, income), by = "household_id") %>%
filter(
!is.na(income), !is.na(category),
!(toupper(department) %in% toupper(excluded_departments)),
!stringr::str_detect(category, excluded_category_regex)
) %>%
group_by(income, category) %>%
summarise(total_sales = sum(sales), .groups = "drop") %>%
filter(total_sales > 0) %>%
group_by(income) %>%
slice_max(total_sales, n = 5, with_ties = FALSE) %>%
ungroup() %>%
mutate(
category_wrapped = stringr::str_wrap(category, 18),
cat_within = tidytext::reorder_within(category_wrapped, total_sales, income),
sales_lab = scales::dollar(total_sales)
) %>%
group_by(income) %>%
mutate(
grp_max = max(total_sales, na.rm = TRUE),
place_inside = total_sales >= 0.90 * grp_max, # label inside long bars
hjust_lab = ifelse(place_inside, 1.02, -0.05),
color_lab = ifelse(place_inside, "white", "grey20")
) %>%
ungroup()
ggplot(df_income_topcat, aes(cat_within, total_sales)) +
geom_col(fill = "#2c7fb8", width = 0.7) +
ggrepel::geom_text_repel(
aes(label = sales_lab, hjust = hjust_lab, color = color_lab),
size = 3.3,
direction = "y",
min.segment.length = 0, segment.color = NA,
force = 1, box.padding = 0.15, point.padding = 0.15, max.overlaps = Inf
) +
facet_wrap(~ income, scales = "free_y", ncol = 3) +
coord_flip(clip = "off") +
tidytext::scale_x_reordered() +
scale_y_continuous(labels = scales::dollar, expand = expansion(mult = c(0.02, 0.30))) +
scale_color_identity() +
labs(
title = "What Each Income Group Actually Buys (Coupons/Misc Removed)",
subtitle = paste0(
"Each panel shows the top five product categories by total spend within that income band. ",
"Coupon and miscellaneous items are excluded to focus on real product demand."
),
x = NULL, y = "Total Spend (USD)",
caption = "Source: completejourney | Built from transactions + products + demographics | Display: top 5 per income"
) +
theme_minimal(base_size = 13) +
theme(
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(face = "bold", size = 18),
plot.subtitle = element_text(size = 12, margin = margin(b = 6)),
strip.text = element_text(face = "bold"),
plot.caption = element_text(color = "grey40"),
panel.spacing = unit(10, "pt")
)
